// OpenVMS Source Code Demos
// UTF8_DECODE
//================================================================================================
// title : utf8_decode_xxx.c
// author : Neil Rieck
// created : 2016-02-23
// notes :
// 1) this program is my hack to convert inbound web data from UTF-8 to Windows-1252 (a superset
// of ISO-8859-1)
// 2) I did it this way because we are seeing a lot of malformed data hitting our site and I
// wanted a solution that would decode legal UTF-8 sequences but just copy everything else.
// We see this relaxed processing all the time in terminal emulators (this could be disabled
// if required)
// 3) Version 100 of this program discarded Unicode code points above 255 so in version 101 I
// added logic which would map code points 256-384 back to ASCII (this could be disabled if
// required)
// 4) If your emulator will not display certain Windows-1252 characters like the Euro symbol,
// try setting the emulator code page to "65001" or "windows-1252" or "cp1252" or "ANSI"
//
// ver who when what
// --- --- ------ --------------------------------------------------------------------------------
// 100 NSR 160223 1. original effort
// 101 NSR 160223 1. added logic to substitute (rather than discard) some codes above 255
// 102 NSR 160308 1. began adding more mappings to table utoa[]
// 160309 1. added more characters to table utoa[]
// 2. added support for unicode-to-ansi ver_102.2
//================================================================================================
// UTF-8 encoding
// 1. RFC 2279: http://www.faqs.org/rfcs/rfc2279.html
// 2. RFC 3629: https://tools.ietf.org/html/rfc3629 (limits UTF-8 to 4 octets; only code points up
// to U+10FFFF are valid so the 21-bit space is not fully used: the 'z' bit on line 4 may only be
// set for U+100000-U+10FFFF)
//
// UCS-4 range (hex) UTF-8 octet sequence (binary) Data Bits
// ------------------- ----------------------------- ---------
// 0000,0000-0000,007F 0xxxxxxx 7 bits
// 0000,0080-0000,07FF 110xxxxx 10xxxxxx 11 bits
// 0000,0800-0000,FFFF 1110xxxx 10xxxxxx 10xxxxxx 16 bits
// 0001,0000-001F,FFFF 11110zXX 10xxxxxx 10xxxxxx 10xxxxxx 21 bits (RFC limit)
// 0020,0000-03FF,FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 26 bits (invalid)
// 0400,0000-7FFF,FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 31 bits (invalid)
//=================================================================================================
#define MAX_OCTETS 4 // longest lead-byte pattern general_decode() recognizes (supported: 4,5,6)
#define DVLP 1 // develop: 1=stand-alone test program (main + trace printf), 0=callable function only
#define RELAXED 1 // 1=multi-byte sequences that decode to 0-127 (over-long ASCII) are copied as ASCII
#if (DVLP==0)
void general_decode(char*,char*); // forward declaration (defined further down in this file)
// utf_decode: public entry point for the function build; simply forwards to general_decode()
// buffer1 = destination (decoded output), buffer0 = source (NUL-terminated input)
void utf_decode(char *buffer1, char *buffer0) { // function definition
general_decode(buffer1, buffer0);
}
#else
#define SMALL_STRING 255 // usable characters in each demo buffer (excludes the terminator)
char buffer0[SMALL_STRING+1]; // demo input buffer (filled by main)
char buffer1[SMALL_STRING+1]; // demo output buffer (written by general_decode)
#endif
//
#include <stdio.h> //
#include <stdlib.h> //
#include <string.h> //
//
// unicode->ansi (last resort mapping)
//
// 1. Rather than discard unicode code points above 255, we will remap some of them to simple ASCII
// 2. I only use the first letter of any ligature
// 3. This table starts at code point 0x0100 (unicode 256)
//
static const unsigned char utoa[] = {
    //
    // All 8 rows of Latin Extended A (all mapped back to ASCII)
    // ref: http://www.unicode.org/charts/PDF/U0100.pdf
    //
    'A' ,'a' ,'A' ,'a' ,'A' ,'a' ,'C' ,'c' ,'C' ,'c' ,'C' ,'c' ,'C' ,'c' ,'D' ,'d' , // row 1
    'D' ,'d' ,'E' ,'e' ,'E' ,'e' ,'E' ,'e' ,'E' ,'e' ,'E' ,'e' ,'G' ,'g' ,'G' ,'g' , // row 2
    'G' ,'g' ,'G' ,'g' ,'H' ,'h' ,'H' ,'h' ,'I' ,'i' ,'I' ,'i' ,'I' ,'i' ,'I' ,'i' , // row 3
    'I' ,'i' ,'I' ,'i' ,'J' ,'j' ,'K' ,'k' ,'k' ,'L' ,'l' ,'L' ,'l' ,'L' ,'l' ,'L' , // row 4
    'l' ,'L' ,'l' ,'N' ,'n' ,'N' ,'n' ,'N' ,'n' ,'n' ,'N' ,'n' ,'O' ,'o' ,'O' ,'o' , // row 5
    'O' ,'o' ,'O' ,'o' ,'R' ,'r' ,'R' ,'r' ,'R' ,'r' ,'S' ,'s' ,'S' ,'s' ,'S' ,'s' , // row 6
    'S' ,'s' ,'T' ,'t' ,'T' ,'t' ,'T' ,'t' ,'U' ,'u' ,'U' ,'u' ,'U' ,'u' ,'U' ,'u' , // row 7
    'U' ,'u' ,'U' ,'u' ,'W' ,'w' ,'Y' ,'y' ,'Y' ,'Z' ,'z' ,'Z' ,'z' ,'Z' ,'z' ,'f' , // row 8
    //
    // All 13 rows of Latin Extended B (most mapped back to ASCII; one mapped to ANSI)
    // ref: http://www.unicode.org/charts/PDF/U0180.pdf
    //
    'b' ,'B' ,'b' ,'b' ,'b' ,'b' ,'C' ,'C' ,'c' ,'D' ,'D' ,'d' ,'d' ,'q' ,'E' ,'e' , // row 1
    'e' ,'F' ,'f' ,'G' ,'V' ,'h' ,'l' ,'I' ,'K' ,'k' ,'l' ,'y' ,'W' ,'N' ,'n' ,'O' , // row 2
    'O' ,'o' ,'D' ,'d' ,'P' ,'p' ,'R' ,'S' ,'s' ,'Z' ,'f' ,'t' ,'T' ,'f' ,'T' ,'U' , // row 3
    'u' ,'U' ,'V' ,'Y' ,'y' ,'Z' ,'z' ,'3' ,'3' ,'3' ,'3' ,'2' ,'5' ,'5' ,'t' ,'p' , // row 4
    '|' ,'|' ,'|' ,'!' ,'D' ,'D' ,'d' ,'L' ,'L' ,'l' ,'N' ,'N' ,'n' ,'A' ,'a' ,'I' , // row 5
    'i' ,'O' ,'o' ,'U' ,'u' ,'U' ,'u' ,'U' ,'u' ,'U' ,'u' ,'U' ,'u' ,'e' ,'A' ,'a' , // row 6
    'A' ,'a' ,'A' ,'a' ,'G' ,'g' ,'G' ,'g' ,'K' ,'k' ,'Q' ,'q' ,'Q' ,'q' ,'3' ,'3' , // row 7
    'J' ,'D' ,'D' ,'d' ,'G' ,'g' ,'H' ,'P' ,'N' ,'n' ,'A' ,'a' ,'A' ,'a' ,'0' ,'0' , // row 8
    'A' ,'a' ,'A' ,'a' ,'E' ,'e' ,'E' ,'e' ,'I' ,'i' ,'I' ,'i' ,'O' ,'o' ,'O' ,'o' , // row 9
    'R' ,'r' ,'R' ,'r' ,'U' ,'u' ,'U' ,'u' ,'S' ,'s' ,'T' ,'t' ,'3' ,'3' ,'H' ,'h' , // row 10
    'N' ,'d' ,'8' ,'8' ,'Z' ,'z' ,'A' ,'a' ,'E' ,'e' ,'O' ,'o' ,'O' ,'o' ,'O' ,'o' , // row 11
    'O' ,'o' ,'Y' ,'y' ,'l' ,'n' ,'t' ,'J' ,'d' ,'p' ,'A' ,'C' ,0xa2,'L' ,'T' ,'s' , // row 12
    'z' ,'?' ,'c' ,'B' ,'U' ,'A' ,'E' ,'e' ,'J' ,'j' ,'Q' ,'q' ,'R' ,'r' ,'Y' ,'y' , // row 13
    //
    // All 6 rows of IPA extensions (most mapped back to ASCII; a few mapped back to ANSI)
    // ref: http://www.unicode.org/charts/PDF/U0250.pdf
    //
    'r' ,'a' ,'a' ,'g' ,'c' ,'c' ,'p' ,'d' ,'e' ,'e' ,'e' ,'3' ,'3' ,'3' ,'B' ,'J' , // row 1
    'g' ,'g' ,'G' ,'V' ,'v' ,'h' ,'h' ,'h' ,'i' ,'l' ,'I' ,'l' ,'l' ,'l' ,'l' ,'m' , // row 2
    'm' ,'m' ,'n' ,'n' ,'N' ,'0' ,'D' ,'w' ,'o' ,'r' ,'r' ,'r' ,'r' ,'r' ,'j' ,'J' , // row 3
    'R' ,'R' ,'S' ,'f' ,'f' ,'f' ,'f' ,'t' ,'t' ,'u' ,'o' ,'v' ,'v' ,'w' ,'y' ,'Y' , // row 4
    'Z' ,'z' ,'3' ,'3' ,'?' ,'?' ,0xbf,'C' ,'O' ,'B' ,'b' ,'G' ,'H' ,'j' ,'k' ,'L' , // row 5
    'q' ,'?' ,'?' ,'d' ,'d' ,'d' ,'t' ,'t' ,'t' ,'f' ,'l' ,'l' ,'w' ,']' ,'h' ,'h'   // row 6
};
//
// Unicode code points assigned to Windows-1252 bytes 0x80..0x9F (entry n describes byte 0x80+n)
// 0 marks a byte for which no mapping is provided (0x81, 0x8D, 0x8F, 0x90, 0x9D)
// ref: https://en.wikipedia.org/wiki/Windows-1252#Code_page_layout
//
static const unsigned short cp1252_c1[32] = {
    0x20ac, 0x0000, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,    // 0x80-0x87
    0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017d, 0x0000,    // 0x88-0x8F
    0x0000, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,    // 0x90-0x97
    0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x0000, 0x017e, 0x0178     // 0x98-0x9F
};
//
// classify a lead byte: returns the length of the sequence it announces (1 = copy as-is) and
// stores the payload bits carried by the lead byte in *bits
//
static long utf8_lead(unsigned char ch, unsigned long *bits) {
    if ((ch & 0xE0) == 0xC0) { *bits = ch & 0x1f; return 2; }          // 110X XXXX
    if ((ch & 0xF0) == 0xE0) { *bits = ch & 0x0f; return 3; }          // 1110 XXXX
    if ((ch & 0xF8) == 0xF0) { *bits = ch & 0x07; return 4; }          // 1111 0XXX
#if (MAX_OCTETS>=5)                                                     // strict UTF-8 will never use this
    if ((ch & 0xFC) == 0xF8) { *bits = ch & 0x03; return 5; }          // 1111 10XX
#endif
#if (MAX_OCTETS>=6)                                                     // strict UTF-8 will never use this
    if ((ch & 0xFE) == 0xFC) { *bits = ch & 0x01; return 6; }          // 1111 110X
#endif
    *bits = 0;                                                          // 7-bit ASCII, stray continuation
    return 1;                                                           // byte or unsupported lead byte
}
//
// map one decoded code point to the single Windows-1252 byte that will represent it
//
static unsigned char to_cp1252(unsigned long uni) {
#if (RELAXED==1)
    // over-long encoding of ASCII (illegal, security risk) but copy as ASCII anyway;
    // NUL is excluded because an embedded terminator would silently truncate the output
    if ((uni >= 1) && (uni <= 127))
        return (unsigned char) uni;
#endif
    if (uni < 160)                                                      // control code (C0/C1 range)
        return 'X';                                                     // replace with an 'X'
    if (uni <= 255)                                                     // identical in iso-8859-1
        return (unsigned char) uni;
    for (unsigned i = 0; i < sizeof cp1252_c1 / sizeof cp1252_c1[0]; i++) {
        if (cp1252_c1[i] == uni)                                        // windows-1252 specials
            return (unsigned char) (0x80 + i);                          // (euro, curly quotes, ...)
    }
    if (uni == 0x20a4)                                                  // lira sign (unicode)
        return 0xa3;                                                    // pound sign (0xa4 is the generic currency sign)
    if ((uni - 256) < sizeof utoa)                                      // strictly inside the table
        return utoa[uni - 256];                                         // last resort: look-alike
    return 'Z';                                                         // nothing suitable
}
//
// general decode buffer0 while copying to buffer1
//
// buffer0 : NUL-terminated input; legal UTF-8 sequences are decoded, every other byte is copied
//           unchanged (relaxed handling of malformed data)
// buffer1 : receives the NUL-terminated Windows-1252 result; each input sequence produces exactly
//           one output byte so the caller must supply at least strlen(buffer0)+1 bytes
//
void general_decode(char *buffer1, char *buffer0) {
    long s0 = 0;                                                        // input subscript
    long s1 = 0;                                                        // output subscript
    long remain = (long) strlen(buffer0);                               // input bytes not yet consumed
#if (DVLP==1)
    printf("-i-input : %s\n", buffer0);
    printf("-i-length : %ld\n", remain);
#endif
    buffer1[s1] = '\0';                                                 // init output buf
    while (buffer0[s0] != '\0') {
        unsigned long uni = 0;                                          // large enough for UCS4
        long bytes = utf8_lead((unsigned char) buffer0[s0], &uni);
        //
        // a sequence is only decoded when it fits in the remaining input and every
        // following byte is a continuation byte (10xx xxxx); otherwise the lead
        // byte is copied as-is and scanning resumes at the very next byte
        //
        int decoded = (bytes > 1) && (bytes <= remain);
        for (long i = 1; decoded && (i < bytes); i++) {
            unsigned char ch = (unsigned char) buffer0[s0 + i];
            if ((ch & 0xC0) == 0x80)
                uni = (uni << 6) | (ch & 0x3Fu);                        // append 6 payload bits
            else
                decoded = 0;                                            // not the correct bit pattern
        }
        if (decoded) {
            // caveat: assumes the code point is genuine UNICODE and not windows-1252 which
            // was run through a UTF-8 encoder without first being remapped
#if (DVLP==1)
            printf("-i-unicode : %lu\n", uni);
#endif
            buffer1[s1++] = (char) to_cp1252(uni);
            s0 = s0 + bytes;                                            // skip the whole sequence
#if (DVLP==1)
            printf("-i-ansi : %u\n", (unsigned) (unsigned char) buffer1[s1-1]);
#endif
        } else {
            bytes = 1;                                                  // force one-byte copy as-is
            buffer1[s1++] = buffer0[s0++];
        }
        buffer1[s1] = '\0';                                             // keep output terminated
        remain = remain - bytes;                                        // update remaining count
    }
#if (DVLP==1)
    if (remain != 0)
        printf("-e-final value for remain: %ld\n", remain);
    printf("-i-output : %s\n", buffer1);
    printf("-i-length : %ld\n", (long) strlen(buffer1));
    printf("----------------------------------------\n");
#endif
}
#if (DVLP==1)
//==============================================================================
// main()
//==============================================================================
// Demo driver: builds each test string in buffer0, decodes it into buffer1 and lets
// general_decode() print its trace. Returns EXIT_SUCCESS once every case has been run.
int main(void){
    printf("-i-test case: ASCII only\n");
    snprintf(buffer0, sizeof buffer0, "this is a test");
    general_decode(buffer1,buffer0);
    //
    printf("-i-test case: ISO-8859-1 (e acute)\n");
    snprintf(buffer0, sizeof buffer0, "%s%c%s" ,"this is a t",0xE9, "st");
    general_decode(buffer1,buffer0);
    //
    printf("-i-test case: UTF-8 (e acute)\n");
    snprintf(buffer0, sizeof buffer0, "%s%c%c%s" ,"this is a t",0xC3,0xA9, "st");
    general_decode(buffer1,buffer0);
    //
    printf("-i-test case: mixed (two e acute; one ISO and one UTF-8; probably illegal)\n");
    snprintf(buffer0, sizeof buffer0, "%s%c%c%c%s" ,"this is a t",0xE9,0xC3,0xA9,"st");
    general_decode(buffer1,buffer0);
    //
    printf("-i-test case: UTF-8 on last char of string (boundary check)\n");
    snprintf(buffer0, sizeof buffer0, "%s%c%c" ,"inverted question: ",0xC2,0xBF);
    general_decode(buffer1,buffer0);
    //
    // table: latin-a
    //
    printf("-i-test case: UTF-8> 0xC4,0x80 = 0x100\n");
    snprintf(buffer0, sizeof buffer0, "%s%c%c" ,"should map to 'A': ",0xC4,0x80);
    general_decode(buffer1,buffer0);
    //
    printf("-i-test case: UTF-8> 0xC5,0xbf = 0x17f\n");
    snprintf(buffer0, sizeof buffer0, "%s%c%c" ,"should map to 'f': ",0xC5,0xbf);
    general_decode(buffer1,buffer0);
    //
    // table: latin-b
    //
    printf("-i-test case: UTF-8> 0xC6,0x80 = 0x180\n");
    snprintf(buffer0, sizeof buffer0, "%s%c%c" ,"should map to 'b': ",0xC6,0x80);
    general_decode(buffer1,buffer0);
    //
    printf("-i-test case: UTF-8> 0xC9,0x8f = 0x24f\n");
    snprintf(buffer0, sizeof buffer0, "%s%c%c" ,"should map to 'y': ",0xC9,0x8f);
    general_decode(buffer1,buffer0);
    //
    // special case(s)
    //
    printf("-i-test case: UTF-8> 0xE2, 0x82, 0xAC = Euro Symbol\n");
    snprintf(buffer0, sizeof buffer0, "%s%c%c%c" ,"should map to euro symbol: ", 0xE2, 0x82, 0xAC);
    general_decode(buffer1,buffer0);
    return EXIT_SUCCESS;
}
#endif