// OpenVMS Source Code Demos
// UTF8_ENCODE
//================================================================================================
// title : utf8_encode_xxx.c
// author : Neil Rieck
// created : 2016-03-09
// notes :
// 1) This program is my hack to convert outbound web data from Windows-1252 (a superset of
// ISO-8859-1) to UTF-8
// 2) To properly see output data, adjust your terminal emulator to display UTF-8 (this will cause
// certain input characters to be undisplayable)
//
// ver who when what
// --- --- ------ --------------------------------------------------------------------------------
// 100 NSR 160309 1. original effort
//================================================================================================
// UTF-8 encoding
// 1. RFC 2279: http://www.faqs.org/rfcs/rfc2279.html
// 2. RFC 3629: https://tools.ietf.org/html/rfc3629 (limits UTF-8 to 4 octets; only code points up
// to U+10FFFF of the 21-bit address space are valid (notice the 'z' on line 4 of the table below))
//
// UCS-4 range (hex) UTF-8 octet sequence (binary) Data Bits
// ------------------- ----------------------------- ---------
// 0000,0000-0000,007F 0xxxxxxx 7 bits
// 0000,0080-0000,07FF 110xxxxx 10xxxxxx 11 bits
// 0000,0800-0000,FFFF 1110xxxx 10xxxxxx 10xxxxxx 16 bits
// 0001,0000-001F,FFFF 11110zXX 10xxxxxx 10xxxxxx 10xxxxxx 21 bits (RFC limit)
// 0020,0000-03FF,FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 26 bits (invalid)
// 0400,0000-7FFF,FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 31 bits (invalid)
//=================================================================================================
// Build-mode switch:
//   DVLP==1 builds a stand-alone test program (the global test buffers below, plus the main()
//           and trace printf() calls guarded by DVLP elsewhere in this file)
//   DVLP==0 builds only the callable entry point utf_encode()
#define DVLP 1 // develop: 1=program, 0=function
#if (DVLP==0)
void general_encode(char*, char*); // forward declaration (defined later in this file)
// utf_encode: wrapper that passes both arguments straight through to general_encode()
void utf_encode(char *buffer1, char *buffer0) { // buffer1 = output, buffer0 = input
general_encode(buffer1, buffer0);
}
#else
#define SMALL_STRING 255 // usable characters in each test buffer
char buffer0[SMALL_STRING+1]; // test input (+1 for the NUL terminator)
char buffer1[SMALL_STRING+1]; // test output (+1 for the NUL terminator)
#endif
//
#include <stdio.h> //
#include <stdlib.h> //
#include <string.h> //
//
// ansi-to-unicode
// 1. this table maps the 32 byte values 0x80-0x9F of ansi (windows-1252) to unicode code points
//    (these 32 characters represent the difference between iso-8859-1 and ansi (windows-1252))
// 2. the first entry here represents 0x80 (128), so index the table with (byte - 0x80)
// 3. decimal values represent holes in the mapping (better to copy data than discard data);
//    a hole maps to its own byte value (129, 141, 143, 144, 157)
// 4. references
//    ref: https://en.wikipedia.org/wiki/Windows-1252#Code_page_layout
//    ref: http://www.unicode.org/charts/PDF/U0000.pdf (C0 Controls and Basic Latin)
//    ref: http://www.unicode.org/charts/PDF/U0080.pdf (C1 Controls and Latin-1 Supplement)
//    ref: http://www.unicode.org/charts/PDF/U20A0.pdf (alternate currency stuff)
//
static const unsigned long atou[] = {
    0x20ac,    129,0x201a,0x0192,0x201e,0x2026,0x2020,0x2021,   // 0x80-0x87
    0x02c6,0x2030,0x0160,0x2039,0x0152,    141,0x017d,    143,   // 0x88-0x8F
       144,0x2018,0x2019,0x201c,0x201d,0x2022,0x2013,0x2014,   // 0x90-0x97 (0x93 is U+201C)
    0x02dc,0x2122,0x0161,0x203a,0x0153,    157,0x017e,0x0178    // 0x98-0x9F
};
//
// general_encode: encode buffer0 placing the result in buffer1
//
// Converts the NUL-terminated Windows-1252 (ANSI) string in buffer0 to UTF-8 and stores the
// NUL-terminated result in buffer1.
//
// buffer1 : destination. No size is passed in, so no bounds checking is possible here; the
//           caller must supply enough room. Every input byte expands to at most 3 output bytes
//           (the largest mapped code point is U+2122), so 3*strlen(buffer0)+1 bytes always
//           suffices. Should not overlap buffer0, because the output can grow past the read
//           position.
// buffer0 : source text; only read, never written.
//
// Bytes 0x00-0x7F and 0xA0-0xFF are taken as their own code point (ASCII / ISO-8859-1);
// bytes 0x80-0x9F are translated through the atou[] table.
// When built with DVLP==1 the input, the output and every non-ASCII conversion are traced to
// stdout.
//
void general_encode(char *buffer1, char *buffer0) {
    char *dst = buffer1;                                // write cursor
    char *src = buffer0;                                // read cursor
    unsigned char ch;                                   // current input byte
    unsigned long uni;                                  // its code point (large enough for UCS4)
    //--------------------------------------------------------------------------
#if (DVLP==1)
    printf("-i-input : %s\n", buffer0);
    printf("-i-length : %lu\n", (unsigned long) strlen(buffer0));
#endif
    *dst = '\0';                                        // empty input yields an empty output
    while (*src != '\0') {
        ch = (unsigned char) *src++;                    // sample a character
        if ((ch <= 0x7F) || (ch >= 0xA0)) {             // 7-bit ASCII or 8-bit ISO-8859-1
            uni = ch;                                   // character code is the unicode value
        } else {                                        // 0x80-0x9F: ANSI (windows-1252)
            uni = atou[ch - 0x80];
        }
        //
        // convert unicode to utf-8
        //
        if (uni < 0x80) {                               // 1 octet : 0xxxxxxx
            *dst++ = (char) uni;
        } else if (uni < 0x800) {                       // 2 octets: 110xxxxx 10xxxxxx
            *dst++ = (char) (0xC0 | (uni >> 6));
            *dst++ = (char) (0x80 | (uni & 0x3F));
        } else if ((uni >= 0xD800) && (uni <= 0xDFFF)) {
            // surrogate range: unsupported (see RFC); the character is skipped
        } else if (uni < 0x10000) {                     // 3 octets: 1110xxxx 10xxxxxx 10xxxxxx
            *dst++ = (char) (0xE0 | (uni >> 12));
            *dst++ = (char) (0x80 | ((uni >> 6) & 0x3F));
            *dst++ = (char) (0x80 | (uni & 0x3F));
        } else if (uni < 0x110000) {                    // 4 octets: 11110xxx 10xxxxxx (x3)
            *dst++ = (char) (0xF0 | (uni >> 18));
            *dst++ = (char) (0x80 | ((uni >> 12) & 0x3F));
            *dst++ = (char) (0x80 | ((uni >> 6) & 0x3F));
            *dst++ = (char) (0x80 | (uni & 0x3F));
        } else {
            // beyond U+10FFFF: unsupported (see RFC); the character is skipped
        }
        *dst = '\0';                                    // terminate right after the last octet
#if (DVLP==1)
        if (uni >= 128) {
            printf("-i-ansi : %s%x\n", "0x", (unsigned int) ch);
            printf("-i-unicode : %s%lx\n", "0x", uni);
        }
#endif
    }
#if (DVLP==1)
    printf("-i-output : %s\n", buffer1);
    printf("-i-length : %lu\n", (unsigned long) strlen(buffer1));
    printf("----------------------------------------\n");
#endif
}
#if (DVLP==1)
//==============================================================================
// main()
//==============================================================================
// Test driver (DVLP==1 builds only): runs three sample strings through general_encode(),
// which traces its input and output to stdout. Returns EXIT_SUCCESS.
int main(void) {
    printf("-i-test case: ASCII only\n");
    snprintf(buffer0, sizeof buffer0, "this is a test");
    general_encode(buffer1, buffer0);
    //
    printf("-i-test case: ISO-8859-1 (e acute)\n");
    snprintf(buffer0, sizeof buffer0, "%s%c%s", "this is a t", 0xE9, "st");  // 0xE9 = e acute
    general_encode(buffer1, buffer0);
    //
    printf("-i-test case: ANSI (Euro symbol)\n");
    snprintf(buffer0, sizeof buffer0, "%s%c", "Euro Symbol: ", 0x80);        // 0x80 = Euro in windows-1252
    general_encode(buffer1, buffer0);
    return EXIT_SUCCESS;
}
#endif