// OpenVMS Source Code Demos
// INTERNATIONALIZATION_DEMO_101
//==================================================================================================
// title : internationalization_demo_101.c
// author : Neil Rieck
// created: 2016-03-09
// purpose: a playground to trial a few ideas
// notes : 1)"I18N" means "InternationalizatioN" (18 chars between I and N of "InternationalizatioN")
// 2) "L10N" means "LocalizatioN" (10 chars between L and N of "LocalizatioN")
// 3) version 100 should run properly on any OpenVMS/VMS system
// 4) version 101 will only run properly if optional kit "VMSI18N" is installed
//
// ver who when what
// --- --- -------- --------------------------------------------------------------------------------
// 100 NSR 20160310 1. original effort
// NSR 20160311 2. the saga continues
// 101 NSR 20160311 1. changes to demo conversions to UTF-8
//==================================================================================================
#include <stdio.h>
#include <locale.h>
#include <string.h>
#include <wchar.h>
#include <iconv.h>
#include <errno.h>
//
// --- locale probing -------------------------------------------------------
char *locale;                               // string returned by setlocale()

// --- iconv descriptor and its bookkeeping ---------------------------------
char fromcodeset[30], tocodeset[30];        // codeset names handed to iconv_open()
iconv_t iconv_struct;                       // conversion descriptor from iconv_open()
int iconv_opened;                           // non-zero once iconv_open() has succeeded
int iconv_status;                           // last value reported by iconv()

// --- conversion buffers: 0 = input text, 1 = converted output -------------
char buffer0[32767], buffer1[32767];
char *buf0ptr, *buf1ptr;                    // cursors into buffer0 / buffer1
unsigned int buf0len, buf1len;              // byte counts paired with the cursors
//
// main()
//
//
// Demo driver:
//  1. reports the program name and the locale selected by setlocale(LC_ALL,"")
//  2. opens an iconv descriptor converting ISO8859-1-EURO to UTF-8
//  3. converts one test string and dumps the resulting bytes in hex
//  4. closes the descriptor
// Every step prints a "-i-" (info) or "-e-" (error) line to stdout.
// Returns 1 unconditionally (see the note at the return statement).
//
int main(int argc, char **argv){
    // argv[0] may legitimately be NULL when argc is 0
    printf("-i-Program: %s\n", (argc > 0 && argv[0] != NULL) ? argv[0] : "(unknown)");
    //===========================================================================
    // fun with locale
    //===========================================================================
    // 1. folder SYS$I18N_LOCALE will usually contain 2 files
    // 2. installing optional kit "VMSI18N" will increase this number to 97
    //---------------------------------------------------------------------------
    locale = setlocale(LC_ALL, "");                     // adopt (and report) the environment's locale
    if (locale == NULL) {
        printf("-e-could not determine the locale\n");
    } else {
        printf("-i-current locale: %s\n", locale);      // usually will display "C"
    }
    //===========================================================================
    // fun with iconv (international conversion)
    //===========================================================================
    // 1. This routine will look for, and load, this file:
    //      SYS$I18N_ICONV:ISO8859-1-EURO_UTF-8.ICONV
    //                     ^^^^^^^^^^^^^^              from codeset
    //                                   ^             separator
    //                                    ^^^^^        to codeset
    // 2. Folder SYS$I18N_ICONV will usually contain 10 files (none related to UTF-8)
    // 3. Installing optional kit "VMSI18N" will increase this number to 173 (46 related to UTF-8)
    // 4. These character sets (from 2010) are now quite old as far as the internet is concerned. For
    //    example, in the real world we see Windows-1252 (also known as ANSI) supporting 32 characters
    //    not supported in ISO-8859-1 (the first char being "Euro" at position 0x80). The HTML5
    //    specification says that all browsers expecting ISO-8859-1 must be prepared to handle
    //    windows-1252/ANSI, so this hack will determine if file "ISO8859-1-EURO_UTF-8.ICONV" properly
    //    handles windows-1252/ANSI symbols
    //---------------------------------------------------------------------------
    snprintf(fromcodeset, sizeof fromcodeset, "%s", "ISO8859-1-EURO");  // one byte characters
    snprintf(tocodeset,   sizeof tocodeset,   "%s", "UTF-8");           // variable byte characters
    iconv_opened = 0;
    iconv_struct = iconv_open(tocodeset, fromcodeset);
    if (iconv_struct == (iconv_t) -1) {
        const int open_err = errno;                     // capture before printf() can disturb errno
        switch (open_err) {
        case EMFILE:
        case ENFILE:
            printf("-e-too many conversion files to open\n");
            break;
        case ENOMEM:
            printf("-e-not enough memory\n");
            break;
        case EINVAL:
            printf("-e-unsupported Conversion\n");
            break;
        default:
            printf("-e-unexpected error (%d) from iconv_open()\n", open_err);
            break;
        }
    } else {
        iconv_opened = 1;
        printf("-i-iconv_open was sucessful\n");
    }
    //
    // convert a string or two
    //
    if (iconv_opened) {
        snprintf(buffer0, sizeof buffer0,
                 "Test with copyright %c and Euro %c and TM %c", 0xa9, 0x80, 0x99);
        //
        // warnings:
        // 1) "ISO8859-1-EURO_UTF-8.ICONV" is not working properly (or at least not like
        //    windows-1252/ANSI as I had hoped)
        //
        //    char  windows-1252  unicode  expected utf-8  resultant utf-8
        //    ----  ------------  -------  --------------  ---------------
        //    Euro  0x80          0x20ac   e2 82 ac        c2 80 (wrong)
        //    TM    0x99          0x2122   e2 84 a2        c2 99 (wrong)
        //
        //    Why is this wrong? UTF-8 is an encoding of Unicode rather than a character set, and
        //    "c2 80" / "c2 99" are the UTF-8 encodings of U+0080 / U+0099, i.e. the input bytes were
        //    passed through as code points instead of being remapped to U+20AC / U+2122.
        //    (rfc3629 restricts UTF-8 to U+0000..U+10FFFF, which is 21 bits, with the surrogate
        //    range as a hole; "unicode 8" only defines 120,737 characters)
        //
        // 2) so we could send this output hoping for the best, or...
        // 3) use iconv() to convert ISO8859-1-EURO to UCS-2, then
        //    write some code to remap 27 of 32 characters to unicode, then
        //    use iconv() to convert UCS-2 to UTF-8
        //
        // iconv() updates its byte counters through size_t pointers, so genuine size_t objects
        // are used here; pointing it at the unsigned int globals is only safe where the two types
        // happen to have the same width.
        size_t in_left  = strlen(buffer0);              // bytes still to convert
        size_t out_left = sizeof buffer1;               // bytes still free in the output buffer
        const size_t in_before  = in_left;              // hacking (because enquiring minds want to know)
        const size_t out_before = out_left;             // ditto
        buf0ptr = buffer0;                              // iconv requires a pointer to a pointer
        buf1ptr = buffer1;                              // ditto
        //
        // size_t iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft);
        //
        const size_t conv_rc  = iconv(iconv_struct, &buf0ptr, &in_left, &buf1ptr, &out_left);
        const int    conv_err = errno;                  // capture before printf() can disturb errno
        // keep the file-scope bookkeeping variables in step with what actually happened
        iconv_status = (conv_rc == (size_t) -1) ? -1 : (int) conv_rc;
        buf0len = (unsigned int) in_left;
        buf1len = (unsigned int) out_left;
        printf("-i-iconv_status: %ld\n", (long) iconv_status);
        printf("-i-buf0len : %lu -> %lu\n", (unsigned long) in_before,  (unsigned long) in_left);
        printf("-i-buf1len : %lu -> %lu\n", (unsigned long) out_before, (unsigned long) out_left);
        if (conv_rc == (size_t) -1) {
            switch (conv_err) {
            case EILSEQ:
                printf("-e-invalid byte sequence in the input\n");
                break;
            case EINVAL:
                printf("-e-incomplete byte sequence at the end of the input\n");
                break;
            case E2BIG:
                printf("-e-output buffer is too small\n");
                break;
            default:
                printf("-e-unexpected error (%d) from iconv()\n", conv_err);
                break;
            }
        }
        // peek at our data buffer (whatever was produced, even after a partial conversion)
        const size_t produced = out_before - out_left;
        for (size_t pos = 0; pos < produced; pos++) {
            printf("-i-position: %5lx data: %2x\n",
                   (unsigned long) pos, (unsigned int) (unsigned char) buffer1[pos]);
        }
    }
    //
    // time to exit
    // but do we need to do all this? Perhaps just a blind call to iconv_close(iconv_struct)
    //
    if (iconv_opened) {
        if (iconv_close(iconv_struct) == -1) {
            const int close_err = errno;
            switch (close_err) {
            case EBADF:
                printf("-e-conversion descriptor is invalid\n");
                break;
            default:
                printf("-e-unexpected error (%d) from iconv_close()\n", close_err);
                break;
            }
        }
        iconv_opened = 0;                               // descriptor must not be reused after close
    }
    //---------------------------------------------------------------------------
    printf("-i-exit\n");
    // NOTE(review): exit status 1 is kept from the original; presumably chosen because an odd
    // status means success on OpenVMS, whereas POSIX shells read 1 as failure -- confirm before
    // building this anywhere else.
    return 1;
}