OpenVMS Source Code Demos

BASIC_ISO_8859_TO_ASCII

1000	%title "iso-8859-to-ascii-xxx.bas"					!
	%ident                      "Version_104.4"				! <<<---+
	declare string constant	k_version = "104.4"			,	! <<<---+					&
				k_program = "iso-8859-to-ascii"			!
	!=========================================================================================================================
	! title  : iso-8859-to-ascii-xxx.BAS
	! author : Neil Rieck (https://neilrieck.net/) (n.rieck@bell.net)
	! created: 2001-08-22
	! caveats: Terminal Emulations (2013):
	!	   1) this program appears to have some visual glitches when used with Attachmate Reflection 14.1 using default
	!		settings for VT-320 (Host Character Set: "DEC Suplemental", PC Character Set: "Windows"). Changing the
	!		host character set to "ISO-Latin-1 (8859-1)" does not improve things while "Windows Latin (1252)" does.
	!		As expected, changing host character set to "UTF-8" displays UTF pairs as one character.
	!	   2) this program did not appear to have any visual glitches when used with Tera Term 4.78 where coding defaults
	!		to "UTF-8" with codepage 65001 (a.k.a. Microsoft). However, all the 2-character UTF data is displayed as
	!		one character.
	!
	! History:
	! ver who when   what
	! --- --- ------ ---------------------------------------------------------------------------------------------------------
	! 100 NSR 020822 1. original stub (for use else where)
	! 101 NSR 081211 1. started work on integrating a unicode translator
	! 102 NSR 121009 1. bug fix in unicode decoder
	! 103 NSR 121009 1. more work on the test cases
	! 104 NSR 130829 1. improved documentation
	!		 2. changes before copying function unicode_to_iso to our external function library
	!		 3. created an iso_to_unicode function for better reverse testing
	!     NSR 130830 4. a few bug fixes in the test case 256
	!========================================================================================================================
	option type=explicit							! no kid stuff
	set no prompt								! no ? on input
	!
	external string function unicode_to_iso( string, long )			!
	external string function iso_to_unicode( string )			!
	external string function long_to_hex( long, long )			!
	!
	declare string constant	htab	= '9'C			,		! horizontal tab		&
				null	= '0'C			,		! null				&
				alpha	= "0123456789ABCDEF"			!
	!
	declare string	iso_translate$						,&
			input_data$						,&
			temp$							,&
			output_data$						,&
			choice$							,&
			junk$							,&
			junk1$							,&
			junk2$							,&
			junk3$							,&
		long	i%							,&
			debug%							,&
			junk%							,&
			column
	!
	!=======================================================================
	!	<<< init >>>
	!=======================================================================
2000	init:
	debug% = 0								!
	gosub init_iso_translator						!
	!
	!	sample data arrays
	!
	dim	string u$(160 to 255)						! unicode data
	dim	string h$(160 to 255) 						! hex data
	declare	string r256$							! raw data
	declare	string u256$							! unicode data
	dim string g$(10)							! german phrase
	!-----------------------------------------------------------------------
	!	source: http://de.wikipedia.org/wiki/Reinheitsgebot
	!
	g$(1) = 'Die erste Erw�hnung der Bezeichnung ,Reinheitsgebot" ist in einem'
	g$(2) = '!Sitzungsprotokoll des bayrischen Landtags vom 4. M�rz 1918 belegt.'
	g$(3) = '!Die Bezeichnung setzte sich jedoch erst allm�hlich durch, au�erhalb'
	g$(4) = '!Bayerns erst w�hrend des Streits um das sogenannte ,S��bier" in den'
	g$(5) = '!1950er-Jahren.  Sowohl bayrische als auch au�erbayrische Zeitungen'
	g$(6) = '!berichteten h�ufig'
	!-----------------------------------------------------------------------
	!
	!	load sample data arrays
	!
	for i% = 160 to 255							! &nbsp; to &yuml;
	    u$(i%) = iso_to_unicode(chr$(i%))					! create unicode data from chr$(i%)
	    h$(i%) = long_to_hex(i%,1)						! create hex data from i%
	next i%									!
	!
	junk$ = ""								! init
	for i% = 160 to 255							! &nbsp; to &yuml;
	    junk$ = junk$ + chr$(i%)						! build one big string
	next i%									!
	r256$= junk$								! save a copy for later
	u256$= iso_to_unicode (junk$)						! convert to unicode
	!
	!=======================================================================
	!	<<< main >>>
	!=======================================================================
3000	main:
	print k_program +"_"+ k_version						!
	print string$(len(k_program +"_"+ k_version), asc("="))			! underline previous line (how will this optimize?)
	!
	print "test-data menu: "
	column = 0								! init
	for i% = 160 to 255							! 160-255 test cases
	    print "(";								!
	    print using "<0>##"; i%;						!
	    print ") hex=";							!
	    print h$(i%);							!
	    print " utf=";u$(i%);" ";						!
	    column = column + 1							!
	    if column = 5 then							!
		column = 0							!
		print								!
	    end if								!
	next i%									!
	print									! EOL
	print "(256) raw="; r256$						!
	print "      utf="; u256$						!
	print "(257) decode German phrases"					!
	print "enter? (160-257, or raw text) ";					!
	when error in								!
	    linput choice$							!
	use									!
	    print "-e-error: "+str$(err)+" during input"			!
	end when								!
	when error in								!
	    junk% = integer(choice$)						!
	use									!
	    junk% = 999								!
	end when								!
	select junk%								!
	    case < 160								!
		goto exit_now							!
	    case 160 to 255							!
		temp$ = u$(junk%)						!
	    case 256								!
		temp$ = u256$							!
	    case 257								!
		i% = 1								! init
		while g$(i%) <> ""						! scan the array
		    junk1$ = g$(i%)						! copy and remember phrase
		    print str$(i%)+" german : "+ junk1$				! show the original phrase
		    junk2$ = iso_to_unicode( junk1$ )				! convert to unicode
		    print str$(i%)+" unicode: ";junk2$				! show it
		    junk3$ = unicode_to_iso(junk2$,0)				! convert to iso
		    print str$(i%)+" iso    : ";junk3$;				! show it (should be the same)
		    if junk1$ = junk3$ then					!
			print " (same)"						!
		    else							!
			print " (different)"					!
		    end if							!
		    i% = i% + 1							!
		next 								!
		goto exit_now							!
	    case 999								!
		temp$ = edit$( choice$, 128+16+8)				! no trailing, compress, no leading
	    case else								!
		goto exit_now							!
	end select								!
	print "-i- utf data   : "+ temp$					!
	temp$	= unicode_to_iso(temp$,debug%)					!
	print "-i- 8-bit ASCII: "+ temp$					!
	output_data$ = xlate$( temp$, iso_translate$)				! translate ISO -> ASCII
	print "-i- 7-bit ASCII: "+ output_data$					!
	exit_now:
	print "============================================================"
	goto fini								!
	!
	!========================================================================================
	!
	!	<<< init ISO-8859-1 character translator >>>
	!
	!	build an ASCII translation table
	!	notes:	1. remember that <NUL> is in position #1 of iso_translate$
	!		2. this routine converts some 8-bit characters into 7-bit via translation
	!========================================================================================
	init_iso_translator:
	!
	iso_translate$ = ""							! init
	!
    %let %paranoid = 1%								! paranoid filtering	(lowest 14 become <null>)
    %if  %paranoid = 0%								!
    %then									! normal filtering	(lowest 14 are as-is)
	for i% = 0 to 13							! build 7-bit table ( from <NUL> to <CR> )
	    iso_translate$ = iso_translate$ + chr$(i%)				! use these as-is
	next i%									!
    %else									! paranoid filtering	(lowest 14 become <null>)
	for i% = 0 to 13							! build 7-bit table ( from <NUL> to <CR> )
	    iso_translate$ = iso_translate$ + null				! change these to <null>
	next i%									!
	mid$( iso_translate$,  9 +1, 1) = htab					! restore our delimiter
	mid$( iso_translate$, 13 +1, 1) = " "					! convert <cr> to <sp>
    %end %if									!
	for i% = 14 to 31							! build 7-bit table ( from <SO> to <US> )
	    iso_translate$ = iso_translate$ + null				! change these to <null>
	next i%									!
	for i% = 32 to 127							! build 7-bit table
            iso_translate$ = iso_translate$ + chr$(i%)				! use these as-is
	next i%									!
	!
	!	now patch the lower ascii translation table as required by your application
	!
	mid$( iso_translate$, 96%  +1, 1) = "'"					! convert "`" to "'"
!~~~	mid$( iso_translate$, 123% +1, 1) = " "					x map "{"   to <sp>
!~~~	mid$( iso_translate$, 124% +1, 1) = " "					x map "}"   to <sp>
!~~~	mid$( iso_translate$, 125% +1, 1) = " "					x map "|"   to <sp>
!~~~	mid$( iso_translate$, 126% +1, 1) = null				x map "~"   to <null>
	mid$( iso_translate$, 127% +1, 1) = null				! map <DEL> to <null>
	!
	!	this area contains the second half of the 8 bit character set
	!
	for i% = 128 to 159							!
	    iso_translate$ = iso_translate$ + null				! change these to <null>
	next i%									!
	for i% = 160 to 255							!
	    iso_translate$ = iso_translate$ + " "				! change these to <sp>
	next i%									!
	!
	!	now patch the upper ascii translation table (a subjective translation of ISO-8859-1)
	!
	mid$( iso_translate$, 160 +1, 1) = " "					! non-break space
	mid$( iso_translate$, 161 +1, 1) = "!"					! inverted exclamation
	mid$( iso_translate$, 162 +1, 1) = "c"					! cents sign
	mid$( iso_translate$, 163 +1, 1) = "$"					! pound sign
	mid$( iso_translate$, 164 +1, 1) = "$"					! currency sign
	mid$( iso_translate$, 165 +1, 1) = "$"					! yen sign
	!
	mid$( iso_translate$, 169 +1, 1) = "c"					! copyright sign
	!
	mid$( iso_translate$, 171 +1, 1) = "<"					! left angle quotes
	!
	mid$( iso_translate$, 174 +1, 1) = "r"					! registered trademark
	!
	mid$( iso_translate$, 177 +1, 1) = "+"					! plus-minus sign
	!
	mid$( iso_translate$, 180 +1, 1) = "'"					! spacing acute
	mid$( iso_translate$, 181 +1, 1) = "u"					! mirco sign
	!
	mid$( iso_translate$, 183 +1, 1) = "."					! middle dot
	mid$( iso_translate$, 184 +1, 1) = ","					! spacing cedilla
	!
	mid$( iso_translate$, 187 +1, 1) = ">"					! right angle quotes
	!
	mid$( iso_translate$, 191 +1, 1) = "?"					! inverted question mark
	mid$( iso_translate$, 192 +1, 1) = "A"					! A	grave
	mid$( iso_translate$, 193 +1, 1) = "A"					!	acute
	mid$( iso_translate$, 194 +1, 1) = "A"					!	circumflex
	mid$( iso_translate$, 195 +1, 1) = "A"					!	tilde
	mid$( iso_translate$, 196 +1, 1) = "A"					!	diaresis
	mid$( iso_translate$, 197 +1, 1) = "A"					!	ring
	mid$( iso_translate$, 198 +1, 1) = "A"					!	ligature
	mid$( iso_translate$, 199 +1, 1) = "C"					! C	cedilla
	mid$( iso_translate$, 200 +1, 1) = "E"					! E	grave
	mid$( iso_translate$, 201 +1, 1) = "E"					!	acute
	mid$( iso_translate$, 202 +1, 1) = "E"					!	circumflex
	mid$( iso_translate$, 203 +1, 1) = "E"					!	diaresis
	mid$( iso_translate$, 204 +1, 1) = "I"					! I	grave
	mid$( iso_translate$, 205 +1, 1) = "I"					!	acute
	mid$( iso_translate$, 206 +1, 1) = "I"					!	circumflex
	mid$( iso_translate$, 207 +1, 1) = "I"					!	diaresis
	mid$( iso_translate$, 208 +1, 1) = "D"					! ETH
	mid$( iso_translate$, 209 +1, 1) = "N"					! N	tilde
	mid$( iso_translate$, 210 +1, 1) = "O"					! O	grave
	mid$( iso_translate$, 211 +1, 1) = "O"					!	acute
	mid$( iso_translate$, 212 +1, 1) = "O"					!	circumflex
	mid$( iso_translate$, 213 +1, 1) = "O"					!	tilde
	mid$( iso_translate$, 214 +1, 1) = "O"					!	diaresis
	mid$( iso_translate$, 215 +1, 1) = "x"					! Multiply Sign
	mid$( iso_translate$, 216 +1, 1) = "O"					! O	slash
	mid$( iso_translate$, 217 +1, 1) = "U"					! U	grave
	mid$( iso_translate$, 218 +1, 1) = "U"					!	acute
	mid$( iso_translate$, 219 +1, 1) = "U"					!	circumflex
	mid$( iso_translate$, 220 +1, 1) = "U"					!	diaresis
	mid$( iso_translate$, 221 +1, 1) = "Y"					! Y	acute
	mid$( iso_translate$, 222 +1, 1) = "p"					! thorn
	mid$( iso_translate$, 223 +1, 1) = "B"					! sharp s
	mid$( iso_translate$, 224 +1, 1) = "a"					! a	grave
	mid$( iso_translate$, 225 +1, 1) = "a"					!	acute
	mid$( iso_translate$, 226 +1, 1) = "a"					!	circumflex
	mid$( iso_translate$, 227 +1, 1) = "a"					!	tilde
	mid$( iso_translate$, 228 +1, 1) = "a"					!	diaeresis
	mid$( iso_translate$, 229 +1, 1) = "a"					!	ring
	mid$( iso_translate$, 230 +1, 1) = "a"					!	ligature
	mid$( iso_translate$, 231 +1, 1) = "c"					! c	cedilla
	mid$( iso_translate$, 232 +1, 1) = "e"					! e	grave
	mid$( iso_translate$, 233 +1, 1) = "e"					!	acute
	mid$( iso_translate$, 234 +1, 1) = "e"					!	circumflex
	mid$( iso_translate$, 235 +1, 1) = "e"					!	diaeresis
	mid$( iso_translate$, 236 +1, 1) = "i"					! i	grave
	mid$( iso_translate$, 237 +1, 1) = "i"					!	acute
	mid$( iso_translate$, 238 +1, 1) = "i"					!	circumflex
	mid$( iso_translate$, 239 +1, 1) = "i"					!	diaeresis
	mid$( iso_translate$, 240 +1, 1) = "o"					! eth
	mid$( iso_translate$, 241 +1, 1) = "n"					! n	tilde
	mid$( iso_translate$, 242 +1, 1) = "o"					! o	grave
	mid$( iso_translate$, 243 +1, 1) = "o"					!	acute
	mid$( iso_translate$, 244 +1, 1) = "o"					!	circumflex
	mid$( iso_translate$, 245 +1, 1) = "o"					!	tilde
	mid$( iso_translate$, 246 +1, 1) = "o"					!	diaeresis
	mid$( iso_translate$, 247 +1, 1) = "/"					! division sign
	mid$( iso_translate$, 248 +1, 1) = "o"					! o	slash
	mid$( iso_translate$, 249 +1, 1) = "u"					! u	grave
	mid$( iso_translate$, 250 +1, 1) = "u"					!	acute
	mid$( iso_translate$, 251 +1, 1) = "u"					!	circumflex
	mid$( iso_translate$, 252 +1, 1) = "u"					!	diaeresis
	mid$( iso_translate$, 253 +1, 1) = "y"					! y	acute
	mid$( iso_translate$, 254 +1, 1) = "b"					! thorn
	mid$( iso_translate$, 255 +1, 1) = "y"					! y	diaeresis
	return									!
	!=======================================================================
	!	<<< adios... >>>
	!=======================================================================
31000	fini:									!
	end									!
	!
	!########################################################################################################################
	!
	!======================================================================================
	! Title	: unicode to iso
	! note	: UTF-8 encoding (see RFC 2279) http://www.faqs.org/rfcs/rfc2279.html
	! entry : string data to scan
	! return: resultant string
	!
	! UCS-4 range         UTF-8 octet sequence (binary)				payload data
	! ------------------- -----------------------------				------------
	! 0000,0000-0000,007F 0xxxxxxx							 7-data bits
	! 0000,0080-0000,07FF 110xxxxx 10xxxxxx						11-data bits
	! 0000,0800-0000,FFFF 1110xxxx 10xxxxxx 10xxxxxx				16-data bits
	! 0001,0000-001F,FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx			21-data bits
	! 0020,0000-03FF,FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx		26-data bits
	! 0400,0000-7FFF,FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx	31-data bits
	!=======================================================================================
32000	function string unicode_to_iso( string inbound$, long debug% )		!
	option type=explicit							!
	declare string	cpy$,				&
			temp$,				&
		long	i%, j%, k%, z%
	!
	cpy$ = inbound$								! copy original data
	for i% = 1 to len(cpy$)							! scan the string
	    j% = asc(mid$(cpy$,i%,1))						! test current character
	    !
	    !	7-bit test
	    !
	    if (j% and B"10000000"L) = 0 then					!
		goto next_character						! leave "as-is"
	    end if								!
	    !
	    !	2-character test	(first 3-bits of byte-1 must be "110")
	    !				(first 2-bits of byte-2 must be "10" )
	    !
	    if (j% and B"11100000"L) = 192 then					! must only be 110xxxxx
		!
		!	2-character transformations
		!	byte-1   byte-2
		!	110xxxxx 10xxxxxx
		!
		j% = j% and B"11111"L						! keep lower 5 bits for processing
		j% = j% * 64%							! shift left by 6 places (prep for merge)
		k% = asc(mid$(cpy$,i%+1,1))					! grab next character
		if (k% and B"11000000"L) <> 128 then				! must only be 10xxxxxx
		    select debug%						!
			case 0							!
			case 1 to 90						!
			    print "-e-unicode sanity error"			!
			    print "-i-character: "+str$(i%+1)			!
			case 91 to 109						!
			    when error in					!
				print #debug%,"-e-unicode sanity error"		!
				print #debug%,"-i-character: "+str$(i%+1)	!
			    use							!
			    end when						!
		    end select							!
		    goto next_character						!
		end if								!
		k% = k% and B"111111"L						! keep lower 6 bits for processing
		z% = j% or k%							! merge the bits
		mid$(cpy$,i%  ,1) = chr$((z% and B"111100000000"L)/256%)	! process and write back to char+0
		mid$(cpy$,i%+1,1) = chr$( z% and B"11111111"L)	    		! process and write back to char+1
		i% = i% + 1							! fix pointer
	    end if								!
	    !
	    !	3-character test (not yet required on our system)
	    !
	    if (j% and B"11110000"L) = 224 then					! keep 4 highest bits for group test
	    end if								!
	    !
	    !	4-character test (not yet required on our system)
	    !
	    if (j% and B"11111000"L) = 240 then					! keep 5 highest bits for group test
	    end if								!
	    !
	    !	5-character test (not yet required on our system)
	    !
	    if (j% and B"11111100"L) = 248 then					! keep 6 highest bits for group test
	    end if								!
	    !
	    !	7-character test (not yet required on our system)
	    !
	    if (j% and B"11111110"L) = 252 then					! keep 6 highest bits for group test
	    end if								!
	    !
	    !	if we get here then the character was a natural 8-bit byte
	    !
	    next_character:							!
	next i%									!
	!
	cpy$ = edit$(cpy$, 4)							! remove control characters
	if cpy$ <> inbound$  then						!
	    select debug%							!
		case 0								!
		case 1 to 90							!
		    print "==============================================="	!
		    print "-i- UTF-8      : "+ inbound$				!
		    print "-i- ISO-8859-1 : "+ cpy$				!
		    print "==============================================="	!
		case 91 to 109							!
		    when error in
			print #debug%,"-i- UTF-8      : "+ inbound$		!
			print #debug%,"-i- ISO-8859-1 : "+ cpy$			!
		    use								!
		    end when							!
	    end select								!
	end if									!
	!
	function_exit:								!
	unicode_to_iso = cpy$							!
	end function								!
	!
	!=======================================================================================
	! Title	: iso to unicode
	! note	: UTF-8 encoding (see RFC 2279) http://www.faqs.org/rfcs/rfc2279.html
	! entry : string data to scan
	! return: resultant string
	!
	! UCS-4 range (hex.)  UTF-8 octet sequence (binary)					4-bit nibble	payload data
	! ------------------- -----------------------------					------------	------------
	! 0000 0000-0000 007F 0xxxxxxx							 			 7-data bits
	! 0000 0080-0000 07FF 110xxxxx 10xxxxxx							0xCx-0xDx	11-data bits
	! 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx					0xEx		16-data bits
	! 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx				0xFx		21-data bits
	! 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx			0xFx		26-data bits
	! 0400 0000-7FFF FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx	0xFx		31-data bits
	!=======================================================================================
32010	function string iso_to_unicode( string inbound$ )			!
	option type=explicit							!
	declare string	out$,				&
			temp$,				&
		long	i%, j%, k%, l%
	!
	out$ = ""								! init
	for i% = 1 to len(inbound$)						! scan the string
	    j% = asc(mid$(inbound$,i%,1))					! test current character
	    select j%								!
		case 0 to 127							! must be 7-bit data
		    out$ = out$ + chr$(j%)					! use "as-is"
		case 128 to 255							!
		    l% = j% and B"00111111"L					! isolate lower 6-bits for processing
		    k% = j% and B"11000000"L					! isolate upper 2-bits for processing
		    k% = k% / 64						! shift right by 6 bits
		    out$ = out$ + chr$(X"c0"L or k%) + chr$(128% or l%)		! create unicode data from c2
	     end select								!
	next i%									!
	!
	function_exit:								!
	iso_to_unicode = out$							!
	end function								!

	!========================================================================
	!	long_to_hex()
	!	entry:	inbound%	(data to convert)
	!		size%		(size in bytes)
	!========================================================================
32020	function string long_to_hex( long inbound%, long size% )
	option type=explicit
	declare string constant	alpha	= "0123456789ABCDEF"			!
	declare long	i%, z%, temp%						!
	declare string	junk$
	!
	temp% = inbound%							!
	for i%= 1 to (size% * 2)						!
	    z% = (temp% and 15%)						! isolate last nibble
	    junk$ = mid$(alpha,z%+1,1) + junk$					!
	    temp% = temp% / 16%							! shift by four bits
	next i%
	!
	function_exit:								!
	long_to_hex = junk$							!
	end function								!
	!========================================================================