OpenVMS Source Code Demos
UNICODE_TO_ISO
function string unicode_to_iso(string src$, long debug%) ! debug is not used here
!==================================================================================================
! title : UNICODE_TO_ISO_106.FUN
! caveat1: This function is misnamed. It should have called "utf8_to_cp1252"
! purpose: Scan inbound data looking for legal UTF-8 code sequences. These are converted to unicode
! which is then mapped to cp1252 (also known as Windows-1252; also known as ANSI) which is
! a superset of ISO-8859-1
! caveat2: There are two ways to do this: Strict and Relaxed
! 1) Strict : everything above ASCII 127 must be legal UTF-8 or we throw it away
! 2) Relaxed:
! 2a. anything above ASCII 127 which is not legal UTF-8 is assumed to be cp1252 so we keep it
! 2b. any unicode value above 255 must be mapped back to the equivalent cp1252 code if possible
! otherwise we throw it away (because we only have room for one single byte)
! history:
! ver who when what
! --- --- ------ ----------------------------------------------------------------------------------
! 105 DGM 150401 1. renovation
! 106 NSR 160314 1. renovation (to match utf8_decode.c)
! NSR 170315 2. minor cleanup + documentation changes
!==================================================================================================
! UTF-8 encoding
! 1. RFC-2279: http://www.faqs.org/rfcs/rfc2279.html
! 2. RFC-3629: https://tools.ietf.org/html/rfc3629 (limits UTF-8 to 4 octets; some code points
! in the 21-bit address space are being used (notice the 'z' on line 4))
!
! UCS-4 range (hex) UTF-8 octet sequence (binary) Data Bits
! ------------------- ----------------------------- ---------
! 0000,0000-0000,007F 0xxxxxxx 7 bits
! 0000,0080-0000,07FF 110xxxxx 10xxxxxx 11 bits
! 0000,0800-0000,FFFF 1110xxxx 10xxxxxx 10xxxxxx 16 bits
! 0001,0000-001F,FFFF 11110zXX 10xxxxxx 10xxxxxx 10xxxxxx 21 bits (RFC limit)
! 0020,0000-03FF,FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 26 bits (invalid)
! 0400,0000-7FFF,FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 31 bits (invalid)
!==================================================================================================
option type=explicit !
!
declare string dst$, tst$, alt$ !
declare long uni%, tst%, alt%, i%, j%, bytes% !
!-----------------------------------------------------------------------
! main
!-----------------------------------------------------------------------
for i% = 1 to len(src$) ! scan the string
tst$ = mid$(src$, i%, 1) ! isolate tst character
tst% = asc(tst$) ! convert to ascii
if tst% <= 127 then !
dst$ = dst$ + tst$ ! copy ASCII as-is
goto get_next_char ! next iteration
end if !
!
if (tst% and X"e0") = x"c0" then ! test for: 1100-xxxx
bytes% = 2 ! this might be a 2-byte sequence (or not)
uni% = tst% and x"1f" ! keep 5-bits of octet #1
goto process_uni ! continue below
end if !
if (tst% and X"f0") = x"e0" then ! test for: 1110-xxxx
bytes% = 3 ! this might be a 3-byte sequence (or not)
uni% = tst% and x"0f" ! keep 4-bits of octet #1
goto process_uni ! continue below
end if !
if (tst% and X"f8") = x"f0" then ! test for: 1111-xxxx
bytes% = 4 ! this might be a 4-byte sequence (or not)
uni% = tst% and x"07" ! keep 3-bits of octet #1
goto process_uni ! continue below
end if !
!
! definately not unicode
!
dst$ = dst$ + tst$ ! copy cp1252 as-is
goto get_next_char ! next iteration
!
! this might be a unicode character depending upon the following bytes following
! entry: i% = points to tmp$ (first utf-8 octet)
! bytes% = expected total number of octects (2-4)
!
process_uni: !
for j% = 1 to (bytes%-1) !
alt$ = mid$(src$, i%+j%, 1) ! isolate character after tst$
alt% = asc(alt$) !
if (alt% and x"c0") = x"80" ! is this a secondary utf-8 octet? (10xx-xxxx)
then ! yes
alt% = (alt% and x"7f") ! isolate 6-bits
uni% = uni% * 64% ! shift by 6 places
uni% = uni% + alt% ! merge bits
else ! no
dst$ = dst$ + tst$ ! assume is cp1252 then copy as-is
goto get_next_char !
end if !
next j% !
!
! at this point, this appeared to be a legal UTF-8 sequence
!
!-----------------------------------------------------------
! unicode mapping to cp1252
! note: perhaps a single table lookup would be more efficient
!-----------------------------------------------------------
! unicode ref: http://unicode-table.com/en/
!
select uni% !
case <= 255 !
dst$ = dst$ + chr$(uni%) !
case 8208, 8209 ! x2010,x2011 Unicode: hyphen, non-breaking hyphen
dst$ = dst$ + chr$(45) ! ASCII : dash
case 8210, 8211 ! x2012,x2013 Unicode: Figure Dash, En Dash
dst$ = dst$ + chr$(150) ! Windows: dash
case 8212, 8213 ! x2014,x2015 Unicode: EM dash, Horizontal Bar
dst$ = dst$ + chr$(151) ! Windows: dash
case 8216 ! x2018 Unicode: left side single quote
dst$ = dst$ + chr$(145) ! Windows: left single quote
case 8217 ! x2019 Unicode: right side single quote
dst$ = dst$ + chr$(146) ! Windows: right single quote
case 8220 ! x201c Unicode: left double-quote
dst$ = dst$ + chr$(147) ! Windows: left double-quote
case 8221 ! x201d Unicode: right double-quote
dst$ = dst$ + chr$(148) ! Windows" right double-quote
case 8224 ! x2020 Unicode: dagger
dst$ = dst$ + chr$(134) ! Windows: dagger
case 8225 ! x2021 Unicode: double dagger
dst$ = dst$ + chr$(135) ! Windows: double dagger
case 8226 ! x2022 Unicode: black dot
dst$ = dst$ + chr$(149) ! Windows: black dot
case 8230 ! x2026 Unicode: three dot horizontal ellipsis
dst$ = dst$ + chr$(133) ! Windows: horizontal ellipsis
case 8248 ! x2038 Unicode: caret
dst$ = dst$ + chr$(94) ! Windows: circumflex
case 8249 ! x2039 Unicode: single less than
dst$ = dst$ + chr$(139) ! Windows: single less than
case 8250 ! x203a unicode: single greater than
dst$ = dst$ + chr$(155) ! Windows: single greater than
case 8364 ! x20ac Unicode: Euro symbol
dst$ = dst$ + chr$(128) ! Windows: Euro symbol
case else ! oops, what do we do now?
%if 1=1 %then !
dst$ = dst$ + chr$(182) ! substitute Windows: Pilcrow (funny inverted "P")
%else !
! do nothig (throw it way)
%end %if !
end select !
!-------------------------------------------------------------------
i% = i% + bytes% - 1 ! eat some chars (NEXT will eat one more)
get_next_char: !
next i% ! advance by tst
unicode_to_iso = dst$ ! pass back to called
end function ! adios
!