function long mixed_to_unicode(string inbound$, long uni%()) ! !================================================================================================== ! title : mixed_to_unicode.fun ! purpose: Scan inbound data looking for legal UTF-8 code sequences. These are converted to unicode ! which is then mapped to cp1252 (also known as Windows-1252; also known as ANSI) which is ! a superset of ISO-8859-1 ! caveat1: There are two ways to do this: Strict and Relaxed ! 1) Strict : everything above ASCII 127 must be legal UTF-8 or we throw it away ! 2) Relaxed: anything above ASCII 127 which is not legal UTF-8 is assumed to be cp1252 so ! must be mapped to unicode or thrown away ! caveat2: notice that the second parameter is an array of long ! we translate from inbound$ to this array ! This program can be used in conjunction with function unicode_to_utf() which transalates ! from uni%() to a string ! history: ! ver who when what ! --- --- ------ ---------------------------------------------------------------------------------- ! 100 NSR 170315 1. original effort (derived from the misnamed function: UNICODE_TO_ISO_106.FUN) !================================================================================================== ! UTF-8 encoding ! 1. RFC-2279: http://www.faqs.org/rfcs/rfc2279.html ! 2. RFC-3629: https://tools.ietf.org/html/rfc3629 (limits UTF-8 to 4 octets; some code points ! in the 21-bit address space are being used (notice the 'z' on line 4)) ! ! UCS-4 range (hex) UTF-8 octet sequence (binary) Data Bits ! ------------------- ----------------------------- --------- ! 0000,0000-0000,007F 0xxxxxxx 7 bits ! 0000,0080-0000,07FF 110xxxxx 10xxxxxx 11 bits ! 0000,0800-0000,FFFF 1110xxxx 10xxxxxx 10xxxxxx 16 bits ! 0001,0000-001F,FFFF 11110zXX 10xxxxxx 10xxxxxx 10xxxxxx 21 bits (RFC limit) ! 0020,0000-03FF,FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 26 bits (invalid) ! 0400,0000-7FFF,FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 31 bits (invalid) !================================================================================================== option type=explicit ! ! declare string tst$, alt$, src$ ! declare long uni%, tst%, alt%, i%, j%, k%, bytes%, count_out% ! !----------------------------------------------------------------------- ! main !----------------------------------------------------------------------- k% = len(inbound$) ! measure the length of inbound src$ = inbound$ + space$(6) ! tack on 6-spaces for end-of-string processing for i% = 1 to k% ! scan the string tst$ = mid$(src$, i%, 1) ! isolate tst character tst% = asc(tst$) ! convert to ascii if tst% <= 127 then ! count_out% = count_out% + 1 ! uni%(count_out%) = tst% ! store this value goto get_next_char ! next iteration end if ! ! if (tst% and X"e0") = x"c0" then ! test for: 110x-xxxx bytes% = 2 ! this might be a 2-byte sequence (or not) uni% = tst% and x"1f" ! keep 5-bits of octet #1 goto process_uni ! continue below end if ! if (tst% and X"f0") = x"e0" then ! test for: 1110-xxxx bytes% = 3 ! this might be a 3-byte sequence (or not) uni% = tst% and x"0f" ! keep 4-bits of octet #1 goto process_uni ! continue below end if ! if (tst% and X"f8") = x"f0" then ! test for: 1111-0xxx bytes% = 4 ! this might be a 4-byte sequence (or not) uni% = tst% and x"07" ! keep 3-bits of octet #1 goto process_uni ! continue below end if ! ! ! definately not unicode ! select tst% ! test the original code case 128 uni% = x'20ac' case 129 uni% = 0 case 130 uni% = x'201a' case 131 uni% = x'0192' case 132 uni% = x'201e' case 133 uni% = x'2026' case 134 uni% = x'2020' case 135 uni% = x'2021' case 136 uni% = x'02c6' case 137 uni% = x'2030' case 138 uni% = x'0160' case 139 uni% = x'2039' case 140 uni% = x'0152' case 141 uni% = 0 case 142 uni% = x'017d' case 143 uni% = 0 case 144 uni% = 0 case 145 uni% = x'2018' case 146 uni% = x'2019' case 147 uni% = x'201c' case 148 uni% = x'201d' case 149 uni% = x'2022' case 150 uni% = x'2013' case 151 uni% = x'2014' case 152 uni% = x'02dc' case 153 uni% = x'2122' case 154 uni% = x'0161' case 155 uni% = x'203a' case 156 uni% = x'0153' case 157 uni% = 0 case 158 uni% = x'017e' case 159 uni% = x'0178' case else uni% = tst% end select ! if uni% > 0 then ! count_out% = count_out% + 1 ! uni%(count_out%) = uni% ! store this value end if ! goto get_next_char ! next iteration ! ! might be unicode depending upon the following bytes ! entry: i% = points to tmp$ (first utf-8 octet) ! bytes% = expected total number of octects (2-4) ! process_uni: ! for j% = 1 to (bytes%-1) ! alt$ = mid$(src$, i%+j%, 1) ! isolate character after tst$ alt% = asc(alt$) ! if (alt% and x"c0") = x"80" ! is this a secondary utf-8 octet? (10xx-xxxx) then ! yes alt% = (alt% and x"3f") ! isolate 6-bits uni% = uni% * 64% ! shift this by 6 places uni% = uni% + alt% ! merge bits else ! no count_out% = count_out% + 1 ! uni%(count_out%) = tst% ! store original first byte as-is goto get_next_char ! end if ! next j% ! ! count_out% = count_out% + 1 ! uni%(count_out%) = uni% ! is legal UTF-8 so store unicode !------------------------------------------------------------------- i% = i% + bytes% - 1 ! eat some chars (NEXT will eat one more) get_next_char: ! next i% ! advance by tst uni%(0) = count_out% ! mixed_to_unicode = count_out% ! pass back to called end function ! adios !
Back to OpenVMS
Back to OpenVMS Demo Index
Back to Home
Neil Rieck
Waterloo, Ontario, Canada.