implement Btos; # encoding details # (Traditional) Shift-JIS # # 00..1f control characters # 20 space # 21..7f JIS X 0201:1976/1997 roman (see notes) # 80 undefined # 81..9f lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997 # a0 undefined # a1..df JIS X 0201:1976/1997 katakana # e0..ea lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997 # eb..ff undefined # # CP932 (windows-31J) # # this encoding scheme extends Shift-JIS in the following way # # eb..ec undefined (marked as lead bytes - see notes below) # ed..ee lead byte of NEC-selected IBM extended characters # ef undefined (marked as lead byte - see notes below) # f0..f9 lead byte of User defined GAIJI (see note below) # fa..fc lead byte of IBM extended characters # fd..ff undefined # # # Notes # # JISX 0201:1976/1997 roman # this is the same as ASCII but with 0x5c (ASCII code for '\') # representing the Yen currency symbol '¥' (U+00a5) # This mapping is contentious, some conversion packages implent it # others do not. # The mapping files from The Unicode Consortium show cp932 mapping # plain ascii in the range 00..7f whereas shift-jis maps 16r5c ('\') to the yen # symbol (¥) and 16r7e ('~') to overline (¯) # # CP932 double-byte character codes: # # eb-ec, ef, f0-f9: # Marked as DBCS LEAD BYTEs in the unicode mapping data # obtained from: # https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT # # but there are no defined mappings for codes in this range. # It is not clear whether or not an implementation should # consume one or two bytes before emitting an error char. # include "sys.m"; include "convcs.m"; sys : Sys; MAXINT : con 16r7fffffff; BADCHAR : con 16rFFFD; KANAPAGES : con 1; KANAPAGESZ : con 63; KANACHAR0 : con 16ra1; CP932PAGES : con 45; # 81..84, 87..9f, e0..ea, ed..ee, fa..fc CP932PAGESZ : con 189; # 40..fc (including 7f) CP932CHAR0 : con 16r40; shiftjis := 0; page0 := array [256] of { * => BADCHAR }; cp932 : string; dbcsoff := array [256] of { * => -1 }; init(arg : string) : string { sys = load Sys Sys->PATH; shiftjis = arg == "shiftjis"; (error, kana) := getmap("/lib/convcs/jisx0201kana", KANAPAGESZ, KANAPAGES); if (error != nil) return error; (error, cp932) = getmap("/lib/convcs/cp932", CP932PAGESZ, CP932PAGES); if (error != nil) return error; # jisx0201kana is mapped into 16rA1..16rDF for (i := 0; i < KANAPAGESZ; i++) page0[i + KANACHAR0] = kana[i]; # 00..7f same as ascii in cp932 for (i = 0; i <= 16r7f; i++) page0[i] = i; if (shiftjis) { # shift-jis uses JIS X 0201 for the ASCII range # this is the same as ASCII apart from # 16r5c ('\') maps to yen symbol (¥) and 16r7e ('~') maps to overline (¯) page0['\\'] = '¥'; page0['~'] = '¯'; } # pre-calculate DBCS page numbers to mapping file page numbers # and mark codes in page0 that are DBCS lead bytes pnum := 0; for (i = 16r81; i <= 16r84; i++){ page0[i] = -1; dbcsoff[i] = pnum++; } for (i = 16r87; i <= 16r9f; i++){ page0[i] = -1; dbcsoff[i] = pnum++; } for (i = 16re0; i <= 16rea; i++) { page0[i] = -1; dbcsoff[i] = pnum++; } if (!shiftjis) { # add in cp932 extensions for (i = 16red; i <= 16ree; i++) { page0[i] = -1; dbcsoff[i] = pnum++; } for (i = 16rfa; i <= 16rfc; i++) { page0[i] = -1; dbcsoff[i] = pnum++; } } return nil; } btos(nil : Convcs->State, b : array of byte, n : int) : (Convcs->State, string, int) { nbytes := 0; str := ""; if (n == -1) n = MAXINT; for (i := 0; i < len b && len str < n; i++) { b1 := int b[i]; ch := page0[b1]; if (ch != -1) { str[len str] = ch; nbytes++; continue; } # DBCS i++; if (i >= len b) break; pnum := dbcsoff[b1]; ix := (int b[i]) - CP932CHAR0; if (pnum == -1 || ix < 0 || ix >= CP932PAGESZ) str[len str] = BADCHAR; else str[len str] = cp932[(pnum * CP932PAGESZ)+ix]; nbytes += 2; } return (nil, str, nbytes); } getmap(path : string, pgsz, npgs : int) : (string, string) { fd := sys->open(path, Sys->OREAD); if (fd == nil) return (sys->sprint("%s: %r", path), nil); buf := array[(pgsz * npgs) * Sys->UTFmax] of byte; nread := 0; for (;nread < len buf;) { n := sys->read(fd, buf[nread:], Sys->ATOMICIO); if (n <= 0) break; nread += n; } map := string buf[:nread]; if (len map != (pgsz * npgs)) return (sys->sprint("%s: bad data", path), nil); return (nil, map); }