Skip to content

Instantly share code, notes, and snippets.

@Egor-Skriptunoff
Created May 22, 2021 22:44
Show Gist options
  • Save Egor-Skriptunoff/44a88f64f9a497919db4ad8c28259a8f to your computer and use it in GitHub Desktop.
Save Egor-Skriptunoff/44a88f64f9a497919db4ad8c28259a8f to your computer and use it in GitHub Desktop.
String converter between Windows ANSI and UTF-8 encodings
---------------------------------------------------------------------
-- Converter between win-125x and UTF-8 strings
---------------------------------------------------------------------
-- Written in pure Lua, compatible with Lua 5.1-5.4
-- Usage example (loading the file defines two global functions,
-- utf8_to_win and win_to_utf8):
-- require("win-125x")
-- str_win = utf8_to_win(str_utf8)
-- str_utf8 = win_to_utf8(str_win)
---------------------------------------------------------------------
-- Windows codepage handled by both converters. It selects which entry of
-- compressed_mappings is decompressed when this file is loaded, so it must be
-- one of the keys listed below.
local codepage = 1251 -- Set your codepage here
-- The following codepages are supported:
-- 874 Thai
-- 1250 Central European
-- 1251 Cyrillic
-- 1252 Western
-- 1253 Greek
-- 1254 Turkish
-- 1255 Hebrew
-- 1256 Arabic
-- 1257 Baltic
-- 1258 Vietnamese
do
-- Packed mapping data, one long string per supported codepage number.
-- The decoder reads only the non-whitespace characters of the selected string,
-- so the line breaks inside the long strings are not significant.
-- Do not edit the strings: decoding ends with a checksum assertion.
local compressed_mappings = {
-- Unicode to win-125x mappings are taken from unicode.org, compressed and protected by a checksum
[874] = -- Thai, 97 codepoints above U+007F
[[!%l+$"""WN^9=&$pqF'oheO#;0l#"hs)mI[=e!ufwkDB#OwLnJ|IRIUz8Q(MMM]],
[1250] = -- Central European, 123 codepoints above U+007F
[[!<2#?v"1(ro;xh/tL_3hC^i;e~PjO"p<I\aTT};]Rb~M7/]&jRjfwuE%AJ)@XfBQy&\jy[V5:]!RtH]m>Yd8m?6LpsUA\V=x'VcMO<Wz+EOO
0m7U`u|$Y5x?Vk*6+qJ@/0Lie77_b}OEuwv$Qj/w`+J>M*<g2qxD3qEyC&*{VGI'UddQ`GQ)L=lj<{S;Jm),f3yzcQOuxacHSZ{X'XIWzDz!?E
=U0f]],
[1251] = -- Cyrillic, 127 codepoints above U+007F
[[!-[;_8kMai7j]xB$^n)#7ngrX}_b%{<Cdot;P?2J&00&^wX|;]@N*fjq#ioX'v.&gG@ur~3yi8t1;xn40{G#NX?7+hGC{$D"4#oJ//~kflzs
"_\z9qP#}1o|@{t`2NrM%t{MW?X9d6o:MqHl6+z]],
[1252] = -- Western, 123 codepoints above U+007F
[[!)W$<c~\OdA5TJ%/J/{:yoE]K[d,c<Mv+gp_[_UuB52c;H&{leFk%Kd8%cHnvLrB[>|:)t.}QH*)]AD|LqjsB+JCdKmbRIjO,]],
[1253] = -- Greek, 111 codepoints above U+007F
[[!./yDCq;#WAuC\C1R{=[n'FpSuc!"R\EZ|4&J?A3-z?*TI?ufbhFq1J!x@Sjff\!G{o^dDXl|8NLZ!$d'8$f^=hh_DPm!<>>bCgV(>erUWhX
?R+-JP@4ju:Yw#*C]],
[1254] = -- Turkish, 121 codepoints above U+007F
[[!-(R[SPKY>cgcK5cCs4vk%MuL`yFx^Bl#/!l#M@#yoe|Jx+pxZuvh%r>O</n_gb>hDjmG]j#lA{]2"R-Z@(6Wy:Q~%;327b&fRSkF#BM/d+%
iWmSx4E*\F_z=s>QeJBqC^]],
[1255] = -- Hebrew, 105 codepoints above U+007F
[[!.b\.H?S\21+7efm'`w&MW_Jg,mRbB;{X@T\3::DC#7<m_cAE!:%C%c7/,./u[8w*h-iwpz03QY,ay%]MI*D]W&]UG^3(=20a7$zG[Ng7MLt
sXIne(V37A?OO%|Hn13wMh-?^jNzhW`,-]],
[1256] = -- Arabic, 128 codepoints above U+007F
[[!3n8GE$.to/ka%Nx`uOpcib>|9KU-N72!1J4c2NAUE3a,HlOE=M`@rsa||Nh_!og]:dILz9KNlF~vigNH*a0KxwjjfR*]?tO87(a3-RQex^V
Ww&SY{:AqE|s%}@U8%rKcr0,NCjR:N&L'YyGu<us'sN*1pl=gAXOwSJ[v?f;imBhDu_)d$F8T?%S[]],
[1257] = -- Baltic, 116 codepoints above U+007F
[[!:<_.XQ[;n35s%I?g9)b/7DiGwIR)zy&=6?/3)6iO%rSnC_6yjl'8#zeN0vcW_yX/2*J93+EJVrW,^Rhe,h7wWl"}neF2~F[PyD;BcrG*5=J
fh<x!FJ?qSw9Xp!;WB3T<J^x?#Ie`xufezR'\I(eED]3d&)VJL$/+$Zf;W^I>L[3D5F<_IcGpn=oX"JR1%arS|FX|dia4]BeF>d5p`EV+:;*I<
x^Voq{"f]],
[1258] = -- Vietnamese, 119 codepoints above U+007F
[[!3n8C{%C0}&p3gE0~|&RVm9Wr&^ln1}'$gV{bml1oByN*bb:Bm^E;~B3-WjF6Qubq^`Y*6\0^w!DKpK<\7lHVELmSXN{2~B"0C"<1CYN2{$a
5M?>|7%~qm{pXphwm3$}iyXjBYwtGqxp(f[!g^Ee9H.}1~0H-k-dzNDh1L]],
}
-- Local aliases for the standard-library functions used by the decoder and
-- by both converters (one declaration per alias).
local char = string.char
local byte = string.byte
local gmatch = string.gmatch
local floor = math.floor
local string_reverse = string.reverse
local table_insert = table.insert
local table_concat = table.concat
-- Decompresses compressed_mappings[codepage] into two lookup tables.
-- Returns: mapping (Unicode codepoint -> ANSI byte value) and
--          rev_mapping (ANSI byte value -> Unicode codepoint).
-- The packed string is consumed one non-whitespace character at a time.
-- Decoding ends with an assertion on the running checksum, so a damaged
-- string makes this function raise an error.
local function decompress_mapping()
-- width/offset/base: numeric state of the interval currently being decoded.
-- CS1/CS2: running checksum accumulators (seeded with 7^18 and 5^22).
-- get_next_char: iterator over the non-whitespace characters of the packed string.
local width, offset, base, CS1, CS2, get_next_char = 1.0, 0.0, 0.0, 7^18, 5^22, gmatch(compressed_mappings[codepage], "%S")
-- unicode and ansi both start at 0x7F and are advanced by decoded deltas;
-- prev_delta_unicode and prev_delta_ansi start as nil.
local mapping, rev_mapping, trees, unicode, ansi, prev_delta_unicode, prev_delta_ansi = {}, {}, {}, 0x7F, 0x7F
-- Decodes one value from the stream.
--   qty  : number of parts the current interval is divided into; when nil,
--          nothing is decoded and only the checksum is verified.
--   tree : optional flat binary tree (filled by build_subtree below) that
--          groups the qty parts into leaves of varying width.
-- Returns the 0-based index of the selected part when no tree is given,
-- otherwise the (negative) leaf value found in the tree.
local function decompress_selection(qty, tree)
-- Refill: while the interval is at most 94^7 wide, shift in one more
-- base-94 digit (character code minus 33).
while width <= 94^7 do
-- Multiple assignment: every right-hand side is evaluated with the old values of width and base.
width, offset, base = width * 94.0, offset * 94.0 + byte(get_next_char()) - 33.0, (base - floor((base + width - 1) / 94^7) * 94^7) * 94.0
end
if qty then
-- Split width into qty parts: the first big_qty parts are one unit wider
-- (big_unit) than the remaining ones (small_unit).
local big_qty = width % qty
local small_unit = (width - big_qty) / qty
local big_unit = small_unit + 1.0
local offset_small = big_qty * big_unit
local from, offset_from, left, right
-- Find the part that contains offset: from is its index, offset_from its
-- start, and width becomes its size.
if offset < offset_small then
width = big_unit
offset_from = offset - offset % big_unit
from = offset_from / big_unit
else
width = small_unit
offset_from = offset - (offset - offset_small) % small_unit
from = big_qty + (offset_from - offset_small) / small_unit
end
local len, leaf = 1.0, from
if tree then
-- Walk the flat tree from its root at index 4. tree[leaf] is the split point;
-- tree[leaf + 1] / tree[leaf + 2] are the children for from < split and
-- from >= split. A negative child value is a leaf and ends the walk.
leaf, left, right = 4, 0, qty
repeat
local middle = tree[leaf]
if from < middle then
right = middle
else
left, leaf = middle, leaf + 1
end
leaf = tree[leaf + 1]
until leaf < 0
-- The selection now spans the parts [left, right); recompute its start and width.
from, len = left, right - left
offset_from = left < big_qty and left * big_unit or offset_small + (left - big_qty) * small_unit
width = (right < big_qty and right * big_unit or offset_small + (right - big_qty) * small_unit) - offset_from
end
-- Make offset relative to the start of the selected span.
-- NOTE(review): base is updated here and in the refill loop but is not read
-- anywhere else in this function.
base, offset = base + offset_from, offset - offset_from
-- Fold this decision (qty, from, len) into both checksum accumulators.
CS1, CS2 = (CS1 % 93471801.0) * (CS2 % 93471811.0) + qty, (CS1 % 93471821.0) * (CS2 % 93471831.0) - from * 773.0 - len * 7789.0
return leaf
end
-- Called without qty: the remaining stream value must match the checksum.
assert((CS1 - CS2) % width == offset)
end
-- Returns the next signed delta decoded with trees[tree_idx], or nil when the
-- caller should reuse its previous delta. tree[3] counts the repeats that
-- are still pending.
local function get_delta(tree_idx)
local tree = trees[tree_idx]
local val = tree[3]
if val == 0.0 then
local leaf = decompress_selection(tree[1], tree)
local max_exp_cnt = tree[2]
-- The leaf value packs two fields: an exponent (leaf % max_exp_cnt) and a
-- selector that becomes -1, 0 or 1 after the "+ 2.0" below.
val = leaf % max_exp_cnt
leaf = (leaf - val) / max_exp_cnt + 2.0
-- val becomes a number in [2^exponent, 2^(exponent + 1)).
val = 2.0^val
val = val + decompress_selection(val)
if leaf ~= 0.0 then
-- Selector -1 or 1: val is the magnitude of a signed delta.
return leaf * val
end
-- Selector 0: val is a repeat count; this call consumes the first repeat.
end
tree[3] = val - 1.0
end
-- Read two trees from the stream: trees[1] is used for the Unicode deltas and
-- trees[2] for the ANSI deltas. tree[1] = total_freq, tree[2] = max_exp_cnt,
-- tree[3] = pending repeat counter; nodes start at index 4.
for tree_idx = 1, 2 do
local total_freq = decompress_selection(2^15)
local max_exp_cnt = decompress_selection(17)
local tree, qty_for_leaf_info = {total_freq, max_exp_cnt, 0.0}, 3 * max_exp_cnt
-- Reads the node covering (left, right) into tree[idx .. idx + 2], recursing
-- into child nodes, and returns the next free index. Each child slot holds
-- either the index of a child node or a negative leaf value.
local function build_subtree(left, right, idx)
local middle, subtree = left + 1
middle = decompress_selection(right - middle) + middle
tree[idx], idx = middle, idx + 3
for next_idx = idx - 2, idx - 1 do
if decompress_selection(2) == 1 then
subtree, idx = idx, build_subtree(left, middle, idx)
else
subtree = decompress_selection(qty_for_leaf_info) - qty_for_leaf_info
end
-- Store the child, then shift the range so the second pass handles (middle, right).
tree[next_idx], left, middle = subtree, middle, right
end
return idx
end
build_subtree(0, total_freq, 4)
trees[tree_idx] = tree
end
-- Decode (unicode, ansi) pairs as deltas from the previous pair.
while true do
local delta = get_delta(1)
if not delta then
delta = prev_delta_unicode
elseif delta == prev_delta_unicode then
-- An explicitly decoded delta equal to the previous one ends the data:
-- verify the checksum and return both tables.
decompress_selection()
return mapping, rev_mapping
end
-- All right-hand sides are evaluated first, so the trailing delta receives the ANSI delta.
unicode, prev_delta_unicode, delta = unicode + delta, delta, get_delta(2) or prev_delta_ansi
ansi, prev_delta_ansi = ansi + delta, delta
mapping[unicode] = ansi
rev_mapping[ansi] = unicode
end
end
-- Built once, when the file is loaded, for the codepage configured above.
local map_unicode_to_ansi, map_ansi_to_unicode = decompress_mapping()
-- Converts a UTF-8 string to the configured Windows ANSI codepage.
-- @param str  UTF-8 encoded string
-- @return     string holding one byte per decoded character
-- ASCII passes through unchanged; characters that have no entry in the
-- codepage mapping become "?".
-- Malformed input (an invalid lead byte such as a raw ANSI byte, a lead byte
-- with too few continuation bytes, or stray continuation bytes) also becomes
-- "?" instead of being silently folded into an unrelated ASCII character
-- (previously a lone "\233" came out as "i").
function utf8_to_win(str)
  local result_ansi = {}
  local unknown = byte"?"
  -- Each chunk is one byte plus every continuation byte (0x80-0xBF) that follows it.
  for u in gmatch(str, ".[\128-\191]*") do
    local lead, len = byte(u), #u
    -- Sequence length announced by the lead byte; stays nil for bytes that
    -- cannot start a sequence (0x80-0xBF and 0xF8-0xFF).
    local expected
    if lead < 128 then
      expected = 1
    elseif lead < 192 then
      expected = nil
    elseif lead < 224 then
      expected = 2
    elseif lead < 240 then
      expected = 3
    elseif lead < 248 then
      expected = 4
    end
    if expected and len >= expected then
      -- Keep the payload bits of the lead byte, then append 6 bits per continuation byte.
      local code = lead % 2^(8 - expected)
      for j = 2, expected do
        code = code * 64 + (byte(u, j) - 128)
      end
      table_insert(result_ansi, char(code < 128 and code or map_unicode_to_ansi[code] or unknown))
      if len > expected then
        -- Continuation bytes beyond the announced length belong to no character.
        table_insert(result_ansi, "?")
      end
    else
      -- Invalid lead byte or truncated sequence.
      table_insert(result_ansi, "?")
    end
  end
  return table_concat(result_ansi)
end
-- Converts a string in the configured Windows ANSI codepage to UTF-8.
-- @param str  string with one byte per character
-- @return     UTF-8 encoded string
-- Bytes below 128 are copied as-is; bytes that have no entry in the codepage
-- mapping become "?".
function win_to_utf8(str)
  local pieces = {}
  local fallback = byte("?")
  for index = 1, #str do
    local cp = byte(str, index)
    if cp >= 128 then
      cp = map_ansi_to_unicode[cp] or fallback
    end
    -- Peel off 6 bits at a time, lowest first, prepending each continuation
    -- byte so that the tail ends up in most-significant-first order.
    -- limit walks 127, 31, 15, 7: the largest value that still fits into the
    -- lead byte for 1-, 2-, 3- and 4-byte sequences.
    local limit, tail = 127, ""
    while cp > limit do
      tail = char(128 + cp % 64) .. tail
      cp = floor(cp / 64)
      limit = 288067 % limit
    end
    -- (127 - limit) * 2 yields the lead-byte prefix: 0, 192, 224 or 240.
    pieces[#pieces + 1] = char((127 - limit) * 2 + cp) .. tail
  end
  return table_concat(pieces)
end
end
@kordob29
Copy link

hello congrats for the good job!
I have to mention that for French some characters are converted incorrectly, e.g. é / è / à. These are really important for displaying French correctly.
Is there a way to rectify this?
Thank u again.

@Egor-Skriptunoff
Copy link
Author

hello congrats for the good job! I have to mention that for french some characters are wrongly converted ex: é / è /à /.. these are really important to display a correct french. Is there a way to rectify this? Thank u again.

Please show the example of wrong conversion:

  1. the value of variable codepage at line #11
  2. the function you are using (utf8_to_win or win_to_utf8)
  3. the sequence of bytes in input and output strings as displayed by print(str_win:byte(1,-1));print(str_utf8:byte(1,-1))
  4. the result of conversion you expect
  5. the ANSI codepage of your OS (open cmd.exe, type reg query HKLM\SYSTEM\CurrentControlSet\Control\Nls\CodePage /v ACP and look for the number after the word REG_SZ)

@kordob29
Copy link

Hello and thank you for the prompt reply:
1- Used codepage =1252
2- function: utf8_to_win(str)
3- Bytes sequence (I hope I got the question right)
'Météo' >> 77 195 131 194 169 116 195 131 194 169 111
4- _Here are few words with their expected results:
'Tiliachat' >> 'Téléachat'
'Frangais' >> 'Français'
'dhs' >> 'dès'
'mjme' >> 'même'
5- ANSI codepage of my OS: 1252
Best regards.

@Egor-Skriptunoff
Copy link
Author

For the string "Météo" it correctly replaces UTF8 "\195\131" with win1252 "\195"
For the string "Téléachat" it correctly replaces UTF8 "\195\169" with win1252 "\233"

local str_utf8 = "Téléachat"
assert(str_utf8 == "\84\195\169\108\195\169\97\99\104\97\116")
local str_win = utf8_to_win(str_utf8)
assert(str_win == "\84\233\108\233\97\99\104\97\116")

Probably, the problem is how the string is displayed on your side.
For example, to view correct win-1252 symbols in Windows command line (cmd.exe) you need to invoke chcp 1252 before running your program.
Please show a screenshot.

@kordob29
Copy link

Shot from my Grid where 'é' is shown as 'i' (the words I mentioned above are also extracted from the same grid).
I tried all languages and they perfectly work. Only french is making these troubles for me.
screen

@kordob29
Copy link

'Probably, the problem is how the string is displayed on your side.'
Yes Sir! I tried it on two other PCs and the module works perfectly. All characters are displayed correctly! I don't know what is going on with the one where I had these troubles.
I really do appreciate your help and the good advice.
Congrats again.
Thank you very much and good weekend Sir.

@movalex
Copy link

movalex commented Feb 26, 2023

So the tool would only work with the characters that correspond with the local machine's codepage, right?

@e-skri
Copy link

e-skri commented Aug 17, 2023

@movalex - Yes, characters must be from your 1-byte Windows ANSI codepage.

@decadence
Copy link

Thank you!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment