Skip to content

Instantly share code, notes, and snippets.

Created May 22, 2021 22:44
Show Gist options
  • Save Egor-Skriptunoff/44a88f64f9a497919db4ad8c28259a8f to your computer and use it in GitHub Desktop.
Save Egor-Skriptunoff/44a88f64f9a497919db4ad8c28259a8f to your computer and use it in GitHub Desktop.
String converter between Windows ANSI and UTF-8 encodings
-- Converter between win-125x and UTF-8 strings
-- Written in pure Lua, compatible with Lua 5.1-5.4
-- Usage example:
-- require("win-125x")
-- str_win = utf8_to_win(str_utf8)
-- str_utf8 = win_to_utf8(str_win)
-- Codepage whose mapping is decoded when this file is loaded. The value is used
-- as the key into compressed_mappings, so it has to be one of the numbers listed below.
local codepage = 1251 -- Set your codepage here
-- The following codepages are supported:
-- 874 Thai
-- 1250 Central European
-- 1251 Cyrillic
-- 1252 Western
-- 1253 Greek
-- 1254 Turkish
-- 1255 Hebrew
-- 1256 Arabic
-- 1257 Baltic
-- 1258 Vietnamese
-- Encoded mapping data, one entry per supported codepage, keyed by codepage number.
-- decompress_mapping() reads the entry for `codepage` as a string and consumes its
-- non-whitespace characters one at a time.
-- NOTE(review): in this copy every entry below has a key and a trailing comment but
-- no value, and the table constructor is never closed. The encoded payload strings
-- appear to have been lost; restore them from the original before using this file.
local compressed_mappings = {
-- Unicode to win-125x mappings, compressed and protected by a checksum
-- (the reference to where the mappings were taken from is missing in this copy)
[874] = -- Thai, 97 codepoints above U+007F
[1250] = -- Central European, 123 codepoints above U+007F
[1251] = -- Cyrillic, 127 codepoints above U+007F
[1252] = -- Western, 123 codepoints above U+007F
[1253] = -- Greek, 111 codepoints above U+007F
[1254] = -- Turkish, 121 codepoints above U+007F
[1255] = -- Hebrew, 105 codepoints above U+007F
[1256] = -- Arabic, 128 codepoints above U+007F
[1257] = -- Baltic, 116 codepoints above U+007F
[1258] = -- Vietnamese, 119 codepoints above U+007F
local char, byte, gmatch, floor, string_reverse = string.char, string.byte, string.gmatch, math.floor, string.reverse
local table_insert, table_concat = table.insert, table.concat
-- Decode the compressed mapping string of the selected codepage into two lookup tables.
-- Reads compressed_mappings[codepage]; returns `mapping` (Unicode code point -> ANSI byte)
-- and `rev_mapping` (ANSI byte -> Unicode code point).
-- NOTE(review): this copy has lost its indentation and, it seems, every line that held only
-- a block keyword (there is an `until` with no visible `repeat`, and no `end`/`else` lines).
-- The nesting described in the comments below is therefore inferred - confirm against the original.
local function decompress_mapping()
-- Decoder state: width/offset/base describe the current interval, CS1/CS2 are running
-- checksum accumulators, get_next_char iterates over the non-whitespace characters of the data.
local width, offset, base, CS1, CS2, get_next_char = 1.0, 0.0, 0.0, 7^18, 5^22, gmatch(compressed_mappings[codepage], "%S")
-- Both counters start at 0x7F; prev_delta_unicode and prev_delta_ansi start as nil.
local mapping, rev_mapping, trees, unicode, ansi, prev_delta_unicode, prev_delta_ansi = {}, {}, {}, 0x7F, 0x7F
-- Decode one selection out of `qty` alternatives from the stream. When `tree` is given,
-- the selection is resolved through that tree and the value read from the tree is returned.
local function decompress_selection(qty, tree)
-- Refill: while the interval is no wider than 94^7, take the next character as a
-- base-94 digit (character code minus 33) and scale width, offset and base by 94.
while width <= 94^7 do
width, offset, base = width * 94.0, offset * 94.0 + byte(get_next_char()) - 33.0, (base - floor((base + width - 1) / 94^7) * 94^7) * 94.0
if qty then
-- Split the interval into qty slots: the first big_qty slots are one unit wider
-- (big_unit) than the remaining ones (small_unit).
local big_qty = width % qty
local small_unit = (width - big_qty) / qty
local big_unit = small_unit + 1.0
local offset_small = big_qty * big_unit
local from, offset_from, left, right
-- Find the slot `from` that contains offset: among the wide slots when
-- offset < offset_small, otherwise (presumably an `else` branch) among the narrow ones.
if offset < offset_small then
width = big_unit
offset_from = offset - offset % big_unit
from = offset_from / big_unit
width = small_unit
offset_from = offset - (offset - offset_small) % small_unit
from = big_qty + (offset_from - offset_small) / small_unit
local len, leaf = 1.0, from
if tree then
-- Walk the tree from index 4: tree[leaf] is a split point used to narrow
-- [left, right) around `from`; the walk stops once the next index read from the
-- tree is negative, and that negative value is what ends up in `leaf`.
leaf, left, right = 4, 0, qty
local middle = tree[leaf]
if from < middle then
right = middle
left, leaf = middle, leaf + 1
leaf = tree[leaf + 1]
until leaf < 0
-- The selection now covers slots left..right-1; recompute its start and width.
from, len = left, right - left
offset_from = left < big_qty and left * big_unit or offset_small + (left - big_qty) * small_unit
width = (right < big_qty and right * big_unit or offset_small + (right - big_qty) * small_unit) - offset_from
-- Consume the chosen sub-interval and fold qty, from and len into both checksums.
base, offset = base + offset_from, offset - offset_from
CS1, CS2 = (CS1 % 93471801.0) * (CS2 % 93471811.0) + qty, (CS1 % 93471821.0) * (CS2 % 93471831.0) - from * 773.0 - len * 7789.0
return leaf
-- Checksum verification against the remaining stream value.
-- NOTE(review): presumably reached only when qty is nil - confirm against the original.
assert((CS1 - CS2) % width == offset)
-- Decode the next delta using trees[tree_idx]. The caller treats a nil result as
-- "reuse the previous delta".
local function get_delta(tree_idx)
local tree = trees[tree_idx]
-- tree[3] starts at 0.0 and is rewritten at the end of this function.
-- NOTE(review): looks like a countdown of pending repeats - confirm.
local val = tree[3]
if val == 0.0 then
local leaf = decompress_selection(tree[1], tree)
local max_exp_cnt = tree[2]
-- Split the leaf value into an exponent (leaf % max_exp_cnt) and a multiplier
-- (the quotient plus 2).
val = leaf % max_exp_cnt
leaf = (leaf - val) / max_exp_cnt + 2.0
-- Magnitude is 2^exponent plus a further selection decoded from 2^exponent alternatives.
val = 2.0^val
val = val + decompress_selection(val)
-- A non-zero multiplier yields an explicit delta.
if leaf ~= 0.0 then
return leaf * val
tree[3] = val - 1.0
-- Build the two trees used by get_delta: index 1 for Unicode deltas, index 2 for ANSI deltas.
for tree_idx = 1, 2 do
local total_freq = decompress_selection(2^15)
local max_exp_cnt = decompress_selection(17)
-- Tree layout: [1] = total_freq, [2] = max_exp_cnt, [3] = 0.0, nodes from index 4 on.
local tree, qty_for_leaf_info = {total_freq, max_exp_cnt, 0.0}, 3 * max_exp_cnt
-- Recursively decode the node covering [left, right) into tree[idx .. idx+2]:
-- the split point first, then two slots that each hold either the index of a
-- subtree or a negative leaf value. Returns the next free index.
local function build_subtree(left, right, idx)
local middle, subtree = left + 1
middle = decompress_selection(right - middle) + middle
tree[idx], idx = middle, idx + 3
for next_idx = idx - 2, idx - 1 do
-- One decoded flag selects between a nested subtree and a leaf.
if decompress_selection(2) == 1 then
subtree, idx = idx, build_subtree(left, middle, idx)
-- Leaf values are stored shifted down by qty_for_leaf_info, which makes them negative.
subtree = decompress_selection(qty_for_leaf_info) - qty_for_leaf_info
-- After the first slot, switch the range to the right half [middle, right).
tree[next_idx], left, middle = subtree, middle, right
return idx
build_subtree(0, total_freq, 4)
trees[tree_idx] = tree
-- Main loop: each iteration decodes one (unicode, ansi) pair as deltas from the previous pair.
while true do
local delta = get_delta(1)
if not delta then
delta = prev_delta_unicode
-- An explicitly decoded delta equal to the previous one marks the end of the data.
elseif delta == prev_delta_unicode then
return mapping, rev_mapping
unicode, prev_delta_unicode, delta = unicode + delta, delta, get_delta(2) or prev_delta_ansi
ansi, prev_delta_ansi = ansi + delta, delta
mapping[unicode] = ansi
rev_mapping[ansi] = unicode
-- Decode the mapping for the selected codepage once, when the file is loaded.
-- map_unicode_to_ansi: Unicode code point -> ANSI byte; map_ansi_to_unicode: ANSI byte -> Unicode code point.
local map_unicode_to_ansi, map_ansi_to_unicode = decompress_mapping()
--- Convert a UTF-8 string to the configured Windows ANSI codepage.
-- ASCII characters are copied unchanged. Any other code point is looked up in
-- map_unicode_to_ansi; code points that have no equivalent in the codepage, and
-- malformed UTF-8 sequences, are replaced by "?".
-- @param str UTF-8 encoded string
-- @return string with one ANSI byte per input character
function utf8_to_win(str)
   local result_ansi = {}
   local unknown = byte("?")
   -- Each match is one byte plus every continuation byte (0x80-0xBF) that follows it.
   for u in gmatch(str, ".[\128-\191]*") do
      local len, lead = #u, byte(u)
      local ansi = unknown
      -- The lead byte has to announce exactly `len` bytes. Without this check a stray
      -- high byte (for example text that is already in the ANSI codepage) would be
      -- reduced modulo 128 and silently come out as an unrelated ASCII character.
      local well_formed =
         (len == 1 and lead < 128) or
         (len == 2 and lead >= 192 and lead < 224) or
         (len == 3 and lead >= 224 and lead < 240) or
         (len == 4 and lead >= 240 and lead < 248)
      if well_formed then
         -- Strip the length marker bits from the lead byte.
         local code = lead % 2^(8 - len)
         for j = 2, len do
            -- A continuation byte is 128 + payload and 128 == 2 * 64, so subtracting 2
            -- before the shift cancels the marker bits of the byte being added.
            code = (code - 2) * 64 + byte(u, j)
         end
         if code < 128 then
            ansi = code
         else
            ansi = map_unicode_to_ansi[code] or unknown
         end
      end
      table_insert(result_ansi, char(ansi))
   end
   return table_concat(result_ansi)
end
--- Convert a string in the configured Windows ANSI codepage to UTF-8.
-- Bytes below 128 are copied unchanged. Any other byte is looked up in
-- map_ansi_to_unicode; bytes with no assigned code point are replaced by "?".
-- @param str string in the ANSI codepage
-- @return UTF-8 encoded string
function win_to_utf8(str)
   local result_utf8 = {}
   -- The input is walked backwards and the bytes of every character are emitted last
   -- byte first, so a single reverse of the joined result restores the proper order.
   for pos = #str, 1, -1 do
      -- h is the largest value that still fits into the lead byte of the current length.
      local code, h = byte(str, pos), 127
      code = code < 128 and code or map_ansi_to_unicode[code] or byte"?"
      while code > h do
         -- Emit the low 6 bits as a continuation byte (10xxxxxx).
         table_insert(result_utf8, char(128 + code%64))
         -- 288067 % 127 == 31, 288067 % 31 == 15, 288067 % 15 == 7: the payload limits
         -- of the lead byte for 2-, 3- and 4-byte sequences.
         code, h = floor(code/64), 288067%h
      end
      -- (127 - h) * 2 is the lead-byte prefix: 0, 192, 224 or 240 for h = 127, 31, 15, 7.
      table_insert(result_utf8, char((127-h)*2+code))
   end
   return string_reverse(table_concat(result_utf8))
end
Copy link

For the string "Météo" it correctly replaces UTF8 "\195\131" with win1252 "\195"
For the string "Téléachat" it correctly replaces UTF8 "\195\169" with win1252 "\233"

local str_utf8 = "Téléachat"
assert(str_utf8 == "\84\195\169\108\195\169\97\99\104\97\116")
local str_win = utf8_to_win(str_utf8)
assert(str_win == "\84\233\108\233\97\99\104\97\116")

Probably, the problem is how the string is displayed on your side.
For example, to view correct win-1252 symbols in Windows command line (cmd.exe) you need to invoke chcp 1252 before running your program.
Please show a screenshot.

Copy link

Shot from my Grid where 'é' is shown as 'i' (the words I mentioned above are also extracted from the same grid).
I tried all languages and they work perfectly. Only French is causing this trouble for me.

Copy link

'Probably, the problem is how the string is displayed on your side.'
Yes, Sir! I tried it on two other PCs and the module works perfectly. All characters are displayed correctly! I don't know what is going on with the one where I had this trouble.
I really do appreciate your help and the good advice.
Congrats again.
Thank you very much and good weekend Sir.

Copy link

movalex commented Feb 26, 2023

So the tool would only work with the characters that correspond with the local machine's codepage, right?

Copy link

e-skri commented Aug 17, 2023

@movalex - Yes, characters must be from your 1-byte Windows ANSI codepage.

Copy link

Thank you!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment