Skip to content

Instantly share code, notes, and snippets.

@bunyk
Created June 19, 2019 09:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bunyk/688a457acfc24f682d8bc2ef1a00d693 to your computer and use it in GitHub Desktop.
Save bunyk/688a457acfc24f682d8bc2ef1a00d693 to your computer and use it in GitHub Desktop.
# Maps every supported character to a pair (ASCII base letter, diacritic code).
# Code 0 is the plain letter and code 1 its accented form.  Z/z are the only
# letters with two accented forms, which use codes 2 and 3 instead of 1.
trans_table = {
    # A
    'A': ('A', 0),
    'a': ('a', 0),
    'Ą': ('A', 1),
    'ą': ('a', 1),
    # C
    'C': ('C', 0),
    'c': ('c', 0),
    'Ć': ('C', 1),
    'ć': ('c', 1),
    # E
    'E': ('E', 0),
    'e': ('e', 0),
    'Ę': ('E', 1),
    'ę': ('e', 1),
    # L
    'L': ('L', 0),
    'l': ('l', 0),
    'Ł': ('L', 1),
    'ł': ('l', 1),
    # N
    'N': ('N', 0),
    'n': ('n', 0),
    'Ń': ('N', 1),
    'ń': ('n', 1),
    # O
    'O': ('O', 0),
    'o': ('o', 0),
    'Ó': ('O', 1),
    'ó': ('o', 1),
    # S
    'S': ('S', 0),
    's': ('s', 0),
    'Ś': ('S', 1),
    'ś': ('s', 1),
    # Z (two accented forms)
    'Z': ('Z', 0),
    'z': ('z', 0),
    'Ź': ('Z', 2),
    'ź': ('z', 2),
    'Ż': ('Z', 3),
    'ż': ('z', 3),
}

# ASCII letters whose accented forms use the two-bit codes 2 and 3.
ternary_chars = {'Z', 'z'}
def pol2ascii(text):
    """Encode text as ASCII base letters plus a hex-packed diacritic suffix.

    Every character found in ``trans_table`` is replaced by its ASCII base
    letter and contributes its diacritic code to a bit stream; every other
    character is copied through unchanged.  The bit stream is appended as a
    hexadecimal number after a final ``'_'``.

    Codes 0 and 1 take one bit.  Codes 2 and 3 take two bits: a ``1``
    followed by the low bit of the code.  The first letter of ``text`` ends
    up in the least significant bits of the number.

    Args:
        text: The string to encode.

    Returns:
        A string of the form ``'<ascii text>_<hex digits>'``.  When ``text``
        contains no character from ``trans_table`` (including the empty
        string) the suffix is ``'0'``.
    """
    plain = []
    diacritics = []
    for c in text:
        entry = trans_table.get(c)
        if entry is None:
            # Not a tracked letter: keep it and emit no bits.
            plain.append(c)
            continue
        ascii_char, diacritic = entry
        plain.append(ascii_char)
        # Store the code's binary digits back to front.  The chunks are
        # joined in reverse order below, so a two-bit code ends up with its
        # high bit in the lower position, which ascii2pol reads first.
        diacritics.append(bin(diacritic)[2:][::-1])
    bits = ''.join(reversed(diacritics))
    # int('', 2) raises ValueError, so text without any tracked letter must
    # be packed as 0 explicitly.
    packed = int(bits, 2) if bits else 0
    return ''.join(plain) + '_' + hex(packed)[2:]
# Inverse of trans_table: maps an (ASCII letter, diacritic code) pair back to
# the character that produced it.
reverse_trans_table = {
    ascii_and_code: source_char
    for source_char, ascii_and_code in trans_table.items()
}
def ascii2pol(text):
    """Decode a string produced by ``pol2ascii`` back into the original text.

    The input is split at its last ``'_'`` into the ASCII text and a
    hexadecimal number holding the diacritic bits.  Bits are consumed from
    the least significant end, one per character found in ``trans_table``;
    for characters in ``ternary_chars`` a first bit of 1 means a second bit
    follows, giving code 2 or 3.

    Args:
        text: A string of the form ``'<ascii text>_<hex digits>'``.

    Returns:
        The text with each tracked letter replaced by the character that
        ``reverse_trans_table`` holds for its (letter, code) pair; a letter
        is left unchanged when no such pair exists.

    Raises:
        ValueError: If ``text`` has no ``'_'`` or the part after the last
            ``'_'`` is not a valid base-16 integer.
    """
    ascii_part, packed_hex = text.rsplit('_', 1)
    remaining_bits = int(packed_hex, base=16)
    restored = []
    for letter in ascii_part:
        if letter not in trans_table:
            # Untracked characters carry no bits and pass straight through.
            restored.append(letter)
            continue
        # Pop the lowest bit of the stream.
        remaining_bits, code = divmod(remaining_bits, 2)
        if code == 1 and letter in ternary_chars:
            # Two-bit code: the bit just read is the high bit, the next one
            # is the low bit.
            remaining_bits, low_bit = divmod(remaining_bits, 2)
            code = code * 2 + low_bit
        restored.append(reverse_trans_table.get((letter, code), letter))
    return ''.join(restored)
# Sample lines for the round-trip self-check below.
TESTS = '''
Świętosław Milczący
Dzierżykraj Łaźniński
Józef Soćko
jedną rzucających się zachodzących języka męskorzeczowego męskożywotny. słów, które w polszczyźnie standardowej były
'''

# Encode and decode every sample line, print the result together with the
# relative length overhead of the encoded form, and fail if the decoded text
# differs from the input.
for sample in TESTS.strip().splitlines():
    encoded = pol2ascii(sample)
    decoded = ascii2pol(encoded)
    overhead = (len(encoded) / len(decoded) - 1) * 100
    print(decoded, encoded, f'+{overhead:.1f}%')
    print(sample)
    assert decoded == sample
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment