Skip to content

Instantly share code, notes, and snippets.

@nazartm
Created October 7, 2022 15:03
Show Gist options
  • Save nazartm/a57040cca921bb720eb583b88e8f3d83 to your computer and use it in GitHub Desktop.
Save nazartm/a57040cca921bb720eb583b88e8f3d83 to your computer and use it in GitHub Desktop.
Transliterates from the Cyrillic Turkmen alphabet to the Latin Turkmen alphabet
import re
import sys
# Context-sensitive rewrites that transliterate() applies BEFORE the
# per-letter tables. Each pattern is paired with the replacement at the same
# index in special_case_mappings:
#   * Ё/Ю/Я or a whitespace-preceded Е directly followed by a period becomes
#     'Ý.' (presumably an abbreviated initial -- confirm with the author);
#   * е/Е at the start of the text, after whitespace, or after an opening
#     quote becomes ýe/Ýe.
special_cases = [r'(Ё|Ю|Я)\.', r'(\s)Е\.', r'(\s)е', r'(\s)Е', r'(\s)?("|«|“)е', r'(\s)?("|«|“)Е', r'^Е', r'^е']
special_case_mappings = ['Ý.', r'\1Ý.', r'\1ýe', r'\1Ýe', r'\1\2ýe', r'\1\2Ýe', 'Ýe', 'ýe']

# Per-letter tables, consumed pairwise by index. Order matters: the digraph
# "ъе" is listed before "е" and "ъ" so that it is replaced as a unit first.
turkmen_cyrillic_lc = ("а", "б", "в", "г", "д", "ъе", "е", "ё", "ж", "җ", "з", "и", "й", "к", "л", "м", "н", "ң", "о", "ө", "п", "р", "с", "т", "у", "ү", "ф", "х", "ц", "ч", "ш", "щ", "ъ", "ы", "ь", "э", "ә", "ю", "я")
turkmen_latin_lc = ("a", "b", "w", "g", "d", "ýe", "e", "ýo", "ž", "j", "z", "i", "ý", "k", "l", "m", "n", "ň", "o", "ö", "p", "r", "s", "t", "u", "ü", "f", "h", "s", "ç", "ş", "ş", "", "y", "", "e", "ä", "ýu", "ýa")

# The digraph needle here must be the upper-case "ЪЕ": transliterate() runs
# the lower-case table first, which already consumes every "ъе", so a
# lower-case needle in this table could never match and upper-case "ЪЕ" would
# lose its "Ý" (Е -> "E", then Ъ -> "").
turkmen_cyrillic_uc = ("А", "Б", "В", "Г", "Д", "ЪЕ", "Е", "Ё", "Ж", "Җ", "З", "И", "Й", "К", "Л", "М", "Н", "Ң", "О", "Ө", "П", "Р", "С", "Т", "У", "Ү", "Ф", "Х", "Ц", "Ч", "Ш", "Щ", "Ъ", "Ы", "Ь", "Э", "Ә", "Ю", "Я")
turkmen_latin_uc = ("A", "B", "W", "G", "D", "ÝE", "E", "Ýo", "Ž", "J", "Z", "I", "Ý", "K", "L", "M", "N", "Ň", "O", "Ö", "P", "R", "S", "T", "U", "Ü", "F", "H", "S", "Ç", "Ş", "Ş", "", "Y", "", "E", "Ä", "Ýu", "Ýa")

# Clean-up passes that transliterate() runs on the already-Latin text:
#   * insert ý/Ý between one of the listed vowels and a following e/E;
#   * upper-case a whole "Ýa"/"Ýe"/"Ýu"/"Ýo" + capitals run, so a digraph
#     produced inside an all-caps word does not keep a lower-case vowel.
fix_patterns = (r'(a|i|ö|ä|e|y|ü)e', r'(A|I|Ö|Ä|E|Y|Ü)e', r'(A|I|Ö|Ä|E|Y|Ü)E', r'Ý(a|e|u|o)([A-ZÝŽŇÖÜÇŞÄ]+)')
fix_replacements = (r'\1ýe', r'\1ýe', r'\1ÝE', lambda m: m.group(0).upper())
def batch_regex(text, regex_needles, replacements):
    """Apply a series of regular-expression substitutions to ``text``.

    The patterns are applied one after another, each on the result of the
    previous substitution, so earlier replacements can influence what later
    patterns match.

    Args:
        text: The string to rewrite.
        regex_needles: Iterable of regex patterns, applied in order.
        replacements: Indexable collection aligned with ``regex_needles``;
            each entry is passed to ``re.sub`` as the replacement (a template
            string or a callable taking the match object).

    Returns:
        The rewritten string.

    Raises:
        IndexError: If ``replacements`` has fewer entries than
            ``regex_needles``.
    """
    for i, needle in enumerate(regex_needles):
        text = re.sub(needle, replacements[i], text)
    return text
def batch_replace(text, needles, replacements):
    """Apply a series of literal substring replacements to ``text``.

    The needles are replaced one after another, each on the result of the
    previous replacement, so the order of ``needles`` matters (e.g. a
    multi-character needle must come before its individual characters).

    Args:
        text: The string to rewrite.
        needles: Iterable of literal substrings to search for, in order.
        replacements: Indexable collection aligned with ``needles``; each
            entry replaces every occurrence of the needle at the same index.

    Returns:
        The rewritten string.

    Raises:
        IndexError: If ``replacements`` has fewer entries than ``needles``.
    """
    for i, needle in enumerate(needles):
        text = text.replace(needle, replacements[i])
    return text
"""
Transliterates from cyrillic turkmen alphabet to latin turkmen alphabet
"""
def transliterate(text):
    """Transliterate Turkmen text from the Cyrillic to the Latin alphabet.

    The conversion runs in four ordered passes:

    1. context-sensitive regex rewrites (``special_cases``) that must see the
       original Cyrillic letters;
    2. literal replacement of lower-case Cyrillic letters;
    3. literal replacement of upper-case Cyrillic letters;
    4. regex clean-up of the resulting Latin text (``fix_patterns``).

    Args:
        text: The Cyrillic input string.

    Returns:
        The transliterated string. Characters not covered by the tables are
        left unchanged.
    """
    text = batch_regex(text, special_cases, special_case_mappings)
    text = batch_replace(text, turkmen_cyrillic_lc, turkmen_latin_lc)
    text = batch_replace(text, turkmen_cyrillic_uc, turkmen_latin_uc)
    text = batch_regex(text, fix_patterns, fix_replacements)
    return text
if __name__ == '__main__':
    # Command-line entry point: transliterate the file named by the first
    # argument and write the result to standard output.
    if len(sys.argv) < 2:
        print("Usage: python3 transliterate.py <filename>")
        sys.exit(0)
    input_file = sys.argv[1]
    # The input is non-ASCII Cyrillic text; decode it explicitly as UTF-8
    # instead of relying on the platform's default encoding, which is not
    # UTF-8 on every system.
    with open(input_file, encoding="utf-8") as f:
        text = f.read()
    transliterated = transliterate(text)
    print(transliterated)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment