Skip to content

Instantly share code, notes, and snippets.

@lynn
Last active October 30, 2023 03:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lynn/2f0afeea41271169489171910259829b to your computer and use it in GitHub Desktop.
Save lynn/2f0afeea41271169489171910259829b to your computer and use it in GitHub Desktop.
import itertools
import re
import unicodedata
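#
# Key layout: each steno key is shown as the Toaq letter or tone it produces.
# Left-hand keys (S T K P W H R) combine into onsets; see the `onsets` table.
#
#   S T P H * a e m 3
#   S K W R * o u q 2 1
#       + O   E U
#
# Right hand: F R P B = a o e u, L G = final m q, T S Z = tones 3 2 1
# (T and S together = tone 4). "+" is the A key: it marks the syllable as a
# continuation, so the output is prefixed with the "{^}" glue.
#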
# Left-hand chord (keys in S T K P W H R order) -> Toaq onset consonant.
# The empty chord means the syllable has no onset. to_toaq() indexes this
# table with the onset keys it captured, so a chord that is not listed here
# surfaces as KeyError (i.e. "no translation").
onsets = {
"": "",
"PW": "b",
"KR": "c",
"KH": "ch",
"TK": "d",
"TP": "f",
"TKPW": "g",
"H": "h",
"SKWR": "j",
"K": "k",
"HR": "l",
"PH": "m",
"TPH": "n",
"KWR": "nh",
"P": "p",
"R": "r",
"S": "s",
"SH": "sh",
"T": "t",
"W": "ꝡ",
"KW": "z",
}
# Vowel chord -> nucleus vowel. The key is what remains of the A/O/E/U group
# after to_toaq() has removed "A", "-" and "*". With no vowel key pressed the
# nucleus is "a", except that to_toaq() skips this table entirely when a tail
# chord is present and no vowel key was pressed. Chords not listed here
# (e.g. "OE") raise KeyError.
vowels = {
"": "a",
"E": "e",
"EU": "ı",
"O": "o",
"U": "u",
}
# Right-hand F/R/P/B chord -> vowel letter(s) that to_toaq() appends after
# the nucleus. The empty chord appends nothing; chords not listed here raise
# KeyError when looked up.
tails = {
"": "",
"F": "a",
"R": "o",
"P": "e",
"B": "u",
"FR": "ao",
"PB": "ı",
"FP": "eı",
"RB": "oı",
"FRPB": "aı",
}
# Whitelist of acceptable rimes (vowel sequence plus optional final "q"/"m"),
# stored as a list of whitespace-separated tokens. to_toaq() strips the
# leading consonants from a candidate syllable and raises KeyError unless the
# remainder appears in this list.
valid_rimes = """
a u ı o e aq uq ıq oq eq am um ım om em
aı ao oı eı
ua ıa oa ea uaq ıaq oaq uam ıam oam eam
ıu eu ıuq euq ıum eum uı uıq uım
uo ıo eo uoq ıoq eoq uom ıom eom
ue ıe oe ueq ıeq oeq uem ıem oem
uaı uao uoı ueı
ıaı ıao ıoı ıeı
oaı oao oeı
eaı eao eoı
""".split()
def in_tone(s, num):
    """Return *s* with its first vowel carrying tone number *num*.

    Any grave, acute, diaeresis or circumflex mark already present anywhere
    in *s* is removed first, so the function can be used to re-tone a word.

    Args:
        s: The text to tone.
        num: Index into the tone marks: 0 = no mark, 1 = grave (U+0300),
            2 = acute (U+0301), 3 = diaeresis (U+0308),
            4 = circumflex (U+0302).

    Returns:
        The NFKC-normalized result. If *s* has no vowel, only the old tone
        marks are removed.

    Raises:
        IndexError: If *num* is greater than 4.
    """
    mark = ["", "\u0300", "\u0301", "\u0308", "\u0302"][num]

    # Decompose so that tone marks become separate combining characters,
    # then drop whichever ones are already there.
    bare = re.sub(
        "[\u0300\u0301\u0308\u0302]", "", unicodedata.normalize("NFKD", s)
    )

    def retone(m):
        vowel = m[0]
        # A lowercase i is dotted under tones 2-4 and dotless otherwise.
        # Handle both directions: stripping the mark from an already toned
        # "í" leaves a dotted "i", which must go back to "ı" for tones 0/1.
        if vowel in ("i", "ı"):
            vowel = "i" if num > 1 else "ı"
        return vowel + mark

    return unicodedata.normalize(
        "NFKC",
        re.sub("[aeiıou]", retone, bare, count=1, flags=re.I),
    )
def underdot(s):
    """Return *s* with a dot below (U+0323) on its first vowel.

    Dots below that are already present anywhere in *s* are removed first.
    The new dot is placed after the first vowel and after the tone mark
    directly following it, if there is one. The result is NFKC-normalized.
    """
    dot_below = "\u0323"

    # Work on the decomposed form so marks are separate code points.
    decomposed = unicodedata.normalize("NFKD", s)
    without_dots = decomposed.replace(dot_below, "")

    def add_dot(hit):
        return hit[0] + dot_below

    dotted = re.sub(
        "[aeiıou][\u0300\u0301\u0308\u0302]?",
        add_dot,
        without_dots,
        count=1,
        flags=re.I,
    )
    return unicodedata.normalize("NFKC", dotted)
def to_toaq(stroke):
    """Translate a single steno stroke into one Toaq syllable.

    The stroke is split into five key groups, in steno order:

    * onset  -- left-hand keys ``S T K P W H R``, looked up in ``onsets``;
    * vowel  -- ``A O E U`` (plus the ``-``/``*`` separators); ``A`` marks a
      continuation syllable, the rest is looked up in ``vowels``;
    * tail   -- ``F R P B``, looked up in ``tails``;
    * extra  -- ``L`` (final "m") and ``G`` (final "q");
    * tone   -- ``T``, ``S``, ``TS`` or ``Z``.

    Args:
        stroke: The stroke as a string of key letters, e.g. ``"KAEUT"``.

    Returns:
        The syllable text. Continuation syllables (``A`` pressed) are
        prefixed with ``"{^}"``, followed by an apostrophe when the syllable
        has no onset.

    Raises:
        KeyError: If the stroke is not a valid Toaq syllable: it contains
            ``*``, is one of the reserved strokes, contains keys outside the
            pattern, uses an unknown chord, spells a rime not in
            ``valid_rimes``, or has an unsupported tone group.
    """
    if "*" in stroke or stroke in ("", "-S", "-SZ", "-Z"):
        raise KeyError

    # Every group in the pattern is optional, so re.search() would always
    # succeed (with an empty match at position 0 if need be) and quietly
    # ignore foreign keys such as "#". Requiring a full match rejects them.
    m = re.fullmatch(
        "(S?T?K?P?W?H?R?)(A?O?[-*]?E?U?)(F?R?P?B?)(L?G?)(T?S?D?Z?)", stroke
    )
    if m is None:
        raise KeyError
    onset, vowel, tail, extra, tone = m.groups()

    cont = "A" in vowel
    vowel = re.sub("[-*A]", "", vowel)

    if onset == "" and vowel == "" and tail == "F":
        raku = "a"
    elif vowel == "" and (len(tail) == 1 or tail == "PB"):
        # Something like "Ke" instead of "KE".
        raise KeyError
    else:
        # But we do want to write CF syllables as "Kao, Kae, Kaoeu" etc:
        v = "" if vowel == "" and tail else vowels[vowel]
        raku = onsets[onset] + v + tails[tail]

    # The extra group can only be "", "L", "G" or "LG"; G wins over L.
    if "G" in extra:
        raku += "q"
    elif "L" in extra:
        raku += "m"

    if raku.lstrip("bcdfghjklmnprstꝡz'") not in valid_rimes:
        raise KeyError

    # Any other non-empty tone group (e.g. one containing D) is invalid.
    tone_numbers = {"TS": 4, "T": 3, "S": 2, "Z": 1}
    if tone:
        if tone not in tone_numbers:
            raise KeyError
        raku = in_tone(raku, tone_numbers[tone])

    glue = "{^}" if cont else ""
    oaomo = "'" if cont and onset == "" else ""
    return glue + oaomo + raku
# Maximum number of strokes in a key. Nothing in this file reads it;
# presumably it is consumed by the host that calls lookup() (e.g. Plover's
# Python-dictionary plugin) -- TODO confirm.
LONGEST_KEY = 4
def lookup(key):
    """Translate a sequence of strokes into one Toaq word.

    The first stroke is translated with to_toaq(). Each following stroke
    either modifies the word built so far or appends another syllable:

    * ``-S`` / ``-T`` / ``-TS`` re-tone the word with tone 2 / 3 / 4;
    * ``-B`` puts a dot below the first vowel and appends ``"{^}"``;
    * any stroke containing ``A`` is translated with to_toaq() and appended.

    Args:
        key: A sequence (e.g. tuple) of stroke strings.

    Returns:
        The translated text.

    Raises:
        KeyError: If *key* is empty, a follow-up stroke is not one of the
            forms above, or to_toaq() rejects a stroke.
    """
    if not key:
        # Report "no translation" the same way as everywhere else in this
        # module instead of failing with IndexError on key[0].
        raise KeyError

    tone_strokes = {"-S": 2, "-T": 3, "-TS": 4}

    first, *rest = key
    word = to_toaq(first)
    for k in rest:
        if k in tone_strokes:
            word = in_tone(word, tone_strokes[k])
        elif k == "-B":
            word = underdot(word) + "{^}"
        elif "A" in k:
            word += to_toaq(k)
        else:
            raise KeyError
    return word
def subsequences(s):
    """Yield every subsequence of *s* as a string, shortest first.

    Elements keep their original relative order. Subsequences of equal
    length are produced in the order itertools.combinations() emits them;
    the first value yielded is always the empty string.
    """
    sizes = range(len(s) + 1)
    picks = itertools.chain.from_iterable(
        itertools.combinations(s, size) for size in sizes
    )
    yield from map("".join, picks)
if __name__ == "__main__":
    # Script mode: enumerate every in-order stroke, keep the ones to_toaq()
    # accepts, and write the stroke -> text mapping to toaq.json.
    import json  # only needed when run as a script

    d = {}
    for left in subsequences("STKPWHR"):
        for v in subsequences("AO*EU"):
            # With no vowel/star key, "-" separates the two hands.
            v = v or "-"
            for right in subsequences("FRPBLGTSZ"):
                # A trailing "-" (no right-hand keys) is dropped.
                stroke = (left + v + right).rstrip("-")
                if stroke in d:
                    print("Duplicate", stroke)
                try:
                    d[stroke] = to_toaq(stroke)
                except KeyError:
                    # Not a valid syllable: leave the stroke undefined.
                    pass

    # The translations contain non-ASCII letters and ensure_ascii=False
    # writes them verbatim, so the encoding must be pinned to UTF-8 rather
    # than left to the platform default.
    with open("toaq.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(d, ensure_ascii=False, indent=0))
    print("Done!", len(d))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment