-
-
Save lynn/2f0afeea41271169489171910259829b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools | |
import re | |
import unicodedata | |
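# Steno keyboard layout relabelled for Toaq. Right-hand keys are shown by what
# they produce; "+" marks the A key and the digits are tone numbers.
#
#  S T P H * a e m 3
#  S K W R * o u q 2 1
#      + O E U
#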
# Left-hand chord -> spelling of the syllable's initial consonant.
# The empty chord stands for a syllable written without an onset.
onsets = {
    "": "",
    # Plain consonants
    "PW": "b",
    "KR": "c",
    "KH": "ch",
    "TK": "d",
    "TP": "f",
    "TKPW": "g",
    "H": "h",
    "SKWR": "j",
    "K": "k",
    "HR": "l",
    "PH": "m",
    "TPH": "n",
    "KWR": "nh",
    "P": "p",
    "R": "r",
    "S": "s",
    "SH": "sh",
    "T": "t",
    "W": "ꝡ",
    "KW": "z",
}
# Thumb-key chord (with the A key already removed) -> main vowel.
# Pressing no vowel key at all yields the default vowel "a".
vowels = {
    "": "a",
    "E": "e",
    "EU": "ı",
    "O": "o",
    "U": "u",
}
# Right-hand F/R/P/B chord -> vowel letters appended after the main vowel.
tails = {
    "": "",
    # Single keys: one extra vowel.
    "F": "a",
    "R": "o",
    "P": "e",
    "B": "u",
    # Key combinations.
    "FR": "ao",
    "PB": "ı",
    "FP": "eı",
    "RB": "oı",
    "FRPB": "aı",
}
# Every rime (vowel sequence plus optional final q/m) that to_toaq() accepts.
# Kept as a list, one source row per group of related rimes.
valid_rimes = [
    rime
    for row in (
        "a u ı o e aq uq ıq oq eq am um ım om em",
        "aı ao oı eı",
        "ua ıa oa ea uaq ıaq oaq uam ıam oam eam",
        "ıu eu ıuq euq ıum eum uı uıq uım",
        "uo ıo eo uoq ıoq eoq uom ıom eom",
        "ue ıe oe ueq ıeq oeq uem ıem oem",
        "uaı uao uoı ueı",
        "ıaı ıao ıoı ıeı",
        "oaı oao oeı",
        "eaı eao eoı",
    )
    for rime in row.split()
]
def in_tone(s, num):
    """Return ``s`` with its tone mark replaced by the mark for tone ``num``.

    Any existing grave, acute, diaeresis or circumflex is stripped from the
    whole string, then the mark selected by ``num`` (0 = none, 1 = grave,
    2 = acute, 3 = diaeresis, 4 = circumflex) is attached to the first vowel.
    A dotless ``ı`` in that position becomes a dotted ``i`` when ``num > 1``
    (presumably so the accent can compose into a single character).

    The result is NFKC-normalized. A string without a vowel comes back
    unmarked.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    unmarked = re.sub("[\u0300\u0301\u0308\u0302]", "", decomposed)
    mark = ["", "\u0300", "\u0301", "\u0308", "\u0302"][num]
    if num > 1:
        dotless_becomes = "i"
    else:
        dotless_becomes = "ı"

    def add_mark(match):
        # Only the first vowel is rewritten (count=1 below).
        return match.group(0).replace("ı", dotless_becomes) + mark

    marked = re.sub("[aeiıou]", add_mark, unmarked, count=1, flags=re.I)
    return unicodedata.normalize("NFKC", marked)
def underdot(s):
    """Return ``s`` with a combining dot below (U+0323) on its first vowel.

    Existing dots below are removed first, so the call is idempotent. The dot
    is inserted after the first vowel and its tone mark, if it has one. The
    result is NFKC-normalized; a string without a vowel is returned without
    a dot.
    """
    dot_below = "\u0323"
    decomposed = unicodedata.normalize("NFKD", s)
    undotted = re.sub("\u0323", "", decomposed)

    def add_dot(match):
        return match.group(0) + dot_below

    dotted = re.sub(
        "[aeiıou][\u0300\u0301\u0308\u0302]?",
        add_dot,
        undotted,
        count=1,
        flags=re.I,
    )
    return unicodedata.normalize("NFKC", dotted)
def to_toaq(stroke):
    """Translate a single steno stroke into a Toaq syllable.

    The stroke is split into five key groups: onset (left hand), vowel
    (thumbs, optionally with A), tail (F/R/P/B), extra (L/G) and tone
    (T/S/D/Z). Onset, vowel and tail are looked up in the ``onsets``,
    ``vowels`` and ``tails`` tables; G appends "q", L appends "m"; the tone
    keys Z, S, T and TS select tones 1-4 via ``in_tone``.

    If the A key is pressed the result is prefixed with ``{^}`` (the
    attach operator in Plover's dictionary syntax), and additionally with an
    apostrophe when the syllable has no onset.

    Args:
        stroke: The stroke in steno notation, e.g. ``"KEU"`` or ``"K-FR"``.

    Returns:
        The translated syllable as a string.

    Raises:
        KeyError: If the stroke is not a well-formed chord of the supported
            keys, or does not spell a rime listed in ``valid_rimes``.
    """
    # Starred strokes and the bare tone strokes are never syllables.
    if "*" in stroke or stroke in ("", "-S", "-SZ", "-Z"):
        raise KeyError

    # The whole stroke must be consumed. Every group is optional, so a
    # prefix search would succeed on anything (e.g. "#" would match as the
    # empty stroke and come out as "a").
    m = re.fullmatch(
        "(S?T?K?P?W?H?R?)(A?O?[-*]?E?U?)(F?R?P?B?)(L?G?)(T?S?D?Z?)", stroke
    )
    if m is None:
        raise KeyError
    onset, vowel, tail, extra, tone = m.groups()

    # D is matched as part of the tone group but has no meaning here.
    if "D" in tone:
        raise KeyError

    cont = "A" in vowel
    vowel = re.sub("[-*A]", "", vowel)

    if onset == "" and vowel == "" and tail == "F":
        raku = "a"
    elif vowel == "" and (len(tail) == 1 or tail == "PB"):
        # Something like "Ke" instead of "KE".
        raise KeyError
    else:
        # But we do want to write CF syllables as "Kao, Kae, Kaoeu" etc:
        v = "" if vowel == "" and tail else vowels[vowel]
        raku = onsets[onset] + v + tails[tail]

    # G takes precedence when both L and G are pressed.
    if "G" in extra:
        raku += "q"
    elif "L" in extra:
        raku += "m"

    if raku.lstrip("bcdfghjklmnprstꝡz'") not in valid_rimes:
        raise KeyError

    if tone:
        tone_numbers = {"Z": 1, "S": 2, "T": 3, "TS": 4}
        if tone not in tone_numbers:
            raise KeyError
        raku = in_tone(raku, tone_numbers[tone])

    glue = "{^}" if cont else ""
    oaomo = "'" if cont and onset == "" else ""
    return glue + oaomo + raku
# Not referenced elsewhere in this file; presumably read by the Plover
# Python-dictionary loader as the maximum number of strokes passed to
# lookup() -- TODO confirm against the plugin's documentation.
LONGEST_KEY = 4
def lookup(key):
    """Translate a sequence of strokes into Toaq text.

    The first stroke is translated with ``to_toaq``. Each following stroke
    then modifies or extends the word built so far:

    * ``-S``, ``-T``, ``-TS`` re-tone the word to tone 2, 3 or 4;
    * ``-B`` puts a dot below the first vowel and appends ``{^}``;
    * any stroke containing ``A`` is translated and appended;
    * anything else raises ``KeyError``.

    Args:
        key: Indexable sequence of stroke strings.

    Returns:
        The translated word.

    Raises:
        KeyError: If any stroke cannot be translated.
    """
    retone_strokes = (("-S", 2), ("-T", 3), ("-TS", 4))

    result = to_toaq(key[0])
    # For a one-stroke key the slice is empty and the loop is skipped.
    for stroke in key[1:]:
        tone = next(
            (number for name, number in retone_strokes if stroke == name),
            None,
        )
        if tone is not None:
            result = in_tone(result, tone)
            continue
        if stroke == "-B":
            result = underdot(result) + "{^}"
            continue
        if "A" not in stroke:
            raise KeyError
        result += to_toaq(stroke)
    return result
def subsequences(s):
    """Yield every subsequence of ``s`` as a string, shortest first.

    Subsequences keep the characters in their original order. Within one
    length they come out in ``itertools.combinations`` order. The empty
    string is always the first item.
    """
    for size in range(len(s) + 1):
        yield from map("".join, itertools.combinations(s, size))
if __name__ == "__main__":
    # Script mode: enumerate every single-stroke chord, keep the ones that
    # to_toaq() accepts, and dump them to toaq.json as a stroke -> text map.
    import json

    d = {}
    for left in subsequences("STKPWHR"):
        for v in subsequences("AO*EU"):
            # With no middle keys, "-" separates the left-hand keys from
            # the right-hand ones.
            middle = v or "-"
            for right in subsequences("FRPBLGTSZ"):
                # A trailing "-" (nothing on the right) is dropped.
                stroke = (left + middle + right).rstrip("-")
                if stroke in d:
                    print("Duplicate", stroke)
                try:
                    d[stroke] = to_toaq(stroke)
                except KeyError:
                    # Not a valid syllable: leave it out of the dictionary.
                    pass

    # The output contains non-ASCII letters (ensure_ascii=False), so the
    # encoding must be explicit; the locale default is not always UTF-8.
    with open("toaq.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(d, ensure_ascii=False, indent=0))
    print("Done!", len(d))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment