Last active
May 27, 2022 17:52
-
-
Save ballgoesvroomvroom/5c67d0e4a2fcaf8ee3201fbd4ba9f9c1 to your computer and use it in GitHub Desktop.
Parser for CC-CEDICT entries that stores a mapping from simplified Chinese words to their pinyin, written with Unicode tone marks instead of tone numbers.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Build a JSON map from simplified Chinese words to tone-marked pinyin.

Reads the CC-CEDICT dictionary file "cedict_ts.u8", converts the numbered
pinyin of every entry (e.g. "Zhong1 guo2") to pinyin with Unicode tone marks
(e.g. "Zhōng guó") and writes the resulting mapping to "o.json".
"""
import json
import re

## group 1 is the simplified form, group 2 is the pinyin
## (raw string so that \s and \[ are regex escapes, not invalid string escapes)
LINE_SPLIT = re.compile(r"^.*?\s(.*?)\s\[(.*?)\].*?$")

## tone-marked form of every letter that can carry a tone, indexed by tone - 1
TONE_MAP = {
    "a": ["ā", "á", "ǎ", "à"],
    "e": ["ē", "é", "ě", "è"],
    "i": ["ī", "í", "ǐ", "ì"],
    "o": ["ō", "ó", "ǒ", "ò"],
    "u": ["ū", "ú", "ǔ", "ù"],
    "ü": ["ǖ", "ǘ", "ǚ", "ǜ"],
    ## syllabic nasals used by interjections such as "m2" or "ng2"; spelled
    ## with escapes because most of them are base letter + combining mark
    ## (macron U+0304, caron U+030C, grave U+0300)
    "m": ["m\u0304", "\u1e3f", "m\u030c", "m\u0300"],
    "n": ["n\u0304", "\u0144", "\u0148", "\u01f9"],
}


def find_tone_index(syllable):
    """Return the index of the letter in *syllable* that carries the tone mark.

    *syllable* is a pinyin syllable without its tone digit and with "u:"
    already turned into "ü". Returns -1 when no letter can take a mark.
    """
    lowered = syllable.lower()
    ## "a" and "e" always take the mark; in "ou" the mark goes on the "o"
    for target in ("a", "e", "ou"):
        index = lowered.find(target)
        if index != -1:
            return index
    ## otherwise the last vowel takes it ("liu" -> u, "gui" -> i)
    for index in range(len(lowered) - 1, -1, -1):
        if lowered[index] in "iouü":
            return index
    ## no vowel at all: fall back to a syllabic nasal ("m", "ng", "hng")
    for index, letter in enumerate(lowered):
        if letter in "mn":
            return index
    return -1


def convert_syllable(p):
    """Convert one numbered pinyin token (e.g. "lu:e4") to tone-marked form.

    Tokens that do not end in a tone digit 1-5 (single letters, punctuation)
    are returned unchanged. Tone 5 (neutral) only drops the digit.
    """
    if len(p) < 2 or p[-1] not in "12345":
        return p
    tone = int(p[-1])
    ## CC-CEDICT spells ü as "u:"; this also covers "lu:e4" / "nu:e4"
    base = p[:-1].replace("u:", "ü").replace("U:", "Ü")
    if tone == 5:
        return base
    index = find_tone_index(base)
    if index == -1:
        ## nothing can carry the mark; keep the letters rather than crash
        return base
    letter = base[index]
    marked = TONE_MAP[letter.lower()][tone - 1]
    ## preserve case (str.upper handles the accented characters properly)
    if letter.isupper():
        marked = marked.upper()
    ## replace only the chosen position, not every occurrence of the letter
    return base[:index] + marked + base[index + 1:]


def convert_pinyin(pinyin):
    """Convert a space-separated numbered pinyin string to tone-marked pinyin."""
    return " ".join(convert_syllable(token) for token in pinyin.split(" "))


def parse_line(line):
    """Return (simplified, pinyin) for a dictionary line, or None.

    None is returned for blank lines, "#" comment lines and lines that do not
    match LINE_SPLIT.
    """
    line = line.rstrip("\r\n")
    if not line or line.startswith("#"):
        ## no actual data; ignore it
        return None
    matched = LINE_SPLIT.match(line)
    if matched is None:
        return None
    return matched.group(1), matched.group(2)


def build_mapping(lines):
    """Map each simplified word in *lines* to its tone-marked pinyin.

    When a word occurs more than once, the first entry wins (assumed to be
    the default reading of the word).
    """
    result = {}
    for line in lines:
        entry = parse_line(line)
        if entry is None:
            continue
        simplified, pinyin = entry
        if simplified in result:
            ## we don't want duplicates, keep the first reading
            continue
        result[simplified] = convert_pinyin(pinyin)
    return result


def main():
    """Read "cedict_ts.u8", convert it and write the mapping to "o.json"."""
    with open("cedict_ts.u8", "r", encoding="utf-8") as f:
        result = build_mapping(f)

    ## "w" creates the file if needed and truncates stale content
    with open("o.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(result))

    print("Success!")


if __name__ == "__main__":
    main()

## Testing (run manually once "o.json" has been generated):
##
## with open("o.json", "r", encoding="utf-8") as f:
##     d = json.load(f)
## print("Querying")
## print(d["㺵"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment