Skip to content

Instantly share code, notes, and snippets.

@ballgoesvroomvroom
Last active May 27, 2022 17:52
Show Gist options
  • Save ballgoesvroomvroom/5c67d0e4a2fcaf8ee3201fbd4ba9f9c1 to your computer and use it in GitHub Desktop.
Save ballgoesvroomvroom/5c67d0e4a2fcaf8ee3201fbd4ba9f9c1 to your computer and use it in GitHub Desktop.
Parser for CC-CEDICT entries that stores a mapping from each simplified-character word to its pinyin, written with the correct Unicode tone marks.
import json
import re
# Matches one CC-CEDICT entry of the form
#     Traditional Simplified [pin1 yin1] /gloss/...
# Group 1 is the simplified form, group 2 is the numbered pinyin inside the
# brackets. A raw string is used so "\s" and "\[" are regex escapes, not
# (invalid) Python string escapes.
LINE_SPLIT = re.compile(r"^.*?\s(.*?)\s\[(.*?)\].*?$")

# Tone-marked forms for tones 1-4, indexed by (tone - 1) and keyed by the
# lower-case base letter. "m" and "n" cover vowel-less interjections; their
# forms are spelled with escapes because most of them are a base letter
# followed by a combining mark (macron U+0304, caron U+030C, grave U+0300).
TONE_MAP = {
    "a": ["ā", "á", "ǎ", "à"],
    "e": ["ē", "é", "ě", "è"],
    "i": ["ī", "í", "ǐ", "ì"],
    "o": ["ō", "ó", "ǒ", "ò"],
    "u": ["ū", "ú", "ǔ", "ù"],
    "ü": ["ǖ", "ǘ", "ǚ", "ǜ"],
    "m": ["m\u0304", "\u1e3f", "m\u030c", "m\u0300"],
    "n": ["n\u0304", "\u0144", "\u0148", "\u01f9"],
}


def find_tone_index(syllable):
    """Return the index of the letter in *syllable* that takes the tone mark.

    *syllable* is a single pinyin syllable without its tone digit and with
    "u:" already converted to "ü". The placement rules are:

    1. "a" or "e" takes the mark if present.
    2. Otherwise, in "ou" the "o" takes the mark.
    3. Otherwise the last vowel takes the mark ("gui" -> i, "liu" -> u).
    4. With no vowel at all, fall back to a syllabic "m" or "n".

    Returns -1 when no letter can carry a mark.
    """
    lowered = syllable.lower()
    for letter in ("a", "e"):
        position = lowered.find(letter)
        if position != -1:
            return position
    position = lowered.find("ou")
    if position != -1:
        return position
    for position in range(len(lowered) - 1, -1, -1):
        if lowered[position] in "iouü":
            return position
    for letter in ("m", "n"):
        position = lowered.find(letter)
        if position != -1:
            return position
    return -1


def convert_syllable(token):
    """Convert one numbered-pinyin token (e.g. "hao3") to tone-marked form.

    Tokens that do not end in a tone digit 1-5 (punctuation, single Latin
    letters, abbreviations such as "OK") are returned unchanged, as are
    tokens in which no letter can carry a tone mark. Tone 5 (neutral) only
    drops the digit. The case of the marked letter is preserved.
    """
    if len(token) < 2 or token[-1] not in "12345":
        return token
    tone = int(token[-1])
    # CC-CEDICT writes ü as "u:"; convert it wherever it occurs so that
    # syllables such as "lu:e4" do not keep a stray colon.
    body = token[:-1].replace("u:", "ü").replace("U:", "Ü")
    if tone == 5:
        return body
    index = find_tone_index(body)
    if index == -1:
        return token
    base = body[index]
    marked = TONE_MAP[base.lower()][tone - 1]
    if base.isupper():
        marked = marked.upper()
    # Replace by position so only the chosen letter is changed.
    return body[:index] + marked + body[index + 1:]


def convert_pinyin(pinyin):
    """Convert a space-separated numbered-pinyin string to tone-marked pinyin."""
    return " ".join(convert_syllable(token) for token in pinyin.split())


def parse_entries(lines):
    """Build a mapping of simplified word -> tone-marked pinyin.

    *lines* is any iterable of CC-CEDICT lines (with or without trailing
    newlines). Blank lines, "#" comment lines and lines that do not match
    LINE_SPLIT are skipped. When a word appears more than once, the first
    entry wins (assumed to be the default reading).
    """
    result = {}
    for raw_line in lines:
        line = raw_line.rstrip("\r\n")
        if not line.strip() or line.startswith("#"):
            continue
        match = LINE_SPLIT.match(line)
        if match is None:
            continue
        simplified, pinyin = match.group(1), match.group(2)
        if simplified in result:
            continue
        result[simplified] = convert_pinyin(pinyin)
    return result


if __name__ == "__main__":
    with open("cedict_ts.u8", "r", encoding="utf-8") as f:
        result = parse_entries(f)

    # "w" creates the file when missing and truncates any previous content;
    # "r+" would fail on a missing file and leave stale trailing bytes.
    with open("o.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(result))

    print("Success!")

# Manual check after running the script:
#
#     with open("o.json", "r", encoding="utf-8") as f:
#         d = json.load(f)
#     print("Querying")
#     print(d["㺵"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment