## ballgoesvroomvroom/parser.py

## parser.py
import json
import re

## Expected line shape: "<traditional> <simplified> [<pin1 yin1>] /gloss/".
## Raw string so that \s and \[ reach the regex engine untouched.
LINE_SPLIT = re.compile(r"^.*?\s(.*?)\s\[(.*?)\].*?$") ## group 1 is the simplified, group 2 is the pinyin

## letter -> its four tone-marked forms, indexed by (tone - 1)
TONE_MAP = {
	"a": ["ā", "á", "ǎ", "à"],
	"e": ["ē", "é", "ě", "è"],
	"i": ["ī", "í", "ǐ", "ì"],
	"o": ["ō", "ó", "ǒ", "ò"],
	"u": ["ū", "ú", "ǔ", "ù"],
	"ü": ["ǖ", "ǘ", "ǚ", "ǜ"],
	## third tone is m + U+030C (combining caron), matching the caron used by every other entry
	"m": ["m̄", "ḿ", "m\u030c", "m̀"]
}


def find_mark_index(syllable):
	"""Return the index of the letter that carries the tone mark, or -1.

	`syllable` must be lower case, without the trailing tone digit and with
	"u:" already rewritten as "ü".

	Placement follows "Rules for placing the tone mark" in
	https://en.wikipedia.org/wiki/Pinyin : "a" or "e" takes the mark; in "ou"
	the "o" takes it; otherwise the last vowel takes it. A syllable without
	any vowel falls back to "m", the only consonant TONE_MAP can mark.
	"""
	for target in ("a", "e", "ou"):
		index = syllable.find(target)
		if index != -1:
			return index

	## position in the syllable decides, not the order the vowels are listed in
	index = max(syllable.rfind(v) for v in "iouü")
	if index != -1:
		return index

	return syllable.find("m")


def convert_syllable(p):
	"""Convert one numbered syllable (e.g. "guo2") to its tone-marked form.

	Returns `p` unchanged when it is empty, a single character, does not end
	in a tone digit 1-5, or contains no letter that TONE_MAP can mark.
	Tone 5 (neutral) only drops the digit. Letter case is preserved.
	"""
	if len(p) <= 1 or p[-1] not in "12345":
		return p

	tone = int(p[-1])
	## "u:" is the ASCII spelling of ü; rewrite it wherever it occurs (e.g. "lu:e4")
	body = p[:-1].replace("u:", "ü").replace("U:", "Ü")
	if tone == 5:
		return body ## no tone needed, as is

	index = find_mark_index(body.lower())
	if index == -1:
		return p ## nothing to mark; keep the digit rather than lose the tone

	letter = body[index]
	marked = TONE_MAP[letter.lower()][tone - 1]
	if letter.isupper():
		marked = marked.upper() ## preserve case

	## splice at the chosen position only, instead of replacing every occurrence
	return body[:index] + marked + body[index + 1:]


def convert_pinyin(pinyin):
	"""Convert a space-separated numbered pinyin reading syllable by syllable."""
	return " ".join(convert_syllable(p) for p in pinyin.split(" "))


def parse_lines(lines):
	"""Build a {simplified: tone-marked pinyin} dict from dictionary lines.

	Blank lines, lines starting with "#" and lines that do not match
	LINE_SPLIT are skipped. When a word appears more than once the first
	reading is kept.
	"""
	result = {}
	for line in lines:
		if not line.strip() or line.startswith("#"):
			## no actual data; ignore it
			continue

		d = LINE_SPLIT.match(line)
		if d is None:
			continue

		simplified, pinyin = d.group(1), d.group(2)
		if simplified in result:
			## we don't want duplicates, fall back to the first entry of the word
			continue

		result[simplified] = convert_pinyin(pinyin)
	return result


def main():
	"""Read "cedict_ts.u8", convert every entry and write the map to "o.json"."""
	with open("cedict_ts.u8", "r", encoding="utf-8") as f:
		lines = f.read().split("\n")

	result = parse_lines(lines)

	## "w" creates the file if needed and truncates it, so no stale bytes from
	## a previous, longer run can trail the new JSON
	with open("o.json", "w", encoding="utf-8") as f:
		f.write(json.dumps(result))
	print("Success!")


if __name__ == "__main__":
	main()

## Manual check after a run:
##
## with open("o.json", "r", encoding="utf-8") as f:
## 	d = json.load(f)
## 	print("Querying")
## 	print(d["㺵"])