"""Build ruby/cloze markup for the furigana readings of a tab-separated file.

Each input line is split on tabs; the "reading" column holds text in which
words are annotated as ``word[furigana]``.  The first time an annotated word
is seen it is replaced by a ``<ruby>`` element holding two cloze deletions
(one for the word, one for its furigana).  Words already listed in
``knownkanji.txt`` -- or seen earlier in the same run -- are reduced to the
bare word.  Lines that introduce no new word are not written out; they are
only counted and reported as "ignored".

Files (relative to the working directory):
    core10k.txt     input (``test.txt`` when ``debug`` is true)
    core10kout.txt  output, overwritten on every run
    knownkanji.txt  known words, read at start-up and rewritten at the end
"""

import io
import re
import sys

# Set to True to read test.txt, echo intermediate values into the output file
# and stop after the first 101 input lines.
debug = False

# NOTE(review): the original script processed raw bytes and never named an
# encoding; UTF-8 is assumed here -- confirm against the actual data files.
ENCODING = "utf-8"

INPUT_PATH = "core10k.txt"
DEBUG_INPUT_PATH = "test.txt"
OUTPUT_PATH = "core10kout.txt"
KNOWN_WORDS_PATH = "knownkanji.txt"

# In debug mode, lines with a zero-based index above this are not processed.
DEBUG_LAST_LINE_INDEX = 100

# Zero-based tab-separated columns read from every input line (the names are
# the ones the original script used for them).
COL_INDEX = 1
COL_MEANING = 6
COL_READING = 8
COL_AUDIO = 11
COL_RTK = 22

# A run of non-blank characters followed by a bracketed annotation.  The
# whitespace class is spelled out (ASCII only) so that matching does not
# depend on whether the regex engine treats "\s" as Unicode-aware.
WORD_RE = re.compile(r"[^ \t\n\r\f\v][^ \t\n\r\f\v]*?\[.*?\]")

# Arguments: word, furigana, cloze number for the word, word,
# cloze number for the furigana, furigana.
RUBY_TEMPLATE = (
    '<ruby title="%s(%s)">'
    '<rb><span class="clozewrapper">{{c%s::<span class="basemaru">%s'
    '<!-- end_basemaru_l --></span><!-- end_basemaru_r -->}}'
    '<!-- clozewr --></span><!-- apper --></rb>'
    '<rt><span class="clozewrapper">{{c%s::%s}}'
    '<!-- clozewr --></span><!-- apper --></rt>'
    '</ruby>'
)


def load_known_words(path):
    """Return the lines of *path* as a list, with newline characters removed."""
    with io.open(path, "r", encoding=ENCODING) as handle:
        return [line.replace("\n", "") for line in handle]


def annotate_reading(reading, used):
    """Replace the ``word[furigana]`` annotations in *reading* with markup.

    Args:
        reading: text containing zero or more ``word[furigana]`` annotations.
        used: list of already known annotations.  Every annotation that is
            new gets appended, so it counts as known for the rest of this
            reading and for later calls that share the list.

    Returns:
        A ``(result, new_words)`` tuple.  ``result`` is *reading* with all
        spaces removed, known annotations reduced to the bare word and new
        annotations replaced by ``RUBY_TEMPLATE``.  ``new_words`` lists the
        ``(word, furigana)`` pairs that were new, in order of appearance.
    """
    result = reading.replace(" ", "")
    new_words = []
    cloze = 1
    for annotated in WORD_RE.findall(reading):
        # Text before the first "[" and text between it and the next "]".
        word, _, rest = annotated.partition("[")
        furigana = rest.partition("]")[0]

        # Deliberately a substring test against every known entry, exactly
        # as the original script did (not an exact-match lookup).
        if any(annotated in known for known in used):
            result = result.replace(annotated, word)
            continue

        new_words.append((word, furigana))
        # The furigana gets the lower cloze number, the word the next one.
        markup = RUBY_TEMPLATE % (
            word, furigana, cloze + 1, word, cloze, furigana)
        result = result.replace(annotated, markup)
        cloze += 2
        used.append(annotated)
    return result, new_words


def main():
    """Convert the input file and update the list of known words."""
    source_path = DEBUG_INPUT_PATH if debug else INPUT_PATH
    ignored = 0

    with io.open(source_path, "r", encoding=ENCODING) as source, \
            io.open(OUTPUT_PATH, "w", encoding=ENCODING) as out:
        used = load_known_words(KNOWN_WORDS_PATH)

        for line_index, line in enumerate(source):
            if debug and line_index > DEBUG_LAST_LINE_INDEX:
                # As before, a truncated debug run ends here and therefore
                # never rewrites the known-words file.  The "with" block
                # still closes both files on the way out.
                sys.exit()

            # A blank line has no columns to read; skip it instead of
            # failing with IndexError.
            if not line.rstrip("\r\n"):
                continue

            elements = line.split("\t")
            index = elements[COL_INDEX]
            meaning = elements[COL_MEANING]
            reading = elements[COL_READING]
            audio = elements[COL_AUDIO]
            rtk = elements[COL_RTK]

            if debug:
                out.write(reading + "\n")

            result, new_words = annotate_reading(reading, used)

            if debug:
                for word, furigana in new_words:
                    out.write(furigana + "\n")
                    out.write(word + "\n")

            if new_words:
                record = [index, result, meaning, audio, rtk, reading]
                out.write("\t".join(record) + "\n")
            else:
                if debug:
                    out.write("IGNORED" + result + "\t" + meaning + "\n")
                ignored += 1

    print('ignored ' + str(ignored))

    with io.open(KNOWN_WORDS_PATH, "w", encoding=ENCODING) as known_file:
        for entry in used:
            known_file.write(entry + "\n")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment