Skip to content

Instantly share code, notes, and snippets.

@dlip
Last active August 29, 2015 14:02
Show Gist options
  • Save dlip/514218da0aa3975fc383 to your computer and use it in GitHub Desktop.
Save dlip/514218da0aa3975fc383 to your computer and use it in GitHub Desktop.
Generates cloze deletions for Japanese from a regular reading card
import io
import re
import sys
# Set to True to read the small sample file 'test.txt' instead of the full
# deck, stop after the first hundred-or-so lines, and interleave diagnostic
# lines (raw reading, extracted furigana/kanji, IGNORED rows) into the output.
debug = False

# 0-based positions of the tab-separated columns this script consumes.
_INDEX_COL = 1
_MEANING_COL = 6
_READING_COL = 8
_AUDIO_COL = 11
_RTK_COL = 22
_MIN_FIELDS = _RTK_COL + 1

# In debug mode, processing stops once the line counter exceeds this value.
_DEBUG_LINE_LIMIT = 100

# A "word" is a run of non-whitespace characters followed by a bracketed
# reading, e.g. KANJI[furigana].  Patterns are compiled once, outside the loop.
_WORD_RE = re.compile(r"[^\s][^\s]*?\[.*?\]")
_FURIGANA_RE = re.compile(r"(?:\[)(.*?)(?:\])")
_KANJI_RE = re.compile(r"(.*?)(?:\[)")


def _build_cloze(reading, used):
    """Wrap every not-yet-known word of *reading* in cloze markers.

    Each new word ``kanji[furigana]`` becomes
    ``{{cN+1::kanji}}[{{cN::furigana}}]`` with N = 1, 3, 5, ... in order of
    appearance.  Newly clozed words are appended to *used* (mutated in place)
    so later occurrences, in this or any following reading, are left alone.

    Returns a tuple ``(result, pieces)`` where *result* is the rewritten
    reading and *pieces* is a list of ``(kanji, furigana)`` pairs, one per
    word that was clozed (empty when nothing was new).
    """
    result = reading
    pieces = []
    cloze_no = 1
    for word in _WORD_RE.findall(reading):
        # NOTE: this is a substring test against every known entry, not an
        # exact-match lookup; kept as-is because entries in the known-words
        # file may legitimately be longer than a single word.
        if any(word in known for known in used):
            continue
        furigana = _FURIGANA_RE.search(word).group(1)
        kanji = _KANJI_RE.search(word).group(1)
        replacement = "{{c%s::%s}}[{{c%s::%s}}]" % (
            cloze_no + 1, kanji, cloze_no, furigana)
        result = result.replace(word, replacement)
        cloze_no += 2
        used.append(word)
        pieces.append((kanji, furigana))
    return result, pieces


def main(source_path=None, out_path="core10kout.txt",
         known_path="knownkanji.txt"):
    """Convert a tab-separated reading deck into cloze cards.

    Reads the known-words list from *known_path*, then walks *source_path*
    row by row.  Rows whose reading contains at least one new word are
    written to *out_path* as
    ``index, cloze reading, meaning, audio, rtk, original reading``
    (tab separated); rows with nothing new are counted as ignored.  Finally
    the (extended) known-words list is written back to *known_path* and the
    ignored count is printed.

    source_path -- input deck; defaults to 'test.txt' when ``debug`` is set,
                   otherwise 'core10k.txt'.
    out_path    -- destination file, overwritten.
    known_path  -- one known word per line; rewritten at the end.

    Returns the number of ignored rows.
    Raises IndexError if a non-blank row has fewer than 23 columns.
    In debug mode the run is aborted via SystemExit once the line limit is
    passed, without updating *known_path*.
    """
    if source_path is None:
        source_path = 'test.txt' if debug else 'core10k.txt'

    # All files are handled as UTF-8 text so the Japanese content does not
    # depend on the platform's default encoding.
    with io.open(known_path, "r", encoding="utf-8") as known_file:
        used = [line.replace('\n', '') for line in known_file]

    ignored = 0
    with io.open(source_path, "r", encoding="utf-8") as source, \
            io.open(out_path, "w", encoding="utf-8") as out:
        for line_no, line in enumerate(source):
            if debug and line_no > _DEBUG_LINE_LIMIT:
                # The with-blocks still close both files on the way out.
                sys.exit()
            if not line.strip():
                # Blank lines (typically at end of file) carry no card.
                continue

            # Drop the line terminator first: if the RTK column is the last
            # one it would otherwise drag a newline into the middle of the
            # output row.
            elements = line.rstrip("\n").split("\t")
            if len(elements) < _MIN_FIELDS:
                raise IndexError(
                    "line %d has %d tab-separated fields, expected at least %d"
                    % (line_no + 1, len(elements), _MIN_FIELDS))
            index = elements[_INDEX_COL]
            meaning = elements[_MEANING_COL]
            reading = elements[_READING_COL]
            audio = elements[_AUDIO_COL]
            rtk = elements[_RTK_COL]

            if debug:
                out.write(reading + "\n")

            result, pieces = _build_cloze(reading, used)

            if debug:
                for kanji, furigana in pieces:
                    out.write(furigana + "\n")
                    out.write(kanji + "\n")

            if pieces:
                out.write("\t".join(
                    (index, result, meaning, audio, rtk, reading)) + "\n")
            else:
                if debug:
                    out.write("IGNORED" + result + "\t" + meaning + "\n")
                ignored += 1

    print('ignored ' + str(ignored))

    # Persist the grown word list so the next run skips what was clozed now.
    with io.open(known_path, "w", encoding="utf-8") as known_file:
        for entry in used:
            known_file.write(entry + "\n")

    return ignored


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment