Skip to content

Instantly share code, notes, and snippets.

@SecondReality
Created January 22, 2018 03:51
Show Gist options
  • Save SecondReality/6b8da11ef86f5ce5dbbd53eb6305a014 to your computer and use it in GitHub Desktop.
Save SecondReality/6b8da11ef86f5ce5dbbd53eb6305a014 to your computer and use it in GitHub Desktop.
import re
from enum import Enum
import csv
# Folder that holds the input text file. It is joined to `filename` by plain
# string concatenation, so it has to end with a path separator.
# NOTE(review): "FOLDERPATH" looks like a placeholder - presumably this is
# meant to be edited before the script is run.
txtFolder = "C:\\FOLDERPATH\\"
# Name of the text file to parse, located inside txtFolder.
filename = "Kanji_Learner_s_Course_Graded_Reading_Sets_vol_1.txt"
class ParseMode(Enum):
    """State of the line parser: which kind of line is expected next."""

    # Expecting a chapter heading ("<number>:<text>") or an entry line
    # ("<chapter>-<entry>.<sentence>"); any other line is kept as
    # expanded-translation text of the current entry.
    ChapterOrEntryLine = 0
    # Collecting the lines that follow an entry line, until a line that is
    # mostly ASCII letters shows up.
    ExpandedJapanese = 1
    # Collecting translation lines, until one ends with closing punctuation.
    Translation = 2
class Entry:
    """One numbered entry ("<chapter>-<entry>. <sentence>") read from the text.

    All attributes start out empty and are filled in by the parsing loop.
    """

    def __init__(self):
        # Text that followed the colon on the most recent chapter heading line.
        self.chapterKanji = ""
        # Chapter and entry numbers taken from the entry line; -1 means unset.
        self.chapterNumber = -1
        self.entryNumber = -1
        # Sentence text that followed "<chapter>-<entry>." on the entry line.
        self.japaneseSentence = ""
        # Raw lines after the entry line, up to the first mostly-ASCII line.
        self.expandedJapanese = []
        # Raw lines of the translation, ending at a line that closes with
        # sentence-final punctuation.
        self.translation = []
        # Raw lines after the translation that are neither a chapter heading
        # nor a new entry line.
        self.expandedTranslation = []
# Input handle; the parsing loop reads all of its lines.
file = open(txtFolder + filename, mode="r", encoding="utf-8")

# Mutable parser state shared with the parsing loop.
entries = list()                          # finished Entry objects, in file order
currentChapter = currentEntryNumber = 0   # last chapter / entry numbers seen
entry = None                              # Entry being filled in; None before the first one
chapterKanji = ""                         # text of the most recent chapter heading
parseMode = ParseMode.ChapterOrEntryLine  # the parser starts out expecting a heading or entry
def ps(str):
    """Return *str* with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is removed as well, because
    ``str.split()`` with no arguments drops empty fields at both ends.
    The parameter keeps its original name (it shadows the builtin only
    inside this function) so existing keyword callers keep working.
    """
    return " ".join(str.split())
# Line-by-line state machine that turns the input text into Entry objects.
#
# Each line is handled according to parseMode:
#   ChapterOrEntryLine - a chapter heading updates the chapter counters, an
#                        entry line starts a new Entry, anything else is kept
#                        as expanded-translation text of the current Entry.
#   ExpandedJapanese   - lines are collected until one is more than 30% ASCII
#                        letters; that line switches to Translation mode.
#   Translation        - lines are collected until one ends with one of
#                        ".)?!…", which returns to ChapterOrEntryLine.

# Compile the two line patterns once instead of on every iteration.
_chapterPattern = re.compile(r'(\d+)\s*:(.+)')
_entryPattern = re.compile(r'(\d+)-(\d+)\.(.+)')

# Read everything up front so the handle is closed even if parsing fails.
with file:
    _rawLines = file.readlines()

# `index` is the 1-based line number, used only in diagnostics.
for index, rawline in enumerate(_rawLines, start=1):
    # Form feeds are page-break residue, not content.
    line = rawline.replace("\f", "")

    if parseMode == ParseMode.ChapterOrEntryLine:
        match = _chapterPattern.match(line)
        if match:
            chapter = int(match.group(1))
            chapterKanji = match.group(2)
            # Chapters must repeat or advance by exactly one.
            assert currentChapter == chapter or currentChapter + 1 == chapter
            currentChapter = chapter
            print("Match: " + line)
            currentEntryNumber = 0
        else:
            match = _entryPattern.match(line)
            if match:
                chapterNumber = int(match.group(1))
                entryNumber = int(match.group(2))
                japaneseSentence = match.group(3).strip()
                # The message used to be built with str + int, which raised
                # TypeError instead of reporting the mismatch.
                assert currentChapter == chapterNumber, f"{currentChapter} {chapterNumber}"
                currentEntryNumber = currentEntryNumber + 1
                assert entryNumber == currentEntryNumber, f"Entry Number {entryNumber} Current Entry Number {currentEntryNumber} Line {index}"
                parseMode = ParseMode.ExpandedJapanese
                # A new entry line finishes the previous entry.
                if entry is not None:
                    print(f"Entry: {entry.chapterNumber}-{entry.entryNumber} {entry.japaneseSentence} {entry.expandedJapanese} {entry.translation}")
                    entries.append(entry)
                entry = Entry()
                entry.chapterKanji = chapterKanji
                entry.chapterNumber = chapterNumber
                entry.entryNumber = entryNumber
                entry.japaneseSentence = japaneseSentence
            elif entry is not None:
                # Neither heading nor entry: trailing text of the current entry.
                entry.expandedTranslation.append(line)
    elif parseMode == ParseMode.ExpandedJapanese:
        if entry.expandedJapanese:
            alphaCount = sum('a' <= c <= 'z' or 'A' <= c <= 'Z' for c in line)
            # `line` can be empty (a final line holding only a form feed);
            # guard the division so that case does not raise.
            if line and alphaCount / len(line) > 0.3:
                parseMode = ParseMode.Translation
            else:
                entry.expandedJapanese.append(line)
        else:
            # The first line after an entry line is always taken as-is.
            entry.expandedJapanese.append(line)

    # Deliberately a separate `if`, not `elif`: the line that triggered the
    # switch to Translation above is itself the first translation line.
    if parseMode == ParseMode.Translation:
        entry.translation.append(line)
        stripped = line.strip()
        if stripped and stripped[-1] in ".)?!…":
            parseMode = ParseMode.ChapterOrEntryLine

# Flush the entry still being built; skip it when nothing was parsed, so
# that `entries` never contains None.
if entry is not None:
    entries.append(entry)
# Write one fully quoted CSV row per parsed entry to klc1.csv in the
# current working directory.
with open('klc1.csv', 'w', newline='', encoding="utf-8") as csvfile:
    csvWriter = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_ALL)
    for entry in entries:
        csvWriter.writerow([
            # Id: zero-padded chapter and entry numbers, four digits each.
            f"{entry.chapterNumber:04d}{entry.entryNumber:04d}",
            entry.chapterKanji,
            entry.chapterNumber,
            entry.entryNumber,
            # Multi-line fields are joined and whitespace-normalised by ps().
            ps(entry.japaneseSentence),
            ps(''.join(entry.expandedJapanese)),
            ps(''.join(entry.translation)),
            ps(''.join(entry.expandedTranslation)),
        ])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment