Skip to content

Instantly share code, notes, and snippets.

@kanjieater
Created March 22, 2021 01:56
Show Gist options
  • Save kanjieater/9ee06dba78fa323870e4b9b876039e80 to your computer and use it in GitHub Desktop.
Save kanjieater/9ee06dba78fa323870e4b9b876039e80 to your computer and use it in GitHub Desktop.
Create Sentence Cards from Books (WIP) - Better Japanese parsing
import sys
import io
import os
from aqt import mw
from aqt.utils import getFile, tooltip
from aqt.qt import *
# Extend the import path with the "vendor" directory that sits next to this
# file, so modules placed there can be imported (presumably where nltk,
# which main() imports lazily, is bundled -- TODO confirm).
sys.path.append(os.path.join(os.path.dirname(__file__), "vendor"))
# The triple-quoted string below is a bare string expression: it is evaluated
# and discarded, and none of the code inside it runs. It looks like a
# development snippet for re-importing this module by its numeric name --
# presumably meant to be pasted into a debug console; TODO confirm.
'''
from importlib import reload
gd = __import__('1200382751')
reload(gd)
'''
def _select_import_deck(path, model):
    """Create/select the deck named after *path* and pair it with *model*.

    The deck name is the fixed prefix plus the file's base name without its
    extension. The deck's 'mid' is set to the model's id and the model's
    'did' to the deck id, and both are saved. Returns the deck id.
    """
    filename = os.path.splitext(os.path.basename(path))[0]
    did = mw.col.decks.id(f'!優先::Y メディア::本::{filename}')
    mw.col.decks.select(did)
    deck = mw.col.decks.get(did)
    deck['mid'] = model['id']
    mw.col.decks.save(deck)
    model['did'] = did
    mw.col.models.save(model)
    return did


def main():
    """Import a text file chosen by the user as suspended sentence notes.

    Prompts for a ``*.txt`` file, reads it as UTF-8 and walks it line by
    line. Every non-blank line is split with the sentence regex; only text
    that ends in one of the regex's terminator characters produces a match,
    so a line without a terminator yields no notes. For each match a note is
    created with:

    * ``Expression``: the stripped sentence,
    * ``Meta``: ``"<zero-based line index>/<total line count>"``,
    * tag ``mm``,

    and all of the note's cards are suspended right after it is added.

    Returns None. Does nothing if the file dialog is cancelled, and shows a
    tooltip without importing anything if there is no note type named
    "Japanese".
    """
    file = getFile(mw, "Import", None, filter="*.txt", key="1200382751")
    if not file:
        return

    # Look the note type up before touching the collection: without it the
    # import cannot work, and failing later would leave a new deck behind.
    model = mw.col.models.byName("Japanese")
    if model is None:
        tooltip('Note type "Japanese" not found; nothing was imported.')
        return

    # Kept as a function-level import, as in the original code, so that
    # nltk is only loaded when an import is actually started.
    import nltk

    with open(file, 'r', encoding='utf-8') as f_txt:
        lines = f_txt.read().splitlines()
    total = len(lines)

    # Loop-invariant: build the tokenizer once instead of once per line.
    tokenizer = nltk.RegexpTokenizer(u'[^ 「」!?。.)]*[!?。]')

    cnt = 0
    deck_ready = False
    mw.progress.start(label='Parsing Sentences...\n ', max=total, min=1, immediate=True)
    try:
        for i, p in enumerate(lines):
            # The bar's maximum is the line count, so it must advance per
            # line (not per created note) to stay within range.
            mw.progress.update(value=i + 1, label=f'Parsing Sentences...\n{i}/{total} lines')
            if not p.strip():
                continue
            if not deck_ready:
                # Deferred until the first non-blank line so that a file
                # with no content does not create an empty deck.
                _select_import_deck(file, model)
                deck_ready = True
            for s in tokenizer.tokenize(p):
                n = mw.col.newNote()
                n["Expression"] = s.strip()
                n["Meta"] = f'{i}/{total}'
                n.addTag('mm')
                mw.col.addNote(n)
                # New cards start suspended.
                mw.col.sched.suspendCards([c.id for c in n.cards()])
                cnt += 1
    finally:
        # Always close the progress dialog, even if the import fails midway.
        mw.progress.finish()
    mw.reset()
    tooltip("Imported {} notes.".format(cnt))
# Create a "Create Sentence Cards" action parented to mw, run main() whenever
# it is triggered, and add it to the menu exposed as mw.form.menuTools.
action = QAction("Create Sentence Cards", mw)
action.triggered.connect(main)
mw.form.menuTools.addAction(action)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment