Skip to content

Instantly share code, notes, and snippets.

@AlexApps99
Created December 14, 2020 05:04
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AlexApps99/7732e746ac791959a46d0a00fc4d5d6a to your computer and use it in GitHub Desktop.
Save AlexApps99/7732e746ac791959a46d0a00fc4d5d6a to your computer and use it in GitHub Desktop.
Generate an Anki deck from a PDF file of Remembering the Kanji 1, 6th edition
#!/usr/bin/env python3
# By AlexApps99
# This Python script will generate an Anki deck from a PDF file of Remembering the Kanji 1, 6th edition.
# To use it, just use the command "python3 create_rtk_deck.py RTK.pdf RTK.apkg"
# Feel free to customize/modify it however you want, public domain code.
# There is unused functionality for getting stories/text too.
# If you have any questions on how to use this feel free to contact me.
# Dependencies:
# - genanki
# - beautifulsoup4
# - Poppler (for pdftohtml command)
import unicodedata
from bs4 import BeautifulSoup
def is_kanji(text):
    """Return True if *text* is a single CJK or Kangxi-radical character.

    The decision is based on the character's official Unicode name: it must
    start with "CJK " or "KANGXI ".

    Anything that is not exactly one character (empty string, longer string,
    non-string) yields False, as does a character that has no Unicode name
    (e.g. private-use code points or control characters).
    """
    try:
        # Supplying a default avoids the ValueError that unicodedata.name()
        # raises for unnamed characters; TypeError still covers input that is
        # not a one-character string.
        name = unicodedata.name(text, "")
    except TypeError:
        return False
    return name.startswith(("CJK ", "KANGXI "))
# Font ids as they appear in the "font" attribute of <text> elements, which is
# always a string, so the numeric ids are converted to str here.
# Text in one of these fonts is accumulated into an entry's "text" field.
text_fonts = [str(n) for n in (6, 7, 11, 12, 16, 25, 28, 29, 33)]
# Text in one of these fonts is accumulated into an entry's "annotation" field.
annotation_fonts = [str(n) for n in (9, 10, 17, 18, 21, 24, 32)]
def parse(f):
    """Extract the numbered entries from pdftohtml XML output.

    Every <text> element is classified by its "font" attribute:

    - font "3" with purely decimal text starts a new entry and sets "id"
    - font "4" or "26" sets the entry's "keyword"
    - font "5" holding a single kanji (see is_kanji) sets "kanji"
    - fonts in text_fonts are appended, space-separated, to "text"
    - fonts in annotation_fonts are appended, space-separated, to "annotation"

    Elements seen before the first entry number are ignored.

    f is a file object or string containing the XML document.

    Returns a list of dicts in document order. Each dict has an "id" (int);
    the other keys are present only if matching text was found.
    """
    soup = BeautifulSoup(f, "xml")
    parsed = []
    current = {}
    for text in soup.find_all("text"):
        t = text.get_text().strip()
        font = text["font"]
        if font == "3" and t.isdecimal():
            # A new entry number closes the entry collected so far.
            if current:
                parsed.append(current)
            current = {"id": int(t)}
        elif "id" not in current:
            # Nothing is recorded until the first entry number has been seen.
            continue
        elif font in ("4", "26"):
            current["keyword"] = t
        elif font == "5" and is_kanji(t):
            current["kanji"] = t
        elif font in text_fonts:
            if "text" in current:
                current["text"] += " " + t
            else:
                current["text"] = t
        elif font in annotation_fonts:
            if "annotation" in current:
                current["annotation"] += " " + t
            else:
                current["annotation"] = t
    # Only keep the final entry if there is one; an input without any entry
    # numbers yields an empty list rather than a list holding an empty dict.
    if current:
        parsed.append(current)

    # Some kanji are not formatted properly and need to be fixed manually.
    # Keys are positions in `parsed` (0-based), not entry ids.
    manual_kanji = {
        128: "\u55c5",
        307: "\u55bb",
        679: "\u60e7",
        1010: "\u41f3",
        1393: "\u9699",
        2004: "\u540e",
    }
    for index, kanji in manual_kanji.items():
        # Skip positions that do not exist so a shorter input (for example a
        # different page range) does not raise IndexError.
        if index < len(parsed):
            parsed[index].setdefault("kanji", kanji)
    return parsed
if __name__ == "__main__":
    # Command-line entry point: convert the PDF to XML with Poppler's
    # pdftohtml, parse the entries, and write them out as an Anki package.
    import genanki
    import subprocess
    from sys import argv

    if len(argv) != 3:
        print("Usage:", argv[0], "[RTK1 6th edition].pdf [Anki package].apkg")
        # Stop here: without exactly two paths the steps below would fail
        # with an IndexError or operate on the wrong files.
        raise SystemExit(1)

    # Convert pages 20-401 of the PDF; the result is read back from rtk.xml in
    # the current directory. check=True aborts if pdftohtml fails.
    subprocess.run(["pdftohtml", "-f", "20", "-l", "401", "-q", "-s", "-i", "-xml", argv[1], "rtk.xml"], check=True)
    # Read as UTF-8 explicitly so the kanji do not depend on the platform's
    # default encoding (pdftohtml emits UTF-8 unless -enc says otherwise).
    with open("rtk.xml", encoding="utf-8") as f:
        p = parse(f)

    # The numeric ids are fixed so that regenerating the package refers to the
    # same deck and note type.
    deck = genanki.Deck(
        1196528216,
        "RTK1",
        "Generated from Remembering the Kanji 1, 6th edition",
    )
    model = genanki.Model(
        1530649655,
        "RTK1",
        fields=[
            {"name": "Index"},
            {"name": "Keyword"},
            {"name": "Kanji"},
        ],
        templates=[{
            "name": "Recall",
            "qfmt": "{{Keyword}}",
            "afmt": "{{FrontSide}}<hr id=\"answer\">{{Kanji}}<br><small>{{Index}}</small>",
        }],
        css="",
    )
    # One note per parsed entry: keyword on the front, kanji and index on the
    # back, sorted by entry id.
    for e in p:
        deck.add_note(genanki.Note(
            model=model,
            fields=[str(e["id"]), e["keyword"], e["kanji"]],
            sort_field=e["id"],
        ))
    genanki.Package(deck).write_to_file(argv[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment