AlexApps99/create_rtk_deck.py

## create_rtk_deck.py
#!/usr/bin/env python3
# By AlexApps99
# This Python script will generate an Anki deck from a PDF file of Remembering the Kanji 1, 6th edition.
# To use it, just use the command "python3 create_rtk_deck.py RTK.pdf RTK.apkg"
# Feel free to customize/modify it however you want, public domain code.
# There is unused functionality for getting stories/text too.
# If you have any questions on how to use this feel free to contact me.
# Dependencies:
# - genanki
# - beautifulsoup4
# - Poppler (for pdftohtml command)
import unicodedata
from bs4 import BeautifulSoup

def is_kanji(text):
    """Return True if *text* is a single CJK or Kangxi-radical character.

    The decision is based on the character's Unicode name: names starting
    with "CJK " or "KANGXI " count as kanji.

    Anything that is not exactly one character (empty string, longer string,
    non-string) and any character without a Unicode name (control characters,
    private-use or unassigned code points) yields False instead of raising.
    """
    try:
        # The "" default suppresses the ValueError that unicodedata.name()
        # raises for characters that have no name.
        name = unicodedata.name(text, "")
    except TypeError:
        # Raised when *text* is not a one-character string.
        return False
    return name.startswith(("CJK ", "KANGXI "))

# Values of the <text> "font" attribute (as strings) whose content parse()
# appends to an entry's "text" field.
# NOTE(review): presumably specific to the pdftohtml output of the targeted
# PDF edition - confirm before reusing with another file.
text_fonts = ["6", "7", "11", "12", "16", "25", "28", "29", "33"]
# Values of the <text> "font" attribute (as strings) whose content parse()
# appends to an entry's "annotation" field.
annotation_fonts = ["9", "10", "17", "18", "21", "24", "32"]

def parse(f):
    """Extract the kanji entries from pdftohtml XML output.

    *f* is the XML markup (an open file or a string) handed to BeautifulSoup.
    Every ``<text>`` element is classified by its ``font`` attribute:

    - font "3" holding a decimal number starts a new entry and sets "id";
    - font "4" or "26" sets "keyword";
    - font "5" holding a single kanji sets "kanji";
    - fonts listed in ``text_fonts`` are appended, space separated, to "text";
    - fonts listed in ``annotation_fonts`` are appended to "annotation".

    Elements seen before the first entry number are ignored, as are elements
    whose font matches none of the above.

    Returns a list of dicts in document order. Every dict has an "id"; the
    other keys are present only if matching elements were found. An input
    without any entry yields an empty list.
    """
    soup = BeautifulSoup(f, "xml")
    parsed = []
    current = {}
    for node in soup.find_all("text"):
        content = node.get_text().strip()
        # .get() tolerates a <text> element that has no font attribute.
        font = node.get("font")
        if font == "3" and content.isdecimal():
            if current:
                parsed.append(current)
            current = {"id": int(content)}
            continue
        if "id" not in current:
            # Nothing can be attached before the first entry number.
            continue
        if font in ("4", "26"):
            current["keyword"] = content
        elif font == "5" and is_kanji(content):
            current["kanji"] = content
        elif font in text_fonts:
            if "text" in current:
                current["text"] += " " + content
            else:
                current["text"] = content
        elif font in annotation_fonts:
            if "annotation" in current:
                current["annotation"] += " " + content
            else:
                current["annotation"] = content
    if current:
        # Flush the last entry; skipped when no entry was ever started.
        parsed.append(current)

    # Some kanji are not formatted properly and need to be fixed manually.
    # Keys are 0-based positions in `parsed`; positions beyond the end of a
    # shorter (partial) document are skipped rather than raising IndexError.
    manual_kanji = {
        128: "\u55c5",
        307: "\u55bb",
        679: "\u60e7",
        1010: "\u41f3",
        1393: "\u9699",
        2004: "\u540e",
    }
    for index, kanji in manual_kanji.items():
        if index < len(parsed):
            # Only fill the gap; a kanji found by the parser wins.
            parsed[index].setdefault("kanji", kanji)

    return parsed


if __name__ == "__main__":
    # Script entry point: convert the PDF to XML with pdftohtml, parse the
    # entries out of it and write them to an Anki package.
    import subprocess
    import sys

    import genanki

    if len(sys.argv) != 3:
        print("Usage:", sys.argv[0], "[RTK1 6th edition].pdf [Anki package].apkg", file=sys.stderr)
        # Exit instead of carrying on with missing or surplus arguments.
        sys.exit(1)
    pdf_path, apkg_path = sys.argv[1], sys.argv[2]

    # Pages 20-401, quiet, single document, images ignored, XML output.
    # check=True aborts with CalledProcessError if pdftohtml fails.
    subprocess.run(
        ["pdftohtml", "-f", "20", "-l", "401", "-q", "-s", "-i", "-xml", pdf_path, "rtk.xml"],
        check=True,
    )
    # Read as UTF-8 explicitly so the kanji do not depend on the locale's
    # default encoding (pdftohtml emits UTF-8 unless told otherwise).
    with open("rtk.xml", encoding="utf-8") as f:
        p = parse(f)

    deck = genanki.Deck(
        1196528216,
        "RTK1",
        "Generated from Remembering the Kanji 1, 6th edition",
    )
    model = genanki.Model(
        1530649655,
        "RTK1",
        fields=[
            {"name": "Index"},
            {"name": "Keyword"},
            {"name": "Kanji"},
        ],
        templates=[{
            "name": "Recall",
            "qfmt": "{{Keyword}}",
            "afmt": "{{FrontSide}}<hr id=\"answer\">{{Kanji}}<br><small>{{Index}}</small>",
        }],
        css="",
    )
    # One note per parsed entry: keyword on the front, kanji and index on the back.
    for e in p:
        deck.add_note(genanki.Note(
            model=model,
            fields=[str(e["id"]), e["keyword"], e["kanji"]],
            sort_field=e["id"],
        ))
    genanki.Package(deck).write_to_file(apkg_path)
	#!/usr/bin/env python3
	# By AlexApps99
	# This Python script will generate an Anki deck from a PDF file of Remembering the Kanji 1, 6th edition.
	# To use it, just use the command "python3 create_rtk_deck.py RTK.pdf RTK.apkg"
	# Feel free to customize/modify it however you want, public domain code.
	# There is unused functionality for getting stories/text too.
	# If you have any questions on how to use this feel free to contact me.
	# Dependencies:
	# - genanki
	# - beautifulsoup4
	# - Poppler (for pdftohtml command)
	import unicodedata
	from bs4 import BeautifulSoup

	def is_kanji(text):
	    """Return True if *text* is a single CJK or Kangxi-radical character.

	    The decision is based on the character's Unicode name: names starting
	    with "CJK " or "KANGXI " count as kanji.

	    Anything that is not exactly one character (empty string, longer string,
	    non-string) and any character without a Unicode name (control characters,
	    private-use or unassigned code points) yields False instead of raising.
	    """
	    try:
	        # The "" default suppresses the ValueError that unicodedata.name()
	        # raises for characters that have no name.
	        name = unicodedata.name(text, "")
	    except TypeError:
	        # Raised when *text* is not a one-character string.
	        return False
	    return name.startswith(("CJK ", "KANGXI "))

	# Values of the <text> "font" attribute (as strings) whose content parse()
	# appends to an entry's "text" field.
	# NOTE(review): presumably specific to the pdftohtml output of the targeted
	# PDF edition - confirm before reusing with another file.
	text_fonts = ["6", "7", "11", "12", "16", "25", "28", "29", "33"]
	# Values of the <text> "font" attribute (as strings) whose content parse()
	# appends to an entry's "annotation" field.
	annotation_fonts = ["9", "10", "17", "18", "21", "24", "32"]

	def parse(f):
	    """Extract the kanji entries from pdftohtml XML output.

	    *f* is the XML markup (an open file or a string) handed to BeautifulSoup.
	    Every ``<text>`` element is classified by its ``font`` attribute:

	    - font "3" holding a decimal number starts a new entry and sets "id";
	    - font "4" or "26" sets "keyword";
	    - font "5" holding a single kanji sets "kanji";
	    - fonts listed in ``text_fonts`` are appended, space separated, to "text";
	    - fonts listed in ``annotation_fonts`` are appended to "annotation".

	    Elements seen before the first entry number are ignored, as are elements
	    whose font matches none of the above.

	    Returns a list of dicts in document order. Every dict has an "id"; the
	    other keys are present only if matching elements were found. An input
	    without any entry yields an empty list.
	    """
	    soup = BeautifulSoup(f, "xml")
	    parsed = []
	    current = {}
	    for node in soup.find_all("text"):
	        content = node.get_text().strip()
	        # .get() tolerates a <text> element that has no font attribute.
	        font = node.get("font")
	        if font == "3" and content.isdecimal():
	            if current:
	                parsed.append(current)
	            current = {"id": int(content)}
	            continue
	        if "id" not in current:
	            # Nothing can be attached before the first entry number.
	            continue
	        if font in ("4", "26"):
	            current["keyword"] = content
	        elif font == "5" and is_kanji(content):
	            current["kanji"] = content
	        elif font in text_fonts:
	            if "text" in current:
	                current["text"] += " " + content
	            else:
	                current["text"] = content
	        elif font in annotation_fonts:
	            if "annotation" in current:
	                current["annotation"] += " " + content
	            else:
	                current["annotation"] = content
	    if current:
	        # Flush the last entry; skipped when no entry was ever started.
	        parsed.append(current)

	    # Some kanji are not formatted properly and need to be fixed manually.
	    # Keys are 0-based positions in `parsed`; positions beyond the end of a
	    # shorter (partial) document are skipped rather than raising IndexError.
	    manual_kanji = {
	        128: "\u55c5",
	        307: "\u55bb",
	        679: "\u60e7",
	        1010: "\u41f3",
	        1393: "\u9699",
	        2004: "\u540e",
	    }
	    for index, kanji in manual_kanji.items():
	        if index < len(parsed):
	            # Only fill the gap; a kanji found by the parser wins.
	            parsed[index].setdefault("kanji", kanji)

	    return parsed


	if __name__ == "__main__":
	    # Script entry point: convert the PDF to XML with pdftohtml, parse the
	    # entries out of it and write them to an Anki package.
	    import subprocess
	    import sys

	    import genanki

	    if len(sys.argv) != 3:
	        print("Usage:", sys.argv[0], "[RTK1 6th edition].pdf [Anki package].apkg", file=sys.stderr)
	        # Exit instead of carrying on with missing or surplus arguments.
	        sys.exit(1)
	    pdf_path, apkg_path = sys.argv[1], sys.argv[2]

	    # Pages 20-401, quiet, single document, images ignored, XML output.
	    # check=True aborts with CalledProcessError if pdftohtml fails.
	    subprocess.run(
	        ["pdftohtml", "-f", "20", "-l", "401", "-q", "-s", "-i", "-xml", pdf_path, "rtk.xml"],
	        check=True,
	    )
	    # Read as UTF-8 explicitly so the kanji do not depend on the locale's
	    # default encoding (pdftohtml emits UTF-8 unless told otherwise).
	    with open("rtk.xml", encoding="utf-8") as f:
	        p = parse(f)

	    deck = genanki.Deck(
	        1196528216,
	        "RTK1",
	        "Generated from Remembering the Kanji 1, 6th edition",
	    )
	    model = genanki.Model(
	        1530649655,
	        "RTK1",
	        fields=[
	            {"name": "Index"},
	            {"name": "Keyword"},
	            {"name": "Kanji"},
	        ],
	        templates=[{
	            "name": "Recall",
	            "qfmt": "{{Keyword}}",
	            "afmt": "{{FrontSide}}<hr id=\"answer\">{{Kanji}}<br><small>{{Index}}</small>",
	        }],
	        css="",
	    )
	    # One note per parsed entry: keyword on the front, kanji and index on the back.
	    for e in p:
	        deck.add_note(genanki.Note(
	            model=model,
	            fields=[str(e["id"]), e["keyword"], e["kanji"]],
	            sort_field=e["id"],
	        ))
	    genanki.Package(deck).write_to_file(apkg_path)