Skip to content

Instantly share code, notes, and snippets.

@bemitc
Last active February 18, 2022 21:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bemitc/72c1e527238082013c55c6b3d7199fc0 to your computer and use it in GitHub Desktop.
Save bemitc/72c1e527238082013c55c6b3d7199fc0 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from lxml import html
import requests
from bs4 import BeautifulSoup
import sys
import itertools
import genanki
import glob
import shutil
import os.path
import re
import hashlib
import struct
def writeCsvLine(f, t1, t2):
    """Write one two-column CSV row ("t1","t2") to file object *f*.

    Both fields are double-quoted; embedded double quotes are escaped by
    doubling them (RFC 4180), so sentences containing quotes no longer
    produce a malformed CSV line.
    """
    q1 = t1.replace('"', '""')
    q2 = t2.replace('"', '""')
    f.write(f'"{q1}","{q2}"\n')
# Two-letter language codes come from the command line (e.g. "EM" "JA")
# and are upper-cased to match the site's URL scheme.
origin_language = sys.argv[1].upper()
target_language = sys.argv[2].upper()
url = "https://www.goethe-verlag.com/book2"
# Lesson pages live at e.g. .../EM/EMJA/EMJA003.HTM; the zero-padded
# lesson number and ".HTM" are appended later in the scraping loop.
target_url = f"{url}/{origin_language}/{origin_language}{target_language}/{origin_language}{target_language}"
def pad_number(n):
    """Return *n* as a string zero-padded to at least three digits.

    e.g. 7 -> "007", 42 -> "042", 102 -> "102". Used to build the
    lesson-page filenames (EMJA003.HTM ... EMJA102.HTM).
    """
    # str.zfill replaces the hand-rolled if/elif ladder.
    return str(n).zfill(3)
# Anki note model: front shows the target-language sentence (L2); the back
# adds the native-language sentence (L1) and the downloaded audio clip.
# The model id is a fixed arbitrary constant so re-runs update the same model.
my_model = genanki.Model(
1091735104,
"50Languages_Import",
fields=[
{"name": "L2"},
{"name": "L1"},
{"name": "Audio"}, # ADD THIS
],
templates=[
{
"name": "Card 1",
"qfmt": "{{L2}}", # AND THIS
"afmt": '{{FrontSide}}<hr id="answer">{{L1}}<br>{{Audio}}',
},
],
css=""".card {
font-family: arial;
font-size: 20px;
text-align: center;
color: black;
background-color: white;
}
.card1 { background-color: #FFFFFF; }
.card2 { background-color: #FFFFFF; }"""
)
# unique id for deck based on language pair and type -- probably overkill
# The first 4 bytes of the SHA-256 digest are unpacked as an unsigned
# little-endian int, giving a stable per-language-pair deck id.
h = hashlib.sha256()
h.update(origin_language.encode())
h.update(target_language.encode())
h.update(b'sentences')
my_deck = genanki.Deck(
struct.unpack("<L", h.digest()[0:4])[0], f"Book2 {origin_language}-{target_language} (sentences)"
)
MIN_LESSON = 3 # 2 is the index page
MAX_LESSON = 102 # 103 is the youtube video
# history: set of already-seen L2 sentences (dict used as a set) for de-duplication.
history = {}
# Also emit a plain CSV alongside the .apkg, e.g. "EMJA.csv".
f = open(origin_language+target_language+".csv", "wt")
writeCsvLine(f, "Target language", "Native Language")
# Scrape each lesson page, writing sentence pairs to the CSV and building
# Anki notes with their downloaded audio as we go.
for i in range(MIN_LESSON, MAX_LESSON + 1):
    r = requests.get(f"{target_url}{pad_number(i)}.HTM")  # no slash unlike vocab scraping
    soup = BeautifulSoup(r.content, "html.parser")
    # header
    header_l1_sentences = [t.text for t in soup.find_all("span", {"class": "Stil36"})]
    header_l2_sentences = [t.text for t in soup.find_all("span", {"class": "Stil46"})]
    l2_audio = [t.find_all("source")[0]["src"] for t in soup.find_all("audio")]
    body_l1_sentences = [t.text.strip() for t in soup.find_all("div", {"class": "Stil35"})][:18]  # last element is some text about Alzheimer
    body_l2_sentences = [t.text.strip().split('\r\n\n')[1] for t in soup.find_all("div", {"class": "Stil45"})]
    l1_sentences = header_l1_sentences + body_l1_sentences
    l2_sentences = header_l2_sentences + body_l2_sentences
    for l1_s, l2_s, m in zip(l1_sentences, l2_sentences, l2_audio):
        # strip() replaces the original lstrip()/rstrip() pair.
        l1_s = l1_s.strip()
        l2_s = l2_s.strip()
        # patch numbers -- hopefully this is sufficient
        # Lines like "12 [zwölf]" are reduced to the bracketed text.
        if re.match(r"\d{1,} ?\[", l2_s):
            l2_s = l2_s.split('[', 1)[1].split(']')[0]
            l1_s = l1_s.split('[', 1)[1].split(']')[0]
        # avoid duplicates
        if l2_s in history:
            continue
        history[l2_s] = 1
        writeCsvLine(f, l2_s, l1_s)
        # Cache audio locally; skip the download if the file already exists.
        filename = f"sentence_{origin_language}{target_language}_" + m.split("/")[-1]
        if not os.path.isfile(filename):
            dl_file = requests.get(m, stream=True)
            print(m)
            with open(filename, "wb") as out_file:
                shutil.copyfileobj(dl_file.raw, out_file)
        # NOTE(review): the gist renders this field as "[sound:(unknown)]" --
        # almost certainly extraction garbling of the f-string placeholder.
        # The [sound:...] tag must name the downloaded media file for Anki to
        # play it, so it is restored to reference `filename`.
        my_note = genanki.Note(
            model=my_model, fields=[l2_s, l1_s, f"[sound:{filename}]"]
        )
        my_deck.add_note(my_note)
f.close()
# Bundle the deck plus every downloaded mp3 into an .apkg for Anki import.
my_package = genanki.Package(my_deck)
my_package.media_files = [m for m in glob.glob(f"sentence_{origin_language}{target_language}_*.mp3")]
my_package.write_to_file(f"book2_{origin_language}{target_language}_sentences.apkg")
@dennisAlexander
Copy link

great script! do i just input the http request parameters between 'target_url's curly braces and run it?

@bemitc
Copy link
Author

bemitc commented Oct 12, 2021

If you need additional HTTP parameters, you can enter them after the URL in the requests.get call. You do need to provide the 2-character language codes for the origin and target language, which are used to generate a path (e.g.: https://www.goethe-verlag.com/book2/EM/EMJA/EMJA003.HTM). Admittedly, I haven't touched this since May; it was mostly written because I don't like production decks or isolated vocabulary decks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment