#!/usr/bin/env python3
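"""
Scrape the sentence pages of goethe-verlag.com/book2 for one language pair,
write the sentence pairs to a CSV file, download the target-language audio,
and package everything into an Anki deck with genanki.

Usage (the script name here is illustrative; the two language codes are read
from argv and upper-cased before building the URLs):

    ./book2_sentences.py en de
"""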
import requests
from bs4 import BeautifulSoup
import sys
import genanki
import glob
import shutil
import os.path
import re
import hashlib
import struct

def writeCsvLine(f, t1, t2):
    f.write("\"{}\",\"{}\"\n".format(t1, t2))

origin_language = sys.argv[1].upper()
target_language = sys.argv[2].upper()
url = "https://www.goethe-verlag.com/book2"
target_url = f"{url}/{origin_language}/{origin_language}{target_language}/{origin_language}{target_language}"

# zero-pad the lesson number to three digits, e.g. 7 -> "007"
def pad_number(n):
    if n < 10:
        return "00" + str(n)
    elif n < 100:
        return "0" + str(n)
    else:
        return str(n)

my_model = genanki.Model(
    1091735104,
    "50Languages_Import",
    fields=[
        {"name": "L2"},
        {"name": "L1"},
        {"name": "Audio"},  # audio recording of the target-language sentence
    ],
    templates=[
        {
            "name": "Card 1",
            "qfmt": "{{L2}}",  # front of the card: the target-language sentence
            "afmt": '{{FrontSide}}<hr id="answer">{{L1}}<br>{{Audio}}',
        },
    ],
    css=""".card {
  font-family: arial;
  font-size: 20px;
  text-align: center;
  color: black;
  background-color: white;
}
.card1 { background-color: #FFFFFF; }
.card2 { background-color: #FFFFFF; }""",
)
# unique id for deck based on language pair and type -- probably overkill
h = hashlib.sha256()
h.update(origin_language.encode())
h.update(target_language.encode())
h.update(b'sentences')
my_deck = genanki.Deck(
    struct.unpack("<L", h.digest()[0:4])[0],
    f"Book2 {origin_language}-{target_language} (sentences)",
)
MIN_LESSON = 3 # 2 is the index page
MAX_LESSON = 102 # 103 is the youtube video
history = {}
f = open(origin_language+target_language+".csv", "wt")
writeCsvLine(f, "Target language", "Native Language")
for i in range(MIN_LESSON, MAX_LESSON + 1):
    r = requests.get(f"{target_url}{pad_number(i)}.HTM")  # no slash, unlike vocab scraping
    soup = BeautifulSoup(r.content, "html.parser")
    # header sentences
    header_l1_sentences = [t.text for t in soup.find_all("span", {"class": "Stil36"})]
    header_l2_sentences = [t.text for t in soup.find_all("span", {"class": "Stil46"})]
    l2_audio = [t.find_all("source")[0]["src"] for t in soup.find_all("audio")]
    # last element is some text about Alzheimer, so keep only the first 18
    body_l1_sentences = [t.text.strip() for t in soup.find_all("div", {"class": "Stil35"})][:18]
    body_l2_sentences = [t.text.strip().split('\r\n\n')[1] for t in soup.find_all("div", {"class": "Stil45"})]
    l1_sentences = header_l1_sentences + body_l1_sentences
    l2_sentences = header_l2_sentences + body_l2_sentences
    for l1_s, l2_s, m in zip(l1_sentences, l2_sentences, l2_audio):
        l1_s = l1_s.strip()
        l2_s = l2_s.strip()
        # patch numbers -- hopefully this is sufficient
        if re.match(r"\d+ ?\[", l2_s):
            l2_s = l2_s.split('[', 1)[1].split(']')[0]
            l1_s = l1_s.split('[', 1)[1].split(']')[0]
        # avoid duplicates
        if l2_s in history:
            continue
        history[l2_s] = 1
        writeCsvLine(f, l2_s, l1_s)
        filename = f"sentence_{origin_language}{target_language}_" + m.split("/")[-1]
        if not os.path.isfile(filename):
            dl_file = requests.get(m, stream=True)
            print(m)
            with open(filename, "wb") as out_file:
                shutil.copyfileobj(dl_file.raw, out_file)
        my_note = genanki.Note(
            model=my_model, fields=[l2_s, l1_s, f"[sound:{filename}]"]
        )
        my_deck.add_note(my_note)
f.close()
my_package = genanki.Package(my_deck)
my_package.media_files = glob.glob(f"sentence_{origin_language}{target_language}_*.mp3")
my_package.write_to_file(f"book2_{origin_language}{target_language}_sentences.apkg")
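# Example outputs, assuming the script is run with the language codes EN and DE:
#   ENDE.csv                    -- CSV of target/native sentence pairs
#   sentence_ENDE_*.mp3         -- downloaded audio, one clip per sentence
#   book2_ENDE_sentences.apkg   -- Anki package with the audio files attached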