Skip to content

Instantly share code, notes, and snippets.

@bemitc
Last active February 18, 2022 21:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bemitc/72c1e527238082013c55c6b3d7199fc0 to your computer and use it in GitHub Desktop.
Save bemitc/72c1e527238082013c55c6b3d7199fc0 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from lxml import html
import requests
from bs4 import BeautifulSoup
import sys
import itertools
import genanki
import glob
import shutil
import os.path
import re
import hashlib
import struct
def writeCsvLine(f, t1, t2):
    """Write one two-column CSV row ("t1","t2") to file object *f*.

    Both fields are double-quoted; embedded double quotes are escaped by
    doubling them (RFC 4180), so sentences containing quotes no longer
    produce a malformed CSV line.
    """
    q1 = t1.replace('"', '""')
    q2 = t2.replace('"', '""')
    f.write(f'"{q1}","{q2}"\n')
# Two-letter language codes come from the command line (e.g. "EM" "JA")
# and are upper-cased to match the site's URL scheme.
origin_language = sys.argv[1].upper()
target_language = sys.argv[2].upper()
url = "https://www.goethe-verlag.com/book2"
# Lesson pages live at e.g. .../EM/EMJA/EMJA003.HTM; the zero-padded
# lesson number and ".HTM" are appended later in the scraping loop.
target_url = f"{url}/{origin_language}/{origin_language}{target_language}/{origin_language}{target_language}"
def pad_number(n):
    """Return *n* as a string zero-padded to at least three digits.

    e.g. 7 -> "007", 42 -> "042", 102 -> "102". Used to build the
    lesson-page filenames (EMJA003.HTM ... EMJA102.HTM).
    """
    # str.zfill replaces the hand-rolled if/elif ladder.
    return str(n).zfill(3)
# Anki note model: front shows the target-language sentence (L2); the back
# adds the native-language sentence (L1) and the downloaded audio clip.
# The model id is a fixed arbitrary constant so re-runs update the same model.
my_model = genanki.Model(
1091735104,
"50Languages_Import",
fields=[
{"name": "L2"},
{"name": "L1"},
{"name": "Audio"}, # ADD THIS
],
templates=[
{
"name": "Card 1",
"qfmt": "{{L2}}", # AND THIS
"afmt": '{{FrontSide}}<hr id="answer">{{L1}}<br>{{Audio}}',
},
],
css=""".card {
font-family: arial;
font-size: 20px;
text-align: center;
color: black;
background-color: white;
}
.card1 { background-color: #FFFFFF; }
.card2 { background-color: #FFFFFF; }"""
)
# unique id for deck based on language pair and type -- probably overkill
# The first 4 bytes of the SHA-256 digest are unpacked as an unsigned
# little-endian int, giving a stable per-language-pair deck id.
h = hashlib.sha256()
h.update(origin_language.encode())
h.update(target_language.encode())
h.update(b'sentences')
my_deck = genanki.Deck(
struct.unpack("<L", h.digest()[0:4])[0], f"Book2 {origin_language}-{target_language} (sentences)"
)
MIN_LESSON = 3 # 2 is the index page
MAX_LESSON = 102 # 103 is the youtube video
# history: set of already-seen L2 sentences (dict used as a set) for de-duplication.
history = {}
# Also emit a plain CSV alongside the .apkg, e.g. "EMJA.csv".
f = open(origin_language+target_language+".csv", "wt")
writeCsvLine(f, "Target language", "Native Language")
# Scrape each lesson page, writing sentence pairs to the CSV and building
# Anki notes with their downloaded audio as we go.
for i in range(MIN_LESSON, MAX_LESSON + 1):
    r = requests.get(f"{target_url}{pad_number(i)}.HTM")  # no slash unlike vocab scraping
    soup = BeautifulSoup(r.content, "html.parser")
    # header
    header_l1_sentences = [t.text for t in soup.find_all("span", {"class": "Stil36"})]
    header_l2_sentences = [t.text for t in soup.find_all("span", {"class": "Stil46"})]
    l2_audio = [t.find_all("source")[0]["src"] for t in soup.find_all("audio")]
    body_l1_sentences = [t.text.strip() for t in soup.find_all("div", {"class": "Stil35"})][:18]  # last element is some text about Alzheimer
    body_l2_sentences = [t.text.strip().split('\r\n\n')[1] for t in soup.find_all("div", {"class": "Stil45"})]
    l1_sentences = header_l1_sentences + body_l1_sentences
    l2_sentences = header_l2_sentences + body_l2_sentences
    for l1_s, l2_s, m in zip(l1_sentences, l2_sentences, l2_audio):
        # strip() replaces the original lstrip()/rstrip() pair.
        l1_s = l1_s.strip()
        l2_s = l2_s.strip()
        # patch numbers -- hopefully this is sufficient
        # Lines like "12 [zwölf]" are reduced to the bracketed text.
        if re.match(r"\d{1,} ?\[", l2_s):
            l2_s = l2_s.split('[', 1)[1].split(']')[0]
            l1_s = l1_s.split('[', 1)[1].split(']')[0]
        # avoid duplicates
        if l2_s in history:
            continue
        history[l2_s] = 1
        writeCsvLine(f, l2_s, l1_s)
        # Cache audio locally; skip the download if the file already exists.
        filename = f"sentence_{origin_language}{target_language}_" + m.split("/")[-1]
        if not os.path.isfile(filename):
            dl_file = requests.get(m, stream=True)
            print(m)
            with open(filename, "wb") as out_file:
                shutil.copyfileobj(dl_file.raw, out_file)
        # NOTE(review): the gist renders this field as "[sound:(unknown)]" --
        # almost certainly extraction garbling of the f-string placeholder.
        # The [sound:...] tag must name the downloaded media file for Anki to
        # play it, so it is restored to reference `filename`.
        my_note = genanki.Note(
            model=my_model, fields=[l2_s, l1_s, f"[sound:{filename}]"]
        )
        my_deck.add_note(my_note)
f.close()
# Bundle the deck plus every downloaded mp3 into an .apkg for Anki import.
my_package = genanki.Package(my_deck)
my_package.media_files = [m for m in glob.glob(f"sentence_{origin_language}{target_language}_*.mp3")]
my_package.write_to_file(f"book2_{origin_language}{target_language}_sentences.apkg")
@dennisAlexander
Copy link

great script! do i just input the http request parameters between 'target_url's curly braces and run it?

@bemitc
Copy link
Author

bemitc commented Oct 12, 2021

If you need additional HTTP parameters, you can enter them after the URL in the requests.get call. You do need to provide the 2-character language codes for the origin and target language, which are used to generate a path (e.g.: https://www.goethe-verlag.com/book2/EM/EMJA/EMJA003.HTM). Admittedly, I haven't touched this since May; it was mostly written because I don't like production decks or isolated vocabulary decks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment