Skip to content

Instantly share code, notes, and snippets.

@AriaFallah
Last active September 19, 2022 02:15
Show Gist options
  • Save AriaFallah/abe55e998569e0bd2993b297162f1db2 to your computer and use it in GitHub Desktop.
Save AriaFallah/abe55e998569e0bd2993b297162f1db2 to your computer and use it in GitHub Desktop.
Spanish Anki Deck Scraping Code
// https://www.spanish.academy/blog/1000-most-common-spanish-words-for-beginners/
/**
 * Scrapes word/definition pairs from the current page's `.blog_content`
 * element and prints them as semicolon-separated rows, one row per line.
 *
 * The container's direct children are walked in document order:
 *   - `<h2>` starts a new section and clears the current subsection,
 *   - `<h3>` starts a new subsection,
 *   - `<ul>` contributes one row per child item, formatted as
 *     `word;definition;Section - Subsection;section_slug subsection_slug`.
 * Lists under a section whose slug contains "ready_to" are skipped.
 * List items that contain no em/en dash are skipped with a warning.
 *
 * NOTE(review): fields are not escaped, so a ";" inside a word or a
 * definition produces an extra column in that row.
 *
 * @returns {string[]} The generated rows (also logged, joined with "\n").
 * @throws {Error} If the page has no `.blog_content` element.
 */
function scrape() {
  // Em dash or en dash separating the word from its definition.
  const SEPARATOR = /[—–]/;
  // Sections whose slug contains this marker hold no vocabulary to export.
  const SKIPPED_SECTION_MARKER = "ready_to";
  // Placeholder used until a heading is seen; empty strings are dropped by
  // `filter(Boolean)` below and are safe to call string methods on.
  const NO_HEADING = { title: "", slug: "" };

  // Builds the display title and the slug for a heading's text. The text is
  // trimmed first and every whitespace run collapses to one "_", so a slug
  // never contains whitespace (slugs are later joined with spaces).
  const toHeading = (text) => {
    const title = text.trim();
    return { title, slug: title.toLowerCase().replace(/\s+/g, "_") };
  };

  // Splits an item's text at the FIRST dash only, so a definition that itself
  // contains a dash is kept whole. Returns null when there is no dash.
  const parseEntry = (text) => {
    const separatorIndex = text.search(SEPARATOR);
    if (separatorIndex === -1) {
      return null;
    }
    return {
      word: text.slice(0, separatorIndex).trim(),
      definition: text.slice(separatorIndex + 1).trim(),
    };
  };

  const container = document.querySelector(".blog_content");
  if (container === null) {
    throw new Error('scrape: no ".blog_content" element found on this page');
  }

  const rows = [];
  let section = NO_HEADING;
  let subsection = NO_HEADING;

  for (const child of container.children) {
    switch (child.tagName.toLowerCase()) {
      case "h2":
        section = toHeading(child.innerText);
        // A new section invalidates whatever subsection came before it.
        subsection = NO_HEADING;
        break;
      case "h3":
        subsection = toHeading(child.innerText);
        break;
      case "ul": {
        if (section.slug.includes(SKIPPED_SECTION_MARKER)) {
          break;
        }
        // Both columns are identical for every item of the same list.
        const titles = [section.title, subsection.title]
          .filter(Boolean)
          .join(" - ");
        const slugs = [section.slug, subsection.slug]
          .filter(Boolean)
          .join(" ");
        for (const item of child.children) {
          const entry = parseEntry(item.innerText);
          if (entry === null) {
            console.warn(
              `scrape: skipping item without a dash separator: "${item.innerText}"`
            );
            continue;
          }
          rows.push([entry.word, entry.definition, titles, slugs].join(";"));
        }
        break;
      }
      default:
        break;
    }
  }

  console.log(rows.join("\n"));
  return rows;
}
// Run the scraper as soon as this script is evaluated.
scrape();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment