Skip to content

Instantly share code, notes, and snippets.

@Xifax
Created July 14, 2020 17:12
Show Gist options
  • Save Xifax/24630b6478ec2338f139079e2a278102 to your computer and use it in GitHub Desktop.
Save Xifax/24630b6478ec2338f139079e2a278102 to your computer and use it in GitHub Desktop.
Quick and dirty parser for onomatoproject. Compiles a list of JP onomatopoeia, dumps it to JSON and downloads the relevant images.
"""
Parse https://onomatoproject.com/list.html to json dict and download images.
"""
from typing import Dict, Tuple
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import json
BASE = "./images/"
def download_link(link, timeout=30) -> Tuple[str, Dict]:
    """Scrape a single entry page and save its illustration under ``BASE``.

    Args:
        link: URL of the entry page to fetch.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``(kana, data)`` tuple. ``kana`` is the text of the page's ``<h1>``.
        ``data`` is a dict with the keys ``"definition"`` (text of the
        ``div.termdefs`` element), ``"examples"`` (a list of dicts with
        ``"text"``, ``"ruby"`` and ``"translation"``) and ``"image"`` (the
        saved file name, or ``""`` when no image could be stored).

    Raises:
        requests.HTTPError: If the entry page responds with an error status.
        AttributeError: If the page has no ``<h1>`` or no ``div.termdefs``.
    """
    with requests.get(link, timeout=timeout) as response:
        response.raise_for_status()
        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed and emits a warning.
        html = BeautifulSoup(response.content, "html.parser")

    kana = html.h1.text
    definition = html.find("div", class_="termdefs").text

    examples = html.find_all("span", class_="standardtext")
    furigana = html.find_all("span", class_="furigana")
    translations = html.find_all("span", class_="english")
    # The three span lists are paired by position; zip stops at the shortest
    # one so a page with a missing span does not abort the whole run.
    sentences = [
        {
            "text": example.text,
            "ruby": [str(node) for node in reading.contents],
            "translation": translation.text,
        }
        for example, reading, translation in zip(examples, furigana, translations)
    ]

    return (
        kana,
        {
            "definition": definition,
            "examples": sentences,
            "image": _download_image(html, timeout),
        },
    )


def _download_image(html, timeout) -> str:
    """Store the page's first ``<img>`` under ``BASE``; return its file name or ``""``."""
    img = html.img
    src = img.get("src") if img is not None else None
    if not src:
        return ""

    image_name = src.split("/")[-1]
    # str.rstrip(".html") would strip any trailing '.', 'h', 't', 'm', 'l'
    # characters; only an actual ".html" suffix should be dropped.
    if image_name.endswith(".html"):
        image_name = image_name[: -len(".html")]
    if not image_name:
        return ""

    path = Path(BASE) / image_name
    if path.exists():
        # Already downloaded on an earlier run.
        return image_name

    # Fetch before touching the disk so a failed download never leaves an
    # empty file behind that later runs would mistake for a finished one.
    image_response = requests.get(
        f"https://onomatoproject.com/{src}", timeout=timeout
    )
    if not image_response.ok:
        return ""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(image_response.content)
    return image_name
if __name__ == "__main__":
    # Build {kana: entry data} for every page linked from the index, then
    # dump the whole mapping to data.json.
    results = {}
    with requests.get("https://onomatoproject.com/list.html", timeout=30) as r:
        r.raise_for_status()
        # Name the parser explicitly so the result does not depend on which
        # optional parser happens to be installed.
        soup = BeautifulSoup(r.content, "html.parser")
    links = soup.find_all("a", class_="three-col-url")
    # Exclude the last one (presumably not an entry page -- TODO confirm).
    hrefs = [anchor.get("href") for anchor in links][:-1]
    for href in hrefs:
        if not href:
            # An anchor without href would otherwise become ".../None".
            continue
        # Parse each page into a JSON entry + image
        url = f"https://onomatoproject.com/{href}"
        print(url)
        entry, data = download_link(url)
        results[entry] = data
    with open("data.json", "w", encoding="utf-8") as f:
        # The file is opened as UTF-8, so keep the Japanese text readable
        # instead of escaping every character to \uXXXX.
        json.dump(results, f, ensure_ascii=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment