-
-
Save WydD/957d1149fa503d9853cff969a332d9c2 to your computer and use it in GitHub Desktop.
jisho-pull.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import requests
import time

# Hiragana characters used as search prefixes: one series of paged queries
# is issued per character.
kana = "あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわがぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽっ"

# Load data from jisho.
# You can't go beyond page 100, so I'm splitting the dictionary with kana
# prefixes.
content = []
for k in kana:
    i = 1
    while True:
        # Progress output: current prefix and page number.
        print(k, i)
        response = requests.get(
            "http://jisho.org/api/v1/search/words",
            params={"keyword": "#common " + k + "*", "page": i},
            # Without a timeout a stalled connection would block forever.
            timeout=30,
        )
        # Fail loudly on an HTTP error instead of trying to parse its body.
        response.raise_for_status()
        data = response.json()["data"]
        # An empty page means this prefix is exhausted.
        if not data:
            break
        i += 1
        content += data
        # Pause between consecutive requests.
        time.sleep(1)
# Dump the raw data as a backup.
import pickle

# Use a context manager so the dump file is flushed and closed
# deterministically.
with open("jisho-common.dump", "wb") as dump_file:
    pickle.dump(content, dump_file)
def get_word(w):
    """Return the headword of an entry.

    Only the first item of the entry's "japanese" list is considered.

    Args:
        w: An entry dict with a non-empty "japanese" list of dicts.

    Returns:
        The first form's "word" value if that key is present, otherwise
        its "reading" value.

    Raises:
        KeyError: If the first form has neither "word" nor "reading".
    """
    first_form = w["japanese"][0]
    if "word" in first_form:
        return first_form["word"]
    return first_form["reading"]
# Deduplicate, as one word may show up under several kana prefixes (due to
# alternative forms of words).
# `all_words` records every headword already kept, so only the first
# occurrence of each word ends up in `common_words`.
all_words = set()
common_words = {}
for entry in content:
    word = get_word(entry)
    if word in all_words:
        continue
    all_words.add(word)
    common_words[word] = entry
import urllib.parse
import csv


def as_csv_entry(c):
    """Flatten one entry into a row for the CSV export.

    Args:
        c: An entry dict; the code reads its "japanese", "tags" and
            "senses" keys.

    Returns:
        A 7-item list:
        [wanikani, word, reading, sense 1, sense 2, sense 3, url].
        ``wanikani`` is the smallest number found in tags of the form
        ``wanikani<N>``, or 100 when the entry has no such tag. Each sense
        column is that sense's English definitions joined by ", ", or None
        when the entry has fewer senses. ``reading`` is None when the first
        form has no "reading" key.
    """
    w = get_word(c)

    tag_prefix = "wanikani"
    levels = [
        int(t[len(tag_prefix):]) for t in c["tags"] if t.startswith(tag_prefix)
    ]
    # 100 is the fallback level for entries without any wanikani tag.
    wanikani = min(levels, default=100)

    # Not every form is guaranteed to carry a reading; leave the cell empty
    # rather than aborting the whole export.
    reading = c["japanese"][0].get("reading")

    # Keep at most three senses and pad with None so the row always has the
    # same number of columns.
    senses = [", ".join(s["english_definitions"]) for s in c["senses"][:3]]
    senses += [None] * (3 - len(senses))

    url = "http://jisho.org/word/" + urllib.parse.quote_plus(w)
    return [wanikani, w, reading, *senses, url]
# Export as CSV.
# newline='' lets the csv module control line endings itself.
with open("common-words.csv", "w", encoding="utf8", newline='') as f:
    w = csv.writer(f)
    w.writerow(["wanikani", "word", "reading", "sense 1", "sense 2", "sense 3", "url"])
    # One row per deduplicated entry.
    w.writerows(as_csv_entry(e) for e in common_words.values())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment