Skip to content

Instantly share code, notes, and snippets.

@WydD
Created March 25, 2017 20:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save WydD/957d1149fa503d9853cff969a332d9c2 to your computer and use it in GitHub Desktop.
Save WydD/957d1149fa503d9853cff969a332d9c2 to your computer and use it in GitHub Desktop.
jisho-pull.py
import json
import requests
import time
kana = "あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわがぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽっ"
# Load data from jisho.
# The API cannot be paged beyond page 100, so the dictionary is split into
# one query per kana prefix to keep each result set under that limit.
content = []
for k in kana:
    i = 1
    while True:
        # Progress trace: current kana prefix and page number.
        print(k, i)
        response = requests.get(
            "http://jisho.org/api/v1/search/words",
            params={"keyword": "#common " + k + "*", "page": i},
            # Without a timeout a stalled connection would block the crawl forever.
            timeout=30,
        )
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        data = response.json()["data"]
        # An empty page means this prefix is exhausted; move on to the next kana.
        if not data:
            break
        i += 1
        content += data
        # Pause between successive pages to avoid hammering the server.
        time.sleep(1)
# Dump the raw data as a backup
import pickle
# Use a context manager so the dump file is flushed and closed deterministically.
with open("jisho-common.dump", "wb") as dump_file:
    pickle.dump(content, dump_file)
def get_word(w):
    """Return the headword of a result entry.

    Looks at the first element of ``w["japanese"]`` and returns its
    ``"word"`` value when that key is present, otherwise its ``"reading"``.
    """
    primary_form = w["japanese"][0]
    # The conditional expression only touches "reading" when "word" is absent.
    return primary_form["word"] if "word" in primary_form else primary_form["reading"]
# Deduplicate: one word may be returned under several kana prefixes (because
# of alternative forms of words). The first occurrence of each headword is
# kept; `all_words` records the headwords already seen.
all_words = set()
common_words = {}
for entry in content:
    headword = get_word(entry)
    if headword in all_words:
        continue
    all_words.add(headword)
    common_words[headword] = entry
import urllib.parse
import csv
def as_csv_entry(c):
    """Flatten one result entry into a CSV row.

    Returns a list ``[wanikani, word, reading, sense 1, sense 2, sense 3, url]``:

    * ``wanikani`` is the lowest number found among the entry's
      ``wanikaniNN`` tags, or 100 when it has no such tag.
    * ``word`` is the entry's headword as returned by ``get_word``.
    * ``reading`` is the reading of the first ``japanese`` form.
    * each sense column is the comma-joined ``english_definitions`` of the
      corresponding sense; the second and third are ``None`` when the entry
      has fewer senses.
    * ``url`` is the jisho.org word page for the URL-quoted headword.
    """
    w = get_word(c)
    # Match the tag by prefix so the slice below always strips exactly the
    # "wanikani" prefix (a substring match could slice the wrong characters).
    tag_prefix = "wanikani"
    levels = [int(t[len(tag_prefix):]) for t in c["tags"] if t.startswith(tag_prefix)]
    wanikani = min(levels, default=100)
    reading = c["japanese"][0]["reading"]
    senses = c["senses"]
    # The first sense is mandatory; the next two are optional columns.
    sense_columns = [", ".join(senses[0]["english_definitions"])]
    for index in (1, 2):
        if index < len(senses):
            sense_columns.append(", ".join(senses[index]["english_definitions"]))
        else:
            sense_columns.append(None)
    url = "http://jisho.org/word/" + urllib.parse.quote_plus(w)
    return [wanikani, w, reading, *sense_columns, url]
# Write the deduplicated entries to a CSV file: one header row, then one row
# per entry.
with open("common-words.csv", "w", encoding="utf8", newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["wanikani", "word", "reading", "sense 1", "sense 2", "sense 3", "url"])
    writer.writerows(as_csv_entry(entry) for entry in common_words.values())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment