-
-
Save aquinzi/8c9a133d3658c6d9ea62301fa93a4983 to your computer and use it in GitHub Desktop.
Get kanji information from a list getting data from jisho.org
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf_8 -*-
#
# Get kanji information for a list of kanji, using data from jisho.org (2017-01-12)
# Orders the final list by stroke count
#
# Input file must contain one kanji per line
#
# Frequency (newspaper) is x/2500 (low = popular; e.g. 日 is 1, in Tagaini it is 2501)
# FIX: better way to handle file names
# FIX: better way to handle the temporary file when almost finished (load and merge with current batch)
# TODO: more flexible sorting, add:
#   - JLPT level
#   - frequency
#   - Heisig
#   - grade
# TODO: allow sub-sorting (e.g. JLPT level -> stroke count)
# FIX: still saving duplicates when using the temporary file
#   print something else instead of the line ID, or try to get them in order
# FIX: avoid saving an "empty" temporary file (one that only holds the heading)
import os
import sys
import codecs

# Third-party packages: stop early with an install hint instead of a traceback.
try:
    import requests
except ImportError:
    print(" must install python package: requests")
    sys.exit(1)
try:
    from bs4 import BeautifulSoup
except ImportError:
    print(" must install python package: BeautifulSoup")
    sys.exit(1)

# for fun, use named tuples
from collections import namedtuple
# --------------
#  CONFIG
# --------------
# Kanji search page; "%23kanji%20" is the URL-encoded "#kanji " search prefix.
JISHO_URL_KANJI_SEARCH = "https://jisho.org/search/%23kanji%20{kanji}"
LINE_BREAK_CSV = "<br>"  # <br> for anki
FURIGANA = ('【', '】')  # jisho.org uses 【】, can change it to {} for example
SAVE_EVERY_ITEMS = 25  # the temporary ".part" file is written every N input lines
# --------------
#  END CONFIG
# --------------
def save_file(is_part=False):
    """Write the module-level KANJI_LIST as a tab-separated text file.

    The output name is "complete-" + INPUT_FILE; with ``is_part=True`` a
    ".part" suffix is added (temporary progress file). The target is always
    rewritten from scratch: a header row followed by one row per non-empty
    entry of KANJI_LIST. After a final (non-part) save, the ".part" file is
    deleted if it exists.

    Args:
        is_part: True to write the temporary ".part" file instead of the
            final output file.
    """
    # TODO: should be in another way but whatever
    header = 'kanji meaning_en strokes radicals components joyo grade jlpt frequency is_word compounds meaning_sp rtk rtk6 reading'.replace(" ", "\t")
    # empty entries are skipped, just in case
    rows = ["\t".join(item) for item in KANJI_LIST if item]
    final_text = header + "\n" + "\n".join(rows)

    # TODO: better naming!
    final_output_file = "complete-" + INPUT_FILE
    part_file = final_output_file + ".part"
    target = part_file if is_part else final_output_file

    with open(target, 'w', encoding='utf-8') as output_file:
        output_file.write(final_text)

    if not is_part and os.path.exists(part_file):
        # the final file is complete, so the temporary file is no longer needed
        os.remove(part_file)
_REQUEST_TIMEOUT_SECONDS = 30


def _strong_text(container, css_class, default):
    """Return the text of the <strong> tag inside the first *css_class* element.

    *default* is returned when the element, its <strong> child, or that
    child's single string is missing.
    """
    node = container.find(class_=css_class)
    strong = node.strong if node is not None else None
    if strong is None or strong.string is None:
        return default
    # plain str, so the value does not keep a reference to the parse tree
    return str(strong.string)


def _joined_text(container, css_class, separator):
    """Return the stripped text of the first *css_class* element, or "" if absent."""
    node = container.find(class_=css_class)
    if node is None:
        return ""
    return node.get_text(separator, strip=True)


def get_kanji_info(kanji_search):
    """Download and parse the jisho.org kanji page for *kanji_search*.

    Args:
        kanji_search: the kanji to look up (inserted into JISHO_URL_KANJI_SEARCH).

    Returns:
        ``(Kanji, True)`` on success, or ``(False, reason)`` where *reason* is
        a short string naming what could not be fetched or parsed. The caller
        decides whether to retry; this function does not touch KANJI_RETRY.
    """
    # query and get info
    try:
        r = requests.get(
            JISHO_URL_KANJI_SEARCH.format(kanji=kanji_search),
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException:
        # network problem or timeout: report it so the kanji can be retried
        return False, "request failed"

    if not r.ok or not r.text:
        print(" page not found for " + kanji_search)
        return False, "page not found"

    # start parsing kanji data
    soup = BeautifulSoup(r.text, "html.parser").find(id="result_area")
    if soup is None:
        return False, "result area"

    stroke_count = _strong_text(soup, "kanji-details__stroke_count", None)
    if stroke_count is None:
        return False, "stroke count"

    radicals = ""
    components = ""  # in csv
    for parts in soup.find_all(class_="radicals"):
        dl = parts.dl
        if dl is None or dl.dt is None or dl.dd is None:
            continue
        label = dl.dt.string
        if label == "Radical:":
            # remove the "radical_meaning" element so it is not part of the text
            span = dl.dd.span
            meaning = span.find(class_="radical_meaning") if span is not None else None
            if meaning is not None:
                meaning.extract()
            radicals = dl.dd.get_text(",", strip=True)
        elif label == "Parts:":
            components = dl.dd.get_text(",", strip=True)

    meaning_english = _joined_text(soup, "kanji-details__main-meanings", ",")
    readings = _joined_text(soup, "kanji-details__main-readings", ",")

    kanji_stats = soup.find(class_="kanji_stats")
    if kanji_stats is None:
        return False, "kanji stats"

    # missing statistics fall back to placeholder values instead of failing
    taught_grade = _strong_text(kanji_stats, "grade", '"-"')
    grade_node = kanji_stats.find(class_="grade")
    is_joyo_kanji = grade_node is not None and any(
        text.startswith("Jōyō kanji") for text in grade_node.stripped_strings
    )
    jlpt_level = _strong_text(kanji_stats, "jlpt", '"N0"')
    frequency = _strong_text(kanji_stats, "frequency", '"9999"')

    compounds_on = list()
    compounds_kun = list()
    is_word_alone = False
    compounds_area = soup.find("div", class_='compounds')
    boxes = compounds_area.find_all("div") if compounds_area is not None else []
    heading_key = ""
    for box in boxes:
        heading = box.h2.string if box.h2 is not None else None
        if heading is None or box.ul is None:
            continue
        if heading.startswith("On "):
            heading_key = "on"
        elif heading.startswith("Kun "):
            heading_key = "kun"
        for li in box.ul.stripped_strings:
            tmp = li.replace("\n", " ").replace(" ", "")  # very lame way to do that
            tmp = tmp.replace("【", FURIGANA[0]).replace("】", FURIGANA[1] + " ")
            # an entry made of the kanji alone, directly followed by its furigana
            if tmp.startswith(kanji_search + FURIGANA[0]):
                is_word_alone = True
            if heading_key == "on":
                compounds_on.append(tmp)
            elif heading_key == "kun":
                compounds_kun.append(tmp)

    spanish_node = soup.find(class_="spanish_meanings")
    spanish_list = spanish_node.ul if spanish_node is not None else None
    spanish_meanings = ""
    if spanish_list is not None:
        spanish_meanings = spanish_list.get_text(", ", strip=True)

    # dictionary indices: only the two Heisig books are kept, other rows are ignored
    heisig_books = {
        "Remembering The Kanji (James Heisig)": "RTK",
        "Remembering The Kanji, 6th edition (James Heisig)": "RTK6",
    }
    book_indices = {'RTK': '-', 'RTK6': '-'}  # easier to get the book_id
    indices = soup.find("section", id="indices")
    rows = indices.find_all("tr") if indices is not None else []
    for row in rows:
        # two columns: [0] index number, [1] book title
        current_row = row.find_all("td")
        if len(current_row) < 2:
            continue
        if current_row[0].string is None or current_row[1].string is None:
            continue
        book_key = heisig_books.get(current_row[1].string.strip())
        if book_key is not None:
            book_indices[book_key] = current_row[0].string.strip()

    return Kanji(
        kanji=kanji_search,
        meaning_en='"' + meaning_english + '"',
        strokes=stroke_count,
        radicals=radicals,
        components='"' + components + '"',
        joyo='yes' if is_joyo_kanji else 'no',
        grade=taught_grade,
        jlpt=jlpt_level,
        frequency=frequency,
        compounds='"' + LINE_BREAK_CSV.join(compounds_on) + LINE_BREAK_CSV + LINE_BREAK_CSV.join(compounds_kun) + '"',
        meaning_sp='"' + spanish_meanings + '"',
        rtk6=book_indices["RTK6"],
        rtk=book_indices["RTK"],
        is_word='yes' if is_word_alone else 'no',
        readings=readings,
    ), True
# One scraped kanji record; the field order is the column order of the saved rows.
Kanji = namedtuple(
    'Kanji',
    'kanji meaning_en strokes radicals components joyo grade jlpt frequency '
    'is_word compounds meaning_sp rtk rtk6 readings',
)
# parse args
if len(sys.argv) < 2:
    print(" Must specify input")
    sys.exit(1)
if "-h" in sys.argv or "help" in sys.argv:
    print("\n Specify input_file [options]")
    print("\n options are: ")
    print(" --start= useful if there was some error and the script had to end")
    sys.exit(0)

args = sys.argv[1:]
INPUT_FILE = args[0]
START_FROM_LINE = 0  # 1-based line to resume from; 0 means "from the beginning"
if not os.path.exists(INPUT_FILE):
    print(" Input file doesnt exist")
    sys.exit(1)

# The value is whatever follows "--start=" up to the next whitespace, so both
# "--start=5" and "--start= 5" are accepted.
joined_args = " ".join(args)
if "--start=" in joined_args:
    start_value = joined_args.split("--start=", 1)[1].split()
    try:
        START_FROM_LINE = int(start_value[0])
    except (IndexError, ValueError):
        # Stop here: continuing with a non-numeric value would fail later when
        # START_FROM_LINE is compared and used as a slice index.
        print(" converting string to number in the argument failed.")
        sys.exit(1)
    # a negative value would slice from the end of the list; treat it as 0
    START_FROM_LINE = max(START_FROM_LINE, 0)
# -----------
#  Start!
# -----------
# collect data
print(" Reading file... ")
KANJI_LIST = list()  # holds tuples
text_list = list()
with codecs.open(INPUT_FILE, 'r', encoding='utf-8-sig') as filee:
    text_list = filee.read().splitlines()
if not text_list:
    print(" empty file")
    sys.exit(1)

# Remove possible duplicates while keeping the file order, so that line
# numbers (and therefore --start=) refer to the same kanji on every run.
seen_lines = set()
unique_lines = list()
for raw_line in text_list:
    if raw_line not in seen_lines:
        seen_lines.add(raw_line)
        unique_lines.append(raw_line)
text_list = tuple(unique_lines)

print(" Total lines: ", len(text_list))
KANJI_RETRY = list()  # because sometimes it works and sometimes it doesnt....
# Stays empty: saved rows are parsed straight into KANJI_LIST below, so there
# are no raw lines left over that would need merging afterwards.
temporal_list_before = list()
if START_FROM_LINE > 0:
    START_FROM_LINE -= 1  # convert the 1-based line number to a 0-based index
    # must get the old list (temporary file), otherwise it gets rewritten
    part_path = "complete-" + INPUT_FILE + ".part"
    if os.path.exists(part_path):
        with codecs.open(part_path, 'r', encoding='utf-8-sig') as filee:
            saved_rows = filee.read().splitlines()[1:]  # first one holds header
        field_count = len(Kanji._fields)
        for line in saved_rows:
            tmp = line.split("\t")
            if len(tmp) < field_count:
                continue  # blank or truncated row
            # Columns are stored in Kanji field order, so fill positionally
            # (this keeps rtk / rtk6 in their own columns).
            KANJI_LIST.append(Kanji(*tmp[:field_count]))
# Main pass: look up every input line and save progress periodically.
for line_number, line in enumerate(text_list[START_FROM_LINE:]):
    current_kanji = line.strip().strip('"')
    real_line_number = START_FROM_LINE + line_number + 1
    print(" processing line:", real_line_number)

    if current_kanji:
        result, why_skip = get_kanji_info(current_kanji)
        if result:
            KANJI_LIST.append(result)
        else:
            print(" something wrong with the kanji on line: {line} - {why}".format(line=real_line_number, why=why_skip))
            # queue for the retry pass, once per kanji
            if current_kanji not in KANJI_RETRY:
                KANJI_RETRY.append(current_kanji)

    # Checked for every line (also blank or failed ones) so a save point is
    # never skipped. KANJI_LIST already holds any rows loaded when resuming,
    # so nothing else has to be merged before saving.
    if real_line_number % SAVE_EVERY_ITEMS == 0:
        print(" .... saving temporal file on line ", real_line_number)
        save_file(is_part=True)
# go to missed kanji list
if KANJI_RETRY:
    print(" Retrying some kanji that got errors somewhere along the way")
    print(" .... saving temporal file on line ", real_line_number)
    save_file(is_part=True)
    KANJI_RETRY_done = list()
    # Iterate over a snapshot, so anything appended to KANJI_RETRY while
    # retrying cannot extend this loop.
    for line_number, line in enumerate(list(KANJI_RETRY)):
        current_kanji = line
        print(" processing l:", (line_number + 1))
        if not current_kanji or current_kanji in KANJI_RETRY_done:
            continue
        result, why_skip = get_kanji_info(current_kanji)
        if not result:
            print(" something wrong with the kanji on line: {line} - {why}".format(line=line_number + 1, why=why_skip))
            continue
        KANJI_LIST.append(result)
        KANJI_RETRY_done.append(current_kanji)
        if line_number > 0 and line_number % SAVE_EVERY_ITEMS == 0:
            print(" .... saving temporal file on line ", line_number)
            save_file(is_part=True)
# must "merge" this new list with the old one (temporary file)
part_path = "complete-" + INPUT_FILE + ".part"
part_entries = list()
if os.path.exists(part_path):
    with codecs.open(part_path, 'r', encoding='utf-8-sig') as filee:
        temporal_list = filee.read().splitlines()[1:]  # first one holds header
    field_count = len(Kanji._fields)
    for line in temporal_list:
        tmp = line.split("\t")
        if len(tmp) < field_count:
            continue  # blank or truncated row
        # columns are stored in Kanji field order, so fill positionally
        part_entries.append(Kanji(*tmp[:field_count]))

# The temporary file is written from KANJI_LIST itself, so most of its rows are
# already in memory: keep only the first entry seen for each kanji.
seen_kanji = set()
merged_list = list()
for entry in KANJI_LIST + part_entries:
    if entry[0] in seen_kanji:
        continue
    seen_kanji.add(entry[0])
    merged_list.append(entry)

# phew, done! now sort by stroke count
KANJI_LIST = sorted(merged_list, key=lambda tup: int(tup[2]))
# create CSV and save!
save_file()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment