Skip to content

Instantly share code, notes, and snippets.

@aquinzi
Created April 11, 2021 21:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aquinzi/8c9a133d3658c6d9ea62301fa93a4983 to your computer and use it in GitHub Desktop.
Save aquinzi/8c9a133d3658c6d9ea62301fa93a4983 to your computer and use it in GitHub Desktop.
Get kanji information from a list getting data from jisho.org
# -*- coding: utf_8 -*-
#
# Get kanji information from a list getting data from jisho.org (2017-01-12)
# Orders the final list by stroke count
#
# File must be: one kanji per line
#
# Frequency is the newspaper ranking x/2500 (low = popular; e.g. 日 is 1; in Tagaini it is 2501)
# FIX: better way to handle file names
# FIX: better way to handle the temporary file when almost finished (load and merge with current batch)
# TODO: more flexible sorting, add:
# - JLPT level
# - frequency
# - Heisig
# - grade
# TODO: allow sub-sorting (e.g. JLPT level -> stroke count)
# FIX: still saving duplicates when using the temporary file
# TODO: print something else instead of the line ID, or try to get them in order
# FIX: avoid saving an "empty" temporary file (one that only holds the heading)
import os
import sys
import codecs
try:
import requests
except Exception as e:
print(" must install python package: requests")
exit()
try:
from bs4 import BeautifulSoup
except Exception as e:
print(" must install python package: BeautifulSoup")
exit()
#for fun, use named tuples
from collections import namedtuple
# --------------
# CONFIG
# --------------
# Search URL template; {kanji} is filled in with the character being looked up
# ("%23kanji%20" is the URL-encoded form of "#kanji ").
JISHO_URL_KANJI_SEARCH = "http://jisho.org/search/%23kanji%20{kanji}"
# Separator placed between compound entries inside the single "compounds" field.
LINE_BREAK_CSV = "<br>" #<br>for anki
# Opening/closing marks substituted for jisho.org's 【】 in the compound entries.
FURIGANA = ('【','】') #jisho.org uses 【】, can change it to {} for example
# A partial (".part") file is written whenever the processed line number is a
# multiple of this value.
SAVE_EVERY_ITEMS = 25
# --------------
# END CONFIG
# --------------
def save_file(is_part=False):
    """Write the rows collected in ``KANJI_LIST`` to a tab-separated file.

    Reads the module-level ``KANJI_LIST`` (rows to write) and ``INPUT_FILE``
    (used to build the output name ``"complete-" + INPUT_FILE``).

    Args:
        is_part: When true, write the intermediate ``.part`` file instead of
            the final file. Nothing is written if there are no rows yet, so a
            header-only partial file never replaces a useful earlier one.
            When false, write the final file and delete the ``.part`` file.
    """
    # TODO: should be in another way but whatever
    header = 'kanji meaning_en strokes radicals components joyo grade jlpt frequency is_word compounds meaning_sp rtk rtk6 reading'.replace(" ", "\t")
    # Falsy entries are skipped, just in case one slipped into the list.
    rows = ["\t".join(item) for item in KANJI_LIST if item]

    # TODO: better naming!
    final_output_file = "complete-" + INPUT_FILE
    if is_part:
        if not rows:
            return
        final_output_file += ".part"

    # Always rewrite the whole file: KANJI_LIST holds every row gathered so
    # far, so appending would duplicate the rows of earlier saves.
    with open(final_output_file, 'w', encoding='utf-8') as output_file:
        output_file.write(header + "\n" + "\n".join(rows))

    if not is_part:
        # The final file supersedes the temporary one.
        part_file = final_output_file + ".part"
        if os.path.exists(part_file):
            os.remove(part_file)
def _strong_text(container, css_class, default):
    """Return the text of the <strong> inside the first ``css_class`` element.

    ``default`` is returned when the container, the element, its <strong>
    child, or the text itself is missing.
    """
    node = container.find(class_=css_class) if container is not None else None
    strong = node.strong if node is not None else None
    if strong is None or strong.string is None:
        return default
    # Plain str so the stored value does not keep the parsed page alive.
    return str(strong.string)


def _joined_text(node, separator):
    """Return the stripped text pieces of ``node`` joined by ``separator`` ('' if None)."""
    if node is None:
        return ""
    return node.get_text(separator, strip=True)


def get_kanji_info(kanji_search):
    """Fetch and parse the jisho.org kanji page for ``kanji_search``.

    Uses the module-level ``JISHO_URL_KANJI_SEARCH``, ``FURIGANA``,
    ``LINE_BREAK_CSV`` and ``Kanji``.

    Args:
        kanji_search: The kanji to look up.

    Returns:
        ``(Kanji, True)`` on success, or ``(False, reason)`` when the page
        could not be fetched or lacks required data. The function does not
        queue retries itself; callers decide what to do with a failure.
    """
    # query and get info; the timeout keeps one stalled request from hanging the run
    try:
        r = requests.get(JISHO_URL_KANJI_SEARCH.format(kanji=kanji_search), timeout=30)
    except requests.RequestException:
        return False, "request failed"
    if not r.text:
        print (" page not found for " + kanji_search)
        return False, "page not found"

    # start parsing kanji data (explicit parser: same result on every machine)
    soup = BeautifulSoup(r.text, "html.parser").find(id="result_area")
    if soup is None:
        return False, "result area"

    stroke_count = _strong_text(soup, "kanji-details__stroke_count", None)
    if stroke_count is None:
        return False, "stroke count"

    radicals = ""
    components = ""  # in csv
    for parts in soup.find_all(class_="radicals"):
        definition = parts.dl
        if definition is None or definition.dt is None or definition.dd is None:
            continue
        label = definition.dt.string
        if label == "Radical:":
            # Drop the radical's meaning so only the radical itself remains.
            span = definition.dd.span
            meaning = span.find(class_="radical_meaning") if span is not None else None
            if meaning is not None:
                meaning.extract()
            radicals = definition.dd.get_text(",", strip=True)
        elif label == "Parts:":
            components = definition.dd.get_text(",", strip=True)

    meaning_english = _joined_text(soup.find(class_="kanji-details__main-meanings"), ",")
    readings = _joined_text(soup.find(class_="kanji-details__main-readings"), ",")

    kanji_stats = soup.find(class_="kanji_stats")
    if not kanji_stats:
        return False, "kanji stats"

    # Missing stats get placeholder values instead of failing the kanji.
    taught_grade = _strong_text(kanji_stats, "grade", '"-"')
    jlpt_level = _strong_text(kanji_stats, "jlpt", '"N0"')
    frequency = _strong_text(kanji_stats, "frequency", '"9999"')

    grade_node = kanji_stats.find(class_="grade")
    is_joyo_kanji = grade_node is not None and any(
        text.startswith("Jōyō kanji") for text in grade_node.stripped_strings
    )

    compounds_on = list()
    compounds_kun = list()
    is_word_alone = False
    compounds_area = soup.find("div", class_='compounds')
    boxes = compounds_area.find_all("div") if compounds_area is not None else []
    heading_key = ""
    for box in boxes:
        heading = box.h2.string if box.h2 is not None else None
        if heading:
            if heading.startswith("On "):
                heading_key = "on"
            elif heading.startswith("Kun "):
                heading_key = "kun"
        if box.ul is None:
            continue
        for li in box.ul.stripped_strings:
            tmp = li.replace("\n"," ").replace(" ","")  # very lame way to do that
            tmp = tmp.replace("【",FURIGANA[0]).replace("】",FURIGANA[1] + " ")
            # A compound that is just the kanji itself means it is a word on its own.
            if tmp.startswith(kanji_search + FURIGANA[0]):
                is_word_alone = True
            if heading_key == "on":
                compounds_on.append(tmp)
            elif heading_key == "kun":
                compounds_kun.append(tmp)

    spanish_node = soup.find(class_="spanish_meanings")
    spanish_meanings = _joined_text(
        spanish_node.ul if spanish_node is not None else None, ", "
    )

    # dictionary indices: we only care about the two Heisig books
    wanted_books = {
        "Remembering The Kanji (James Heisig)": "RTK",
        "Remembering The Kanji, 6th edition (James Heisig)": "RTK6",
    }
    book_indices = {'RTK':'-','RTK6':'-'}
    indices_section = soup.find("section", id="indices")
    index_rows = indices_section.find_all("tr") if indices_section is not None else []
    for row in index_rows:
        # row.find_all("td") so we can use array for the two columns
        current_row = row.find_all("td")
        if len(current_row) < 2:
            continue  # e.g. a header row without <td> cells
        book_title = current_row[1].string
        book_id = current_row[0].string
        if book_title is None or book_id is None:
            continue
        book_key = wanted_books.get(book_title.strip())
        if book_key is not None:
            book_indices[book_key] = book_id.strip()

    return Kanji(
        kanji=kanji_search
        , meaning_en='"' + meaning_english + '"'
        , strokes=stroke_count
        , radicals=radicals
        , components='"' + components + '"'
        , joyo='yes' if is_joyo_kanji else 'no'
        , grade=taught_grade
        , jlpt=jlpt_level
        , frequency=frequency
        , compounds='"' + LINE_BREAK_CSV.join(compounds_on) + LINE_BREAK_CSV + LINE_BREAK_CSV.join(compounds_kun) + '"'
        , meaning_sp='"' + spanish_meanings + '"'
        , rtk6=book_indices["RTK6"]
        , rtk=book_indices["RTK"]
        , is_word='yes' if is_word_alone else 'no'
        , readings=readings
    ), True
# One row of the output table; the field order is the column order of the saved file.
Kanji = namedtuple(
    'Kanji',
    [
        'kanji',
        'meaning_en',
        'strokes',
        'radicals',
        'components',
        'joyo',
        'grade',
        'jlpt',
        'frequency',
        'is_word',
        'compounds',
        'meaning_sp',
        'rtk',
        'rtk6',
        'readings',
    ],
)
# parse args
# Usage: script input_file [--start=N]
#   INPUT_FILE      - first argument; must be an existing file.
#   START_FROM_LINE - value of --start= (1-based line to resume from),
#                     0 when the option is not given.
if len(sys.argv) < 2:
    print(" Must specify input")
    sys.exit()
if "-h" in sys.argv or "help" in sys.argv:
    print ("\n Specify input_file [options]")
    print ("\n options are: ")
    print (" --start= useful if there was some error and the script had to end")
    sys.exit()
args = sys.argv[1:]
INPUT_FILE = args[0]
START_FROM_LINE = 0
if not os.path.exists(INPUT_FILE):
    print (" Input file doesnt exist")
    sys.exit()
joined_args = " ".join(args)
if "--start=" in joined_args:
    # Joining first keeps "--start= 5" (value as a separate argument) working.
    start_tokens = joined_args.split("--start=", 1)[1].split()
    try:
        START_FROM_LINE = int(start_tokens[0])
        if START_FROM_LINE < 0:
            raise ValueError("negative start line")
    except (IndexError, ValueError):
        # Stop here: continuing with a non-numeric value would break the
        # later numeric comparisons, and a negative one would slice the
        # input from its end.
        print (" converting string to number in the argument failed.")
        sys.exit()
# -----------
# Start!
# -----------
def _load_partial_rows(part_path):
    """Read a saved ``.part`` file back into a list of Kanji tuples.

    Returns an empty list when the file does not exist. Columns are mapped by
    position, which is the order in which the rows were written. Rows with
    too few columns are skipped.
    """
    if not os.path.exists(part_path):
        return []
    with codecs.open(part_path, 'r', encoding='utf-8-sig') as part_file:
        saved_lines = part_file.read().splitlines()
    column_count = len(Kanji._fields)
    rows = []
    for saved_line in saved_lines[1:]:  # first one holds header
        columns = saved_line.split("\t")
        if len(columns) < column_count:
            continue
        rows.append(Kanji._make(columns[:column_count]))
    return rows


def _stroke_sort_key(row):
    """Sort key: numeric stroke count; rows without a usable count go last."""
    try:
        return int(row[2])
    except (TypeError, ValueError):
        return sys.maxsize


# collect data
print(" Reading file... ")
KANJI_LIST = list()  # holds tuples
with codecs.open(INPUT_FILE, 'r', encoding='utf-8-sig') as filee:
    text_list = filee.read().splitlines()
if not text_list:
    print (" empty file")
    sys.exit()
# Remove possible duplicates while keeping the file order, so the line
# numbers printed below (and given to --start=) are the same on every run.
text_list = tuple(dict.fromkeys(text_list))
print (" Total lines: " , len(text_list))

KANJI_RETRY = list()  # because sometimes it works and sometimes it doesnt....
part_file_path = "complete-" + INPUT_FILE + ".part"

if START_FROM_LINE > 0:
    START_FROM_LINE -= 1
    # must get old list (temporal) otherwise it gets rewritten
    KANJI_LIST.extend(_load_partial_rows(part_file_path))

# Kanji that already have a row; used to avoid fetching or saving duplicates.
seen_kanji = {row.kanji for row in KANJI_LIST}

real_line_number = START_FROM_LINE
for line_number, line in enumerate(text_list[START_FROM_LINE:]):
    current_kanji = line.strip().strip('"')
    real_line_number = START_FROM_LINE + line_number + 1
    print (" processing line:", (real_line_number) )
    if not current_kanji or current_kanji in seen_kanji:
        continue
    result, why_skip = get_kanji_info(current_kanji)
    if not result:
        print (" something wrong with the kanji on line: {line} - {why}".format(line=real_line_number, why=why_skip) )
        if current_kanji not in KANJI_RETRY:
            KANJI_RETRY.append(current_kanji)
        continue
    KANJI_LIST.append(result)
    seen_kanji.add(current_kanji)
    if real_line_number % SAVE_EVERY_ITEMS == 0:
        print(" .... saving temporal file on line " , real_line_number)
        save_file(is_part=True)

# go to missed kanji list
if KANJI_RETRY:
    print (" Retrying some kanji that got errors somewhere along the way")
    print(" .... saving temporal file on line " , real_line_number)
    save_file(is_part=True)
    # Iterate a de-duplicated snapshot so the list can change while we loop.
    for line_number, current_kanji in enumerate(list(dict.fromkeys(KANJI_RETRY)), start=1):
        print (" processing l:", line_number )
        if not current_kanji or current_kanji in seen_kanji:
            continue
        result, why_skip = get_kanji_info(current_kanji)
        if not result:
            print (" something wrong with the kanji on line: {line} - {why}".format(line=line_number, why=why_skip) )
            continue
        KANJI_LIST.append(result)
        seen_kanji.add(current_kanji)
        if line_number % SAVE_EVERY_ITEMS == 0:
            print(" .... saving temporal file on line " , line_number)
            save_file(is_part=True)

# must "merge" this new list with the old one (temporal); only rows that are
# not in memory yet are added, otherwise every saved row would be duplicated
for saved_row in _load_partial_rows(part_file_path):
    if saved_row.kanji not in seen_kanji:
        KANJI_LIST.append(saved_row)
        seen_kanji.add(saved_row.kanji)

# phew, done! now sort by stroke count
KANJI_LIST = sorted(KANJI_LIST, key=_stroke_sort_key)
# create CSV and save!
save_file()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment