Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Get kanji stroke order diagram URLs from Wikimedia, using a text file of kanji (one kanji per line)
# -*- coding: utf_8 -*-
#
# Get kanji stroke order diagram URLs from Wikimedia, using a text file of kanji (one kanji per line)
#
# It can now also get SVG diagrams ("download"/inline) from Tangorin (diagrams similar to those on
# jisho.org, but they don't use JavaScript to show them)
# created around: 2017-01-16
#TODO: allow downloading
import os
import sys
import codecs
# Third-party dependencies are imported defensively so a missing package gives
# a readable hint instead of a traceback. Only ImportError is caught: any other
# failure while importing is a real bug and should surface unchanged.
try:
    import requests
except ImportError:
    print(" must install python package: requests")
    sys.exit(1)
try:
    from bs4 import BeautifulSoup
except ImportError:
    # The installable distribution providing ``bs4`` is "beautifulsoup4";
    # the old hint ("BeautifulSoup") pointed at the obsolete BS3 package.
    print(" must install python package: beautifulsoup4")
    sys.exit(1)
from collections import namedtuple
# --------------
# CONFIG
# --------------
# All Wikimedia Commons file pages share one shape; only the style suffix
# changes. The doubled braces keep a literal "{kanji}" placeholder in the
# resulting templates so it can be filled in later with str.format().
_WIKIMEDIA_FILE_PAGE = "https://commons.wikimedia.org/wiki/File:{{kanji}}-{suffix}.png"
WIKIMEDIA_STROKE_DIAGRAMS_JP = _WIKIMEDIA_FILE_PAGE.format(suffix="jbw")
WIKIMEDIA_STROKE_DIAGRAMS_BW = _WIKIMEDIA_FILE_PAGE.format(suffix="bw")
WIKIMEDIA_STROKE_DIAGRAMS_RED = _WIKIMEDIA_FILE_PAGE.format(suffix="red")
# Style keys for the templates above, listed in order of preference.
DIAGRAM_STYLES = ("jp", "bw", "red")
# Tangorin kanji lookup page (SVG diagrams).
TANGORIN_STROKE_DIAGRAMS = "http://tangorin.com/dict.php?dict=kanji&s={kanji}"
# --------------
# END CONFIG
# --------------
# Command line handling: the first argument is the input file, the remaining
# ones are option flags. Errors exit with a non-zero status so that callers
# (shell scripts, CI) can tell a failed run from a successful one.
if len(sys.argv) < 2:
    print(" Must specify input")
    sys.exit(1)
args = sys.argv[1:]
INPUT_FILE = args[0]
if "--help" in args:
    print (" get kanji diagrams for stroke order. INPUT_FILE [options, optional]")
    print(" --only-img, --only-imgs get only pngs (Wikimedia)")
    print(" --only-svg get only svgs (Tangorin)")
    # Asking for help is a complete request on its own: stop here instead of
    # falling through (previously "--help" as the first argument was then
    # treated as the input file name and reported as missing).
    sys.exit(0)
# isfile() rather than exists(): a directory cannot be read as a kanji list.
if not os.path.isfile(INPUT_FILE):
    print (" Input file doesnt exist")
    sys.exit(1)
ONLY_SVG = "--only-svg" in args
ONLY_IMG = "--only-img" in args or "--only-imgs" in args
if ONLY_SVG and ONLY_IMG:
    print (" Please, only one. We didn't want to think about merging both together;")
    print (" just run the script twice.")
    sys.exit(1)
def get_diagram(kanji, diagram_type):
    """Look up the stroke order image linked from a Wikimedia Commons file page.

    param:kanji str  the character substituted into the page URL
    param:diagram_type str  one of "jp", "bw" or "red" (the entries of
        DIAGRAM_STYLES); the older spelling "wb" is still accepted for "bw"

    return: the ``href`` of the first link inside the element with id "file",
        or False when the style is unknown or the page has no such link.
    """
    if diagram_type == "jp":
        url_template = WIKIMEDIA_STROKE_DIAGRAMS_JP
    elif diagram_type in ("bw", "wb"):
        # DIAGRAM_STYLES spells this style "bw"; the function used to test
        # only for "wb", so the black-and-white diagram was never fetched.
        url_template = WIKIMEDIA_STROKE_DIAGRAMS_BW
    elif diagram_type == "red":
        url_template = WIKIMEDIA_STROKE_DIAGRAMS_RED
    else:
        return False

    r = requests.get(url_template.format(kanji=kanji))
    # Parse once and reuse the result instead of building the tree twice.
    file_box = BeautifulSoup(r.text).find(id="file")
    try:
        return file_box.a['href']
    except (AttributeError, TypeError, KeyError):
        # AttributeError: no element with id "file" (find() returned None)
        # TypeError:      the element contains no <a> tag (.a is None)
        # KeyError:       the <a> tag has no href attribute
        return False
def get_diagram_svg(kanji):
    """Fetch the Tangorin page for a kanji and return its stroke diagram markup.

    param:kanji str  the character substituted into TANGORIN_STROKE_DIAGRAMS

    return: the children of the first element with class "k-sod", each
        converted with str() and concatenated into one string, or False when
        the page has no such element.
    """
    r = requests.get(TANGORIN_STROKE_DIAGRAMS.format(kanji=kanji))
    # Parse once; the page used to be parsed a second time just to re-read
    # the same element. The parser is deliberately left unspecified so the
    # serialized markup stays exactly what it was before.
    diagram_box = BeautifulSoup(r.text).find(class_="k-sod")
    if diagram_box is None:
        return False
    # str() turns each BeautifulSoup node back into its markup text.
    return "".join(str(item) for item in diagram_box.contents)
# -----------
# Start!
# -----------
# Exactly one mode must be active: it decides the tuple field, the header
# column and the output file names. Without this guard a run with neither
# flag only failed at the very end with a NameError on final_output_file.
if ONLY_IMG == ONLY_SVG:
    print(" Must specify exactly one of --only-img or --only-svg")
    sys.exit(1)

if ONLY_IMG:
    final_text_diagram = "diagram_url"
    output_prefix = "diagrams-img-"
else:
    final_text_diagram = "diagram_svg"
    output_prefix = "diagrams-svg-"
Kanji = namedtuple('Kanji', ['kanji', final_text_diagram])


def _fetch_diagram(kanji):
    """Return the diagram for *kanji* in the active mode, or False if none was found."""
    if ONLY_SVG:
        return get_diagram_svg(kanji)
    # Try each Wikimedia style in order of preference; the first hit wins.
    for style in DIAGRAM_STYLES:
        result = get_diagram(kanji, style)
        if result:
            return result
    return False


def _output_path(prefix):
    """Return INPUT_FILE's path with *prefix* prepended to the file name only."""
    # Prefixing the whole path would corrupt it whenever the input is given
    # with a directory part (e.g. "data/kanji.txt").
    directory, filename = os.path.split(INPUT_FILE)
    return os.path.join(directory, prefix + filename)


# collect data
print(" Reading file... ")
KANJI_LIST = []  # holds Kanji tuples for every diagram that was found
KANJI_LIST_FAILED = []  # kanji for which no diagram could be retrieved
# utf-8-sig transparently drops a leading byte order mark if there is one.
with codecs.open(INPUT_FILE, 'r', encoding='utf-8-sig') as filee:
    text_list = filee.read().splitlines()
if not text_list:
    print (" empty file")
    sys.exit(1)

# Duplicates are skipped while walking the file in its original order, so the
# output keeps the input order and the printed number is the real line number
# (a set() used to scramble both). Lines are compared after cleaning, so
# entries differing only in whitespace or quotes count as the same kanji.
seen_kanji = set()
for real_line_number, line in enumerate(text_list, start=1):
    current_kanji = line.strip().strip('"')
    print (" processing line:", (real_line_number) )
    if not current_kanji or current_kanji in seen_kanji:
        continue
    seen_kanji.add(current_kanji)
    try:
        result = _fetch_diagram(current_kanji)
    except requests.RequestException as e:
        # One failed request should not throw away everything collected so
        # far; record the kanji as failed and carry on with the next one.
        print(" request failed:", e)
        result = False
    if result:
        KANJI_LIST.append(Kanji(current_kanji, result))
    else:
        KANJI_LIST_FAILED.append(current_kanji)

# Write the results as tab separated text: a header row, then one row per kanji.
# NOTE(review): SVG markup may itself contain tabs or newlines, which would
# break the one-row-per-kanji layout - confirm against real Tangorin output.
final_text = "kanji\t" + final_text_diagram + "\n" + "\n".join(
    "\t".join(item) for item in KANJI_LIST
)
final_output_file = _output_path(output_prefix)
with open(final_output_file, 'w', encoding='utf-8') as output_file:
    output_file.write(final_text)

if KANJI_LIST_FAILED:
    print (" There are some kanji that whose diagrams couldn't be found.")
    print (" They will be saved in another file.")
    final_text = "kanji\n" + "\n".join(KANJI_LIST_FAILED)
    final_output_file = _output_path(output_prefix + "failed-")
    with open(final_output_file, 'w', encoding='utf-8') as output_file:
        output_file.write(final_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment