Get kanji stroke order diagram URLs from Wikimedia, using a text file with kanji (one kanji per line)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf_8 -*- | |
# | |
# Get kanji stroke order diagram URLs from Wikimedia, using a text file with kanji (one kanji per line).
#
# It can also get SVG diagrams ("download"/inline) from Tangorin (diagrams similar to the ones on
# jisho.org, but Tangorin doesn't use JavaScript to show them).
# created around: 2017-01-16
# TODO: allow downloading
import os | |
import sys | |
import codecs | |
try: | |
import requests | |
except Exception as e: | |
print(" must install python package: requests") | |
exit() | |
try: | |
from bs4 import BeautifulSoup | |
except Exception as e: | |
print(" must install python package: BeautifulSoup") | |
exit() | |
from collections import namedtuple | |
# --------------
# CONFIG
# --------------
# Wikimedia Commons "File:" page URL templates, one per diagram style.
WIKIMEDIA_STROKE_DIAGRAMS_JP = "https://commons.wikimedia.org/wiki/File:{kanji}-jbw.png"
WIKIMEDIA_STROKE_DIAGRAMS_BW = "https://commons.wikimedia.org/wiki/File:{kanji}-bw.png"
WIKIMEDIA_STROKE_DIAGRAMS_RED = "https://commons.wikimedia.org/wiki/File:{kanji}-red.png"
DIAGRAM_STYLES = ("jp", "bw", "red")  # styles tried for each kanji, in order of preference
TANGORIN_STROKE_DIAGRAMS = "http://tangorin.com/dict.php?dict=kanji&s={kanji}"  # SVG
# --------------
# END CONFIG
# --------------

# Command line: INPUT_FILE comes first, followed by exactly one mode option.
if len(sys.argv) < 2:
    print(" Must specify input")
    sys.exit(1)

args = sys.argv[1:]
INPUT_FILE = args[0]

if "--help" in args:
    print(" get kanji diagrams for stroke order. INPUT_FILE [options, optional]")
    print(" --only-img, --only-imgs get only pngs (Wikimedia)")
    print(" --only-svg get only svgs (Tangorin)")
    # Asking for help is a complete request: stop here instead of going on
    # to treat "--help" as the input file or to start fetching diagrams.
    sys.exit(0)

if not os.path.isfile(INPUT_FILE):
    print(" Input file doesnt exist")
    sys.exit(1)

ONLY_SVG = "--only-svg" in args
ONLY_IMG = "--only-img" in args or "--only-imgs" in args

if ONLY_SVG and ONLY_IMG:
    print(" Please, only one. We didn't want to think about merging both together;")
    print(" just run the script twice.")
    sys.exit(1)

if not (ONLY_SVG or ONLY_IMG):
    # Without a mode nothing can be fetched and no output file name can be
    # built, so refuse early instead of failing after the whole run.
    print(" Please, choose one mode: --only-img or --only-svg")
    sys.exit(1)
def get_diagram(kanji, diagram_type):
    '''
    Look up the Wikimedia Commons stroke order diagram of a kanji.

    param:kanji str kanji to look up
    param:diagram_type str diagram style: "jp", "bw" or "red"
                           (the older spelling "wb" is still accepted for "bw")
    return: the href of the first link inside the page element with id "file",
            or False when the style is unknown or the page has no such link
    '''
    if diagram_type == "jp":
        url_template = WIKIMEDIA_STROKE_DIAGRAMS_JP
    elif diagram_type in ("bw", "wb"):
        # DIAGRAM_STYLES spells this style "bw"; "wb" is kept so existing
        # callers that used the old spelling keep working.
        url_template = WIKIMEDIA_STROKE_DIAGRAMS_BW
    elif diagram_type == "red":
        url_template = WIKIMEDIA_STROKE_DIAGRAMS_RED
    else:
        return False

    r = requests.get(url_template.format(kanji=kanji))
    # Parse once, with an explicit parser so the result does not depend on
    # which optional parsers happen to be installed.
    file_box = BeautifulSoup(r.text, "html.parser").find(id="file")
    try:
        return file_box.a['href']
    except (AttributeError, TypeError, KeyError):
        # AttributeError: no element with id "file" (file_box is None)
        # TypeError: the element has no <a> child (file_box.a is None)
        # KeyError: the link has no href attribute
        return False
def get_diagram_svg(kanji):
    '''
    Fetch the inline stroke order diagram markup of a kanji from Tangorin.

    param:kanji str kanji to look up
    return: the children of the first element with class "k-sod", serialised
            and concatenated into one str, or False when the page has no such
            element
    '''
    r = requests.get(TANGORIN_STROKE_DIAGRAMS.format(kanji=kanji))
    # Parse once, with an explicit parser so the result does not depend on
    # which optional parsers happen to be installed.
    diagram_box = BeautifulSoup(r.text, "html.parser").find(class_="k-sod")
    if diagram_box is None:
        return False
    # str() turns each BeautifulSoup node back into its markup/text.
    return "".join(str(item) for item in diagram_box.contents)
# -----------
# Start!
# -----------
# collect data
print(" Reading file... ")
KANJI_LIST = list()  # holds Kanji tuples for every diagram that was found
text_list = list()
KANJI_LIST_FAILED = list()  # kanji for which no diagram could be found

# The second field is named after the selected mode; always defining the type
# keeps this section from failing when no mode flag was given.
Kanji = namedtuple('Kanji', ['kanji', 'diagram_svg' if ONLY_SVG else 'diagram_url'])

# 'utf-8-sig' drops a leading byte order mark if the file has one.
with open(INPUT_FILE, 'r', encoding='utf-8-sig') as filee:
    text_list = filee.read().splitlines()

if not text_list:
    print(" empty file")
    sys.exit(1)

# Duplicates are skipped after normalising each line, and the file order is
# kept, so the reported line numbers match the input file and the output
# order is the same on every run.
seen_kanji = set()
for real_line_number, line in enumerate(text_list, start=1):
    current_kanji = line.strip().strip('"')
    if not current_kanji or current_kanji in seen_kanji:
        continue
    seen_kanji.add(current_kanji)
    print(" processing line:", real_line_number)

    result = False
    if ONLY_IMG:
        # Try each style in order of preference and keep the first hit.
        for style in DIAGRAM_STYLES:
            result = get_diagram(current_kanji, style)
            if result:
                break
    elif ONLY_SVG:
        result = get_diagram_svg(current_kanji)

    if result:
        KANJI_LIST.append(Kanji(current_kanji, result))
    else:
        KANJI_LIST_FAILED.append(current_kanji)
# Write the results as tab-separated text: one file with the diagrams that
# were found and, if needed, a second file listing the kanji that failed.
# TODO: should be in another way but whatever
if ONLY_SVG:
    output_mode = "svg"
    final_text_diagram = "diagram_svg"
elif ONLY_IMG:
    output_mode = "img"
    final_text_diagram = "diagram_url"
else:
    # No mode means nothing was fetched and no file name can be built.
    sys.exit(" No output mode selected: use --only-img or --only-svg")

# Prefix only the file name, so an input given with a directory
# ("data/kanji.txt") produces "data/diagrams-img-kanji.txt" instead of a path
# into a directory that does not exist.
input_dir, input_name = os.path.split(INPUT_FILE)

# NOTE(review): a diagram value that contains a tab or a newline would break
# the one-row-per-kanji layout - confirm whether the SVG markup can.
final_text = (
    "kanji\t" + final_text_diagram + "\n"
    + "\n".join("\t".join(item) for item in KANJI_LIST)
)
final_output_file = os.path.join(
    input_dir, "diagrams-" + output_mode + "-" + input_name)
with open(final_output_file, 'w', encoding='utf-8') as output_file:
    output_file.write(final_text)

if KANJI_LIST_FAILED:
    print(" There are some kanji that whose diagrams couldn't be found.")
    print(" They will be saved in another file.")
    final_text = "kanji\n" + "\n".join(KANJI_LIST_FAILED)
    final_output_file = os.path.join(
        input_dir, "diagrams-" + output_mode + "-failed-" + input_name)
    with open(final_output_file, 'w', encoding='utf-8') as output_file:
        output_file.write(final_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment