aquinzi/kanji-info-jisho.py Secret

## kanji-info-jisho.py
# -*- coding: utf_8 -*-
#
# Get kanji information from a list getting data from jisho.org (2017-01-12)
# Orders the final list by stroke count
#
# File must be: one kanji per line
#
# the frequency (newspaper) x/2500 (low = popular; ex. 日 is 1, in tagaini is 2501)


# FIX: find a better way to build the output file names
# FIX: find a better way to handle the temporary file when almost finished (load and merge it with the current batch)
# TODO: make sorting more flexible; add sorting by:
#             - JLPT level
#             - frequency
#             - Heisig index
#             - grade
# TODO: allow sub-sorting (e.g. JLPT level -> stroke count)
# FIX: duplicates are still saved when a temporary file is used
# TODO: print something more useful than the line ID, or process the lines in file order
# FIX: avoid saving an "empty" temporary file (one that only contains the heading)

import os
import sys
import codecs

try:
	import requests
except Exception as e:
	print(" must install python package: requests")
	exit()
try:
	from bs4 import BeautifulSoup
except Exception as e:
	print(" must install python package: BeautifulSoup")
	exit()

#for fun, use named tuples
from collections import namedtuple

# --------------
#    CONFIG
# --------------

# Search URL template; {kanji} is filled in with the kanji being looked up
# (%23 is "#" and %20 is a space, so the query reads "#kanji <kanji>").
JISHO_URL_KANJI_SEARCH = "http://jisho.org/search/%23kanji%20{kanji}"
# Separator written between the compound entries stored in a single field
LINE_BREAK_CSV = "<br>" #<br>for anki
# Opening/closing marks that replace jisho's 【】 in the compounds column
FURIGANA = ('【','】') #jisho.org uses 【】, can change it to {} for example
# A partial (".part") file is saved when the line counter is a multiple of this
SAVE_EVERY_ITEMS = 25

# --------------
#   END CONFIG
# --------------

def save_file(is_part=False):
	"""Write KANJI_LIST as a tab-separated table named after INPUT_FILE.

	The output is "complete-<INPUT_FILE>"; with is_part=True it is
	"complete-<INPUT_FILE>.part" instead. The file is always rewritten from
	scratch with a heading line followed by one line per entry, so it is
	opened in 'w' mode (appending would repeat the heading and old rows).

	is_part -- True for an intermediate checkpoint, False for the final save.
	           A final save also deletes the ".part" file if there is one.
	"""
	# one line per non-empty entry; a missing (None) field becomes an empty cell
	rows = [
		"\t".join("" if field is None else str(field) for field in item)
		for item in KANJI_LIST
		if item #just in case
	]

	# a checkpoint without rows would only hold the heading: do not write it
	if is_part and not rows:
		return

	heading = 'kanji meaning_en strokes radicals components joyo grade jlpt frequency is_word compounds meaning_sp rtk rtk6 reading'.replace(" ", "\t")
	final_text = heading + "\n" + "\n".join(rows)

	# TODO: better naming!
	final_output_file = "complete-" + INPUT_FILE
	part_output_file = final_output_file + ".part"

	target_file = part_output_file if is_part else final_output_file
	with open(target_file, 'w', encoding='utf-8') as output_file:
		output_file.write(final_text)

	# the final file replaces the checkpoint: delete the temporary file
	if not is_part and os.path.exists(part_output_file):
		os.remove(part_output_file)


def _strong_text(parent, css_class, default):
	"""Return the text of the <strong> inside the first css_class element under parent, or default."""
	try:
		text = parent.find(class_=css_class).strong.string
	except AttributeError:
		# parent is None, the element is missing, or it holds no <strong>
		return default
	return default if text is None else text


def _joined_text(node, separator=","):
	"""Return the stripped text pieces of node joined by separator ("" when node is None)."""
	if node is None:
		return ""
	return node.get_text(separator, strip=True)


def get_kanji_info(kanji_search):
	"""Download the jisho.org page for kanji_search and parse it into a Kanji.

	Returns a 2-tuple so callers can always unpack it:
	  - (Kanji, True) on success
	  - (False, reason) on failure, where reason is a short text naming the
	    part that could not be obtained

	Optional page sections (grade, JLPT, frequency, compounds, Spanish
	meanings, book indices) fall back to placeholder values when absent.
	The caller decides whether a failed kanji is queued for a retry; this
	function does not touch KANJI_RETRY.
	"""
	# query and get info
	try:
		r = requests.get(JISHO_URL_KANJI_SEARCH.format(kanji=kanji_search), timeout=30)
	except requests.RequestException:
		return False, "request failed"

	# put it in a nice format
	# - dictionary indices:we only care:
	#    - Remembering The Kanji (common)
	#    - Remembering The Kanji, 6th edition (common)

	if not r.text:
		print (" page not found for " + kanji_search)
		return False, "page not found"

	#start parsing kanji data (explicit parser: same result on every machine)
	soup = BeautifulSoup(r.text, "html.parser").find(id="result_area")
	if soup is None:
		return False, "result area"

	stroke_count = _strong_text(soup, "kanji-details__stroke_count", None)
	if stroke_count is None:
		return False, "stroke count"

	radicals   = ""
	components = "" #in csv
	for parts in soup.find_all(class_="radicals"):
		definition_list = parts.dl
		if definition_list is None or definition_list.dt is None or definition_list.dd is None:
			continue
		label = definition_list.dt.string
		if label == "Radical:":
			# drop the English meaning of the radical, keep only the radical
			first_span = definition_list.dd.span
			meaning = first_span.find(class_="radical_meaning") if first_span is not None else None
			if meaning is not None:
				meaning.extract()
			radicals = definition_list.dd.get_text(",", strip=True)
		elif label == "Parts:":
			components = definition_list.dd.get_text(",", strip=True)

	meaning_english = _joined_text(soup.find(class_="kanji-details__main-meanings"))
	readings = _joined_text(soup.find(class_="kanji-details__main-readings"))

	kanji_stats = soup.find(class_="kanji_stats")
	if not kanji_stats:
		return False, "kanji stats"

	# placeholders are used when the page does not list the value
	taught_grade = _strong_text(kanji_stats, "grade", '"-"')
	jlpt_level   = _strong_text(kanji_stats, "jlpt", '"N0"')
	frequency    = _strong_text(kanji_stats, "frequency", '"9999"')

	grade_box = kanji_stats.find(class_="grade")
	is_joyo_kanji = grade_box is not None and any(
		text.startswith("Jōyō kanji") for text in grade_box.stripped_strings
	)

	compounds_on  = list()
	compounds_kun = list()
	is_word_alone = False

	compounds_box = soup.find("div", class_='compounds')
	compounds = compounds_box.find_all("div") if compounds_box is not None else []

	# heading_key is kept between boxes: a box whose heading is neither
	# "On ..." nor "Kun ..." is filed under the previous heading
	heading_key = ""
	for box in compounds:
		if box.h2 is None or box.ul is None:
			continue
		heading = box.h2.string or ""
		if heading.startswith("On "):
			heading_key = "on"
		elif heading.startswith("Kun "):
			heading_key = "kun"

		for li in box.ul.stripped_strings:
			tmp = li.replace("\n"," ").replace("   ","") #very lame way to do that
			tmp = tmp.replace("【",FURIGANA[0]).replace("】",FURIGANA[1] + " ")

			# the kanji directly followed by its reading is a word on its own
			if tmp.startswith(kanji_search + FURIGANA[0]):
				is_word_alone = True

			if heading_key == "on":
				compounds_on.append(tmp)
			elif heading_key == "kun":
				compounds_kun.append(tmp)

	spanish_box = soup.find(class_="spanish_meanings")
	spanish_meanings = _joined_text(spanish_box.ul if spanish_box is not None else None, ", ")

	# book title as shown on the page -> key in book_indices
	# (add other titles here to collect more indices)
	wanted_books = {
		"Remembering The Kanji (James Heisig)": "RTK",
		"Remembering The Kanji, 6th edition (James Heisig)": "RTK6",
	}
	book_indices = {'RTK':'-','RTK6':'-'} #easier to get the book_id
	indices_section = soup.find("section", id="indices")
	index_rows = indices_section.find_all("tr") if indices_section is not None else []
	for row in index_rows:
		# two columns: [0] is the index number, [1] is the book title
		current_row = row.find_all("td")
		if len(current_row) < 2:
			continue
		book_key = wanted_books.get(current_row[1].get_text(strip=True))
		if book_key is not None:
			book_indices[book_key] = current_row[0].get_text(strip=True)

	return Kanji(
			  kanji=kanji_search
			, meaning_en='"' + meaning_english + '"'
			, strokes=stroke_count
			, radicals=radicals
			, components='"' + components + '"'
			, joyo='yes' if is_joyo_kanji else 'no'
			, grade=taught_grade
			, jlpt=jlpt_level
			, frequency=frequency
			, compounds='"' + LINE_BREAK_CSV.join(compounds_on) + LINE_BREAK_CSV + LINE_BREAK_CSV.join(compounds_kun) + '"'
			, meaning_sp='"' + spanish_meanings + '"'
			, rtk6=book_indices["RTK6"]
			, rtk=book_indices["RTK"]
			, is_word='yes' if is_word_alone else 'no'
			, readings=readings
			), True


# One record per kanji; the field order is also the column order of the saved file
Kanji = namedtuple('Kanji', 'kanji meaning_en strokes radicals components joyo grade jlpt frequency is_word compounds meaning_sp rtk rtk6 readings')

# parse args
if len(sys.argv) < 2:
	print(" Must specify input")
	sys.exit()

if "-h" in sys.argv or "help" in sys.argv:
	print ("\n Specify input_file [options]")
	print ("\n options are: ")
	print ("   --start=      useful if there was some error and the script had to end")
	sys.exit()


args = sys.argv[1:]

INPUT_FILE  = args[0]
START_FROM_LINE = 0   # always an int: it is compared and used as a slice index later

if not os.path.exists(INPUT_FILE):
	print (" Input file doesnt exist")
	sys.exit()

# the arguments are joined so both "--start=5" and "--start= 5" are accepted
joined_args = " ".join(args)
if "--start=" in joined_args:
	start_value = joined_args.split("--start=", 1)[1].split()
	try:
		# negative values would slice from the end of the list: clamp to 0
		START_FROM_LINE = max(int(start_value[0]), 0)
	except (IndexError, ValueError):
		# missing or non numeric value: warn and start from the first line
		print (" converting string to number in the argument failed.")
		START_FROM_LINE = 0


# -----------
#  Start!
# -----------

# collect data
print(" Reading file... ")

KANJI_LIST = list() # holds tuples
text_list  = list()

with codecs.open(INPUT_FILE, 'r', encoding='utf-8-sig') as filee:
	text_list = filee.read().splitlines()

if not text_list:
	print (" empty file")
	sys.exit()

# remove possible duplicates but keep the order of the file: a set would
# shuffle the lines differently on every run and make --start= meaningless
text_list = tuple(dict.fromkeys(text_list))

print (" Total lines: " , len(text_list))
KANJI_RETRY = list() #because sometimes it works and sometimes it doesnt....
# stays empty: the rows of an old temporary file go straight into KANJI_LIST
# below, so there are no raw text lines left to merge later on
temporal_list_before = list()

# --start= is 1-based, the list index is 0-based
if START_FROM_LINE > 0:
	START_FROM_LINE -= 1

#must get old list (temporal) otherwise it gets rewritten
part_file_name = "complete-" + INPUT_FILE + ".part"
if os.path.exists(part_file_name):
	with codecs.open(part_file_name, 'r', encoding='utf-8-sig') as filee:
		saved_lines = filee.read().splitlines()[1:] #first one holds header

	for line in saved_lines:
		saved_fields = line.split("\t")
		# skip broken lines instead of failing on a missing column
		if len(saved_fields) != len(Kanji._fields):
			continue
		# columns were written in field order, so they are read back in
		# field order (naming them by hand had rtk and rtk6 swapped)
		KANJI_LIST.append(Kanji._make(saved_fields))


# main pass: look up every kanji, queue failures for a retry and save a
# checkpoint every SAVE_EVERY_ITEMS lines
for line_number, line in enumerate(text_list[START_FROM_LINE:]):
	current_kanji = line.strip().strip('"')
	real_line_number = START_FROM_LINE + line_number + 1
	print (" processing line:", (real_line_number) )
	if not current_kanji:
		continue

	result, why_skip = get_kanji_info(current_kanji)

	if result:
		KANJI_LIST.append(result)
	else:
		print (" something wrong with the kanji on line: {line} - {why}".format(line=real_line_number, why=why_skip) )
		# queue each failed kanji only once
		if current_kanji not in KANJI_RETRY:
			KANJI_RETRY.append(current_kanji)

	# checkpoint on every multiple, even when this kanji failed.
	# The rows of an old temporary file are already in KANJI_LIST as tuples,
	# so the raw text lines in temporal_list_before must not be added here.
	if real_line_number % SAVE_EVERY_ITEMS == 0:
		print(" .... saving temporal file on line " , real_line_number)
		save_file(is_part=True)

# go to missed kanji list
if KANJI_RETRY:
	print (" Retrying some kanji that got errors somewhere along the way")

	print(" .... saving temporal file on line " , real_line_number)
	save_file(is_part=True)

	# work on a snapshot without repeats: KANJI_RETRY itself may still grow
	# while the kanji are being looked up again
	KANJI_RETRY_original = list(dict.fromkeys(KANJI_RETRY))
	KANJI_RETRY_done = list()

	for line_number, line in enumerate(KANJI_RETRY_original):
		current_kanji = line
		retry_number = line_number + 1
		print (" processing l:", retry_number )
		if not current_kanji:
			continue

		result, why_skip = get_kanji_info(current_kanji)

		if not result:
			# second failure: give up on this kanji
			print (" something wrong with the kanji on line: {line} - {why}".format(line=retry_number, why=why_skip) )
			continue

		KANJI_LIST.append(result)
		KANJI_RETRY_done.append(current_kanji)

		if retry_number % SAVE_EVERY_ITEMS == 0:
			print(" .... saving temporal file on line " , retry_number)
			save_file(is_part=True)

#must "merge" this new list with the old one (temporal)
# Every checkpoint holds a full copy of KANJI_LIST, so most of its rows are
# already in memory: entries are keyed by kanji and only the first one is kept.
unique_entries = dict()
for entry in KANJI_LIST:
	if entry:
		unique_entries.setdefault(entry[0], entry)

part_file_name = "complete-" + INPUT_FILE + ".part"
if os.path.exists(part_file_name):
	with codecs.open(part_file_name, 'r', encoding='utf-8-sig') as filee:
		temporal_list = filee.read().splitlines()[1:] #first one holds header

	for line in temporal_list:
		saved_fields = line.split("\t")
		# skip broken lines instead of failing on a missing column
		if len(saved_fields) != len(Kanji._fields):
			continue
		# columns are read back in the order they were written (field order)
		unique_entries.setdefault(saved_fields[0], Kanji._make(saved_fields))


def _stroke_sort_key(entry):
	"""Return the stroke count of entry as an int; unreadable counts sort last."""
	try:
		return int(entry[2])
	except (IndexError, TypeError, ValueError):
		return sys.maxsize


# phew, done! now sort by stroke count

KANJI_LIST = sorted(unique_entries.values(), key=_stroke_sort_key)

# create CSV and save!
save_file()
	# -- coding: utf_8 --
	#
	# Get kanji information from a list getting data from jisho.org (2017-01-12)
	# Orders the final list by stroke count
	#
	# File must be: one kanji per line
	#
	# the frequency (newspaper) x/2500 (low = popular; ex. 日 is 1, in tagaini is 2501)


	# FIX: find a better way to build the output file names
	# FIX: find a better way to handle the temporary file when almost finished (load and merge it with the current batch)
	# TODO: make sorting more flexible; add sorting by:
	# - JLPT level
	# - frequency
	# - Heisig index
	# - grade
	# TODO: allow sub-sorting (e.g. JLPT level -> stroke count)
	# FIX: duplicates are still saved when a temporary file is used
	# TODO: print something more useful than the line ID, or process the lines in file order
	# FIX: avoid saving an "empty" temporary file (one that only contains the heading)

	import os
	import sys
	import codecs

	try:
	import requests
	except Exception as e:
	print(" must install python package: requests")
	exit()
	try:
	from bs4 import BeautifulSoup
	except Exception as e:
	print(" must install python package: BeautifulSoup")
	exit()

	#for fun, use named tuples
	from collections import namedtuple

	# --------------
	# CONFIG
	# --------------

	# Search URL template; {kanji} is filled in with the kanji being looked up
	# (%23 is "#" and %20 is a space, so the query reads "#kanji <kanji>").
	JISHO_URL_KANJI_SEARCH = "http://jisho.org/search/%23kanji%20{kanji}"
	# Separator written between the compound entries stored in a single field
	LINE_BREAK_CSV = "<br>" #<br>for anki
	# Opening/closing marks that replace jisho's 【】 in the compounds column
	FURIGANA = ('【','】') #jisho.org uses 【】, can change it to {} for example
	# A partial (".part") file is saved when the line counter is a multiple of this
	SAVE_EVERY_ITEMS = 25

	# --------------
	# END CONFIG
	# --------------

	def save_file(is_part=False):
		"""Write KANJI_LIST as a tab-separated table named after INPUT_FILE.

		The output is "complete-<INPUT_FILE>"; with is_part=True it is
		"complete-<INPUT_FILE>.part" instead. The file is always rewritten from
		scratch with a heading line followed by one line per entry, so it is
		opened in 'w' mode (appending would repeat the heading and old rows).

		is_part -- True for an intermediate checkpoint, False for the final save.
		           A final save also deletes the ".part" file if there is one.
		"""
		# one line per non-empty entry; a missing (None) field becomes an empty cell
		rows = [
			"\t".join("" if field is None else str(field) for field in item)
			for item in KANJI_LIST
			if item #just in case
		]

		# a checkpoint without rows would only hold the heading: do not write it
		if is_part and not rows:
			return

		heading = 'kanji meaning_en strokes radicals components joyo grade jlpt frequency is_word compounds meaning_sp rtk rtk6 reading'.replace(" ", "\t")
		final_text = heading + "\n" + "\n".join(rows)

		# TODO: better naming!
		final_output_file = "complete-" + INPUT_FILE
		part_output_file = final_output_file + ".part"

		target_file = part_output_file if is_part else final_output_file
		with open(target_file, 'w', encoding='utf-8') as output_file:
			output_file.write(final_text)

		# the final file replaces the checkpoint: delete the temporary file
		if not is_part and os.path.exists(part_output_file):
			os.remove(part_output_file)




	def _strong_text(parent, css_class, default):
		"""Return the text of the <strong> inside the first css_class element under parent, or default."""
		try:
			text = parent.find(class_=css_class).strong.string
		except AttributeError:
			# parent is None, the element is missing, or it holds no <strong>
			return default
		return default if text is None else text


	def _joined_text(node, separator=","):
		"""Return the stripped text pieces of node joined by separator ("" when node is None)."""
		if node is None:
			return ""
		return node.get_text(separator, strip=True)


	def get_kanji_info(kanji_search):
		"""Download the jisho.org page for kanji_search and parse it into a Kanji.

		Returns a 2-tuple so callers can always unpack it:
		  - (Kanji, True) on success
		  - (False, reason) on failure, where reason is a short text naming the
		    part that could not be obtained

		Optional page sections (grade, JLPT, frequency, compounds, Spanish
		meanings, book indices) fall back to placeholder values when absent.
		The caller decides whether a failed kanji is queued for a retry; this
		function does not touch KANJI_RETRY.
		"""
		# query and get info
		try:
			r = requests.get(JISHO_URL_KANJI_SEARCH.format(kanji=kanji_search), timeout=30)
		except requests.RequestException:
			return False, "request failed"

		# put it in a nice format
		# - dictionary indices:we only care:
		# - Remembering The Kanji (common)
		# - Remembering The Kanji, 6th edition (common)

		if not r.text:
			print (" page not found for " + kanji_search)
			return False, "page not found"

		#start parsing kanji data (explicit parser: same result on every machine)
		soup = BeautifulSoup(r.text, "html.parser").find(id="result_area")
		if soup is None:
			return False, "result area"

		stroke_count = _strong_text(soup, "kanji-details__stroke_count", None)
		if stroke_count is None:
			return False, "stroke count"

		radicals = ""
		components = "" #in csv
		for parts in soup.find_all(class_="radicals"):
			definition_list = parts.dl
			if definition_list is None or definition_list.dt is None or definition_list.dd is None:
				continue
			label = definition_list.dt.string
			if label == "Radical:":
				# drop the English meaning of the radical, keep only the radical
				first_span = definition_list.dd.span
				meaning = first_span.find(class_="radical_meaning") if first_span is not None else None
				if meaning is not None:
					meaning.extract()
				radicals = definition_list.dd.get_text(",", strip=True)
			elif label == "Parts:":
				components = definition_list.dd.get_text(",", strip=True)

		meaning_english = _joined_text(soup.find(class_="kanji-details__main-meanings"))
		readings = _joined_text(soup.find(class_="kanji-details__main-readings"))

		kanji_stats = soup.find(class_="kanji_stats")
		if not kanji_stats:
			return False, "kanji stats"

		# placeholders are used when the page does not list the value
		taught_grade = _strong_text(kanji_stats, "grade", '"-"')
		jlpt_level = _strong_text(kanji_stats, "jlpt", '"N0"')
		frequency = _strong_text(kanji_stats, "frequency", '"9999"')

		grade_box = kanji_stats.find(class_="grade")
		is_joyo_kanji = grade_box is not None and any(
			text.startswith("Jōyō kanji") for text in grade_box.stripped_strings
		)

		compounds_on = list()
		compounds_kun = list()
		is_word_alone = False

		compounds_box = soup.find("div", class_='compounds')
		compounds = compounds_box.find_all("div") if compounds_box is not None else []

		# heading_key is kept between boxes: a box whose heading is neither
		# "On ..." nor "Kun ..." is filed under the previous heading
		heading_key = ""
		for box in compounds:
			if box.h2 is None or box.ul is None:
				continue
			heading = box.h2.string or ""
			if heading.startswith("On "):
				heading_key = "on"
			elif heading.startswith("Kun "):
				heading_key = "kun"

			for li in box.ul.stripped_strings:
				# only runs of three spaces are removed; removing every single
				# space would glue the words of the definitions together
				tmp = li.replace("\n"," ").replace("   ","") #very lame way to do that
				tmp = tmp.replace("【",FURIGANA[0]).replace("】",FURIGANA[1] + " ")

				# the kanji directly followed by its reading is a word on its own
				if tmp.startswith(kanji_search + FURIGANA[0]):
					is_word_alone = True

				if heading_key == "on":
					compounds_on.append(tmp)
				elif heading_key == "kun":
					compounds_kun.append(tmp)

		spanish_box = soup.find(class_="spanish_meanings")
		spanish_meanings = _joined_text(spanish_box.ul if spanish_box is not None else None, ", ")

		# book title as shown on the page -> key in book_indices
		# (add other titles here to collect more indices)
		wanted_books = {
			"Remembering The Kanji (James Heisig)": "RTK",
			"Remembering The Kanji, 6th edition (James Heisig)": "RTK6",
		}
		book_indices = {'RTK':'-','RTK6':'-'} #easier to get the book_id
		indices_section = soup.find("section", id="indices")
		index_rows = indices_section.find_all("tr") if indices_section is not None else []
		for row in index_rows:
			# two columns: [0] is the index number, [1] is the book title
			current_row = row.find_all("td")
			if len(current_row) < 2:
				continue
			book_key = wanted_books.get(current_row[1].get_text(strip=True))
			if book_key is not None:
				book_indices[book_key] = current_row[0].get_text(strip=True)

		return Kanji(
				  kanji=kanji_search
				, meaning_en='"' + meaning_english + '"'
				, strokes=stroke_count
				, radicals=radicals
				, components='"' + components + '"'
				, joyo='yes' if is_joyo_kanji else 'no'
				, grade=taught_grade
				, jlpt=jlpt_level
				, frequency=frequency
				, compounds='"' + LINE_BREAK_CSV.join(compounds_on) + LINE_BREAK_CSV + LINE_BREAK_CSV.join(compounds_kun) + '"'
				, meaning_sp='"' + spanish_meanings + '"'
				, rtk6=book_indices["RTK6"]
				, rtk=book_indices["RTK"]
				, is_word='yes' if is_word_alone else 'no'
				, readings=readings
				), True



	# One record per kanji; the field order is also the column order of the saved file
	Kanji = namedtuple('Kanji', 'kanji meaning_en strokes radicals components joyo grade jlpt frequency is_word compounds meaning_sp rtk rtk6 readings')

	# parse args
	if len(sys.argv) < 2:
		print(" Must specify input")
		sys.exit()

	if "-h" in sys.argv or "help" in sys.argv:
		print ("\n Specify input_file [options]")
		print ("\n options are: ")
		print (" --start= useful if there was some error and the script had to end")
		sys.exit()


	args = sys.argv[1:]

	INPUT_FILE = args[0]
	START_FROM_LINE = 0   # always an int: it is compared and used as a slice index later

	if not os.path.exists(INPUT_FILE):
		print (" Input file doesnt exist")
		sys.exit()

	# the arguments are joined so both "--start=5" and "--start= 5" are accepted
	joined_args = " ".join(args)
	if "--start=" in joined_args:
		start_value = joined_args.split("--start=", 1)[1].split()
		try:
			# negative values would slice from the end of the list: clamp to 0
			START_FROM_LINE = max(int(start_value[0]), 0)
		except (IndexError, ValueError):
			# missing or non numeric value: warn and start from the first line
			print (" converting string to number in the argument failed.")
			START_FROM_LINE = 0



	# -----------
	# Start!
	# -----------

	# collect data
	print(" Reading file... ")

	KANJI_LIST = list() # holds tuples
	text_list = list()

	with codecs.open(INPUT_FILE, 'r', encoding='utf-8-sig') as filee:
		text_list = filee.read().splitlines()

	if not text_list:
		print (" empty file")
		sys.exit()

	# remove possible duplicates but keep the order of the file: a set would
	# shuffle the lines differently on every run and make --start= meaningless
	text_list = tuple(dict.fromkeys(text_list))

	print (" Total lines: " , len(text_list))
	KANJI_RETRY = list() #because sometimes it works and sometimes it doesnt....
	# stays empty: the rows of an old temporary file go straight into KANJI_LIST
	# below, so there are no raw text lines left to merge later on
	temporal_list_before = list()

	# --start= is 1-based, the list index is 0-based
	if START_FROM_LINE > 0:
		START_FROM_LINE -= 1

	#must get old list (temporal) otherwise it gets rewritten
	part_file_name = "complete-" + INPUT_FILE + ".part"
	if os.path.exists(part_file_name):
		with codecs.open(part_file_name, 'r', encoding='utf-8-sig') as filee:
			saved_lines = filee.read().splitlines()[1:] #first one holds header

		for line in saved_lines:
			saved_fields = line.split("\t")
			# skip broken lines instead of failing on a missing column
			if len(saved_fields) != len(Kanji._fields):
				continue
			# columns were written in field order, so they are read back in
			# field order (naming them by hand had rtk and rtk6 swapped)
			KANJI_LIST.append(Kanji._make(saved_fields))



	# main pass: look up every kanji, queue failures for a retry and save a
	# checkpoint every SAVE_EVERY_ITEMS lines
	for line_number, line in enumerate(text_list[START_FROM_LINE:]):
		current_kanji = line.strip().strip('"')
		real_line_number = START_FROM_LINE + line_number + 1
		print (" processing line:", (real_line_number) )
		if not current_kanji:
			continue

		result, why_skip = get_kanji_info(current_kanji)

		if result:
			KANJI_LIST.append(result)
		else:
			print (" something wrong with the kanji on line: {line} - {why}".format(line=real_line_number, why=why_skip) )
			# queue each failed kanji only once
			if current_kanji not in KANJI_RETRY:
				KANJI_RETRY.append(current_kanji)

		# checkpoint on every multiple, even when this kanji failed.
		# The rows of an old temporary file are already in KANJI_LIST as tuples,
		# so the raw text lines in temporal_list_before must not be added here.
		if real_line_number % SAVE_EVERY_ITEMS == 0:
			print(" .... saving temporal file on line " , real_line_number)
			save_file(is_part=True)

	# go to missed kanji list
	if KANJI_RETRY:
		print (" Retrying some kanji that got errors somewhere along the way")

		print(" .... saving temporal file on line " , real_line_number)
		save_file(is_part=True)

		# work on a snapshot without repeats: KANJI_RETRY itself may still grow
		# while the kanji are being looked up again
		KANJI_RETRY_original = list(dict.fromkeys(KANJI_RETRY))
		KANJI_RETRY_done = list()

		for line_number, line in enumerate(KANJI_RETRY_original):
			current_kanji = line
			retry_number = line_number + 1
			print (" processing l:", retry_number )
			if not current_kanji:
				continue

			result, why_skip = get_kanji_info(current_kanji)

			if not result:
				# second failure: give up on this kanji
				print (" something wrong with the kanji on line: {line} - {why}".format(line=retry_number, why=why_skip) )
				continue

			KANJI_LIST.append(result)
			KANJI_RETRY_done.append(current_kanji)

			if retry_number % SAVE_EVERY_ITEMS == 0:
				print(" .... saving temporal file on line " , retry_number)
				save_file(is_part=True)

	#must "merge" this new list with the old one (temporal)
	# Every checkpoint holds a full copy of KANJI_LIST, so most of its rows are
	# already in memory: entries are keyed by kanji and only the first one is kept.
	unique_entries = dict()
	for entry in KANJI_LIST:
		if entry:
			unique_entries.setdefault(entry[0], entry)

	part_file_name = "complete-" + INPUT_FILE + ".part"
	if os.path.exists(part_file_name):
		with codecs.open(part_file_name, 'r', encoding='utf-8-sig') as filee:
			temporal_list = filee.read().splitlines()[1:] #first one holds header

		for line in temporal_list:
			saved_fields = line.split("\t")
			# skip broken lines instead of failing on a missing column
			if len(saved_fields) != len(Kanji._fields):
				continue
			# columns are read back in the order they were written (field order)
			unique_entries.setdefault(saved_fields[0], Kanji._make(saved_fields))


	def _stroke_sort_key(entry):
		"""Return the stroke count of entry as an int; unreadable counts sort last."""
		try:
			return int(entry[2])
		except (IndexError, TypeError, ValueError):
			return sys.maxsize


	# phew, done! now sort by stroke count

	KANJI_LIST = sorted(unique_entries.values(), key=_stroke_sort_key)

	# create CSV and save!
	save_file()