-
-
Save aquinzi/8f2fcf72d751847520eb72775412241e to your computer and use it in GitHub Desktop.
Download routledge common list from kanshudo, extracting kanshudo frequency
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf_8 -*- | |
# | |
# 2021-04-07 | |
# | |
# Download routledge common list from kanshudo, extracting kanshudo frequency | |
from bs4 import BeautifulSoup | |
import requests | |
import csv | |
# Base URL of the Routledge collection on Kanshudo; each page of the
# collection is reached by appending one of the slugs below.
url_base = "https://www.kanshudo.com/collections/routledge/"

# Slugs of the 50 pages of the collection, in collection order.
# NOTE: the numbering is not perfectly regular (RT-1000, RT-2002, RT-3902
# and RT-4802 break the +100 pattern), so the slugs are listed explicitly
# rather than generated.
slugs_pages = (
    "RT-1",    "RT-101",  "RT-201",  "RT-301",  "RT-401",
    "RT-501",  "RT-601",  "RT-701",  "RT-801",  "RT-901",
    "RT-1000", "RT-1101", "RT-1201", "RT-1301", "RT-1401",
    "RT-1501", "RT-1601", "RT-1701", "RT-1801", "RT-1901",
    "RT-2002", "RT-2101", "RT-2201", "RT-2301", "RT-2401",
    "RT-2501", "RT-2601", "RT-2701", "RT-2801", "RT-2901",
    "RT-3001", "RT-3101", "RT-3201", "RT-3301", "RT-3401",
    "RT-3501", "RT-3601", "RT-3701", "RT-3801", "RT-3902",
    "RT-4001", "RT-4101", "RT-4201", "RT-4301", "RT-4401",
    "RT-4501", "RT-4601", "RT-4701", "RT-4802", "RT-4901",
)
# Accumulates one dict per word across all pages; consumed by the CSV
# writer at the end of the script.
listado_palabras = list()
slug_anterior = ""  # previous slug: labels each word with its frequency range
for slug in slugs_pages:
    print("procesando " + url_base + slug)
    r = requests.get(url_base + slug)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Each word entry on a Kanshudo collection page is a 'jukugorow' div.
    partes = soup.find_all('div', class_="jukugorow")
    for palabra in partes:
        datos = dict()
        theword = palabra.find('div', class_="jukugo")
        kanji = palabra.find('div', class_="f_kanji")
        kana = palabra.find('div', class_="furigana")
        definition = palabra.find('div', class_="jukugo_reading")
        level_jlpt = palabra.find('div', class_="jlpt_container")
        level_kanshudo = palabra.find('div', class_="ufn_container")
        kanji = kanji.string if kanji else ""
        kana = kana.string if kana else ""
        # The Kanshudo usefulness level is encoded as a CSS class
        # (ja-ufn_<n>) on the span inside the container div.
        if level_kanshudo:
            contenedor = level_kanshudo
            # BUGFIX: default to "" so that when no matching class is found
            # the raw Tag is never written to the CSV as HTML.
            level_kanshudo = ""
            for item in contenedor.span['class']:
                if item.startswith("ja-ufn"):
                    level_kanshudo = item.replace("ja-ufn_", "")
                    break
        # Same scheme for the JLPT level: class ja-jlpt_<n>.
        if level_jlpt:
            contenedor = level_jlpt
            level_jlpt = ""  # BUGFIX: same Tag-leak fix as above
            for item in contenedor.span['class']:
                if item.startswith("ja-jlpt"):
                    level_jlpt = item.replace("ja-jlpt_", "")
                    break
        if theword and kanji and kana:
            # The word div nests the furigana inside the kanji markup.
            # Strip the kana part to recover the plain kanji spelling, and
            # strip the kanji part to recover the plain kana spelling.
            theword_nofuri = str(theword).replace(str(kana), "")
            microsoup = BeautifulSoup(theword_nofuri, 'html.parser')
            theword_nofuri = "".join(microsoup.stripped_strings)
            theword_nokanji = str(theword).replace(str(kanji), "")
            microsoup = BeautifulSoup(theword_nokanji, 'html.parser')
            theword_nokanji = "".join(microsoup.stripped_strings)
            kanji = theword_nofuri
            kana = theword_nokanji
        theword = theword.a.string if theword else ""
        if (not kanji and not kana) and theword:
            # Entry with no separate kanji/furigana markup (e.g. kana-only
            # words): use the word itself for both columns.
            datos['kanji'] = theword
            datos['kana'] = theword
        else:
            datos['kanji'] = kanji
            datos['kana'] = kana
        # BUGFIX: extract the text of the definition div; the original
        # stored the Tag object, which serialized as raw HTML in the CSV.
        datos['definition'] = definition.get_text(" ", strip=True) if definition else ""
        datos['level_jlpt'] = level_jlpt
        datos['level_kanshudo'] = level_kanshudo
        datos['routledge_rango'] = slug_anterior + " - " + slug
        listado_palabras.append(datos)
    slug_anterior = slug
# Write the collected words to disk. encoding='utf-8' is passed explicitly:
# the rows contain Japanese text and the platform default encoding (e.g.
# cp1252 on Windows) would raise UnicodeEncodeError.
if listado_palabras:
    with open('routledge-kanshudo.csv', 'w', newline='', encoding='utf-8') as csvfile:
        # All rows share the same keys, so the first row defines the header.
        csv_headers = listado_palabras[0].keys()
        spamwriter = csv.DictWriter(csvfile, fieldnames=csv_headers)
        spamwriter.writeheader()
        spamwriter.writerows(listado_palabras)
    print("Listo!")
else:
    # Nothing scraped (network failure or site layout change): the original
    # code raised IndexError on listado_palabras[0] here.
    print("No hay palabras que guardar.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment