-
-
Save aquinzi/8f2fcf72d751847520eb72775412241e to your computer and use it in GitHub Desktop.
Download routledge common list from kanshudo, extracting kanshudo frequency
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf_8 -*- | |
# | |
# 2021-04-07 | |
# | |
# Download routledge common list from kanshudo, extracting kanshudo frequency | |
from bs4 import BeautifulSoup | |
import requests | |
import csv | |
# Base URL of the Routledge collection on Kanshudo; each page of the
# collection is reached by appending one of the slugs below.
url_base = "https://www.kanshudo.com/collections/routledge/"

# Slugs of the 50 pages of the collection, in collection order.
# NOTE: the numbering is not perfectly regular (RT-1000, RT-2002, RT-3902
# and RT-4802 break the +100 pattern), so the slugs are listed explicitly
# rather than generated.
slugs_pages = (
    "RT-1",    "RT-101",  "RT-201",  "RT-301",  "RT-401",
    "RT-501",  "RT-601",  "RT-701",  "RT-801",  "RT-901",
    "RT-1000", "RT-1101", "RT-1201", "RT-1301", "RT-1401",
    "RT-1501", "RT-1601", "RT-1701", "RT-1801", "RT-1901",
    "RT-2002", "RT-2101", "RT-2201", "RT-2301", "RT-2401",
    "RT-2501", "RT-2601", "RT-2701", "RT-2801", "RT-2901",
    "RT-3001", "RT-3101", "RT-3201", "RT-3301", "RT-3401",
    "RT-3501", "RT-3601", "RT-3701", "RT-3801", "RT-3902",
    "RT-4001", "RT-4101", "RT-4201", "RT-4301", "RT-4401",
    "RT-4501", "RT-4601", "RT-4701", "RT-4802", "RT-4901",
)
# Accumulates one dict per word across all pages; consumed by the CSV
# writer at the end of the script.
listado_palabras = list()
slug_anterior = ""  # previous slug: labels each word with its frequency range
for slug in slugs_pages:
    print("procesando " + url_base + slug)
    r = requests.get(url_base + slug)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Each word entry on a Kanshudo collection page is a 'jukugorow' div.
    partes = soup.find_all('div', class_="jukugorow")
    for palabra in partes:
        datos = dict()
        theword = palabra.find('div', class_="jukugo")
        kanji = palabra.find('div', class_="f_kanji")
        kana = palabra.find('div', class_="furigana")
        definition = palabra.find('div', class_="jukugo_reading")
        level_jlpt = palabra.find('div', class_="jlpt_container")
        level_kanshudo = palabra.find('div', class_="ufn_container")
        kanji = kanji.string if kanji else ""
        kana = kana.string if kana else ""
        # The Kanshudo usefulness level is encoded as a CSS class
        # (ja-ufn_<n>) on the span inside the container div.
        if level_kanshudo:
            contenedor = level_kanshudo
            # BUGFIX: default to "" so that when no matching class is found
            # the raw Tag is never written to the CSV as HTML.
            level_kanshudo = ""
            for item in contenedor.span['class']:
                if item.startswith("ja-ufn"):
                    level_kanshudo = item.replace("ja-ufn_", "")
                    break
        # Same scheme for the JLPT level: class ja-jlpt_<n>.
        if level_jlpt:
            contenedor = level_jlpt
            level_jlpt = ""  # BUGFIX: same Tag-leak fix as above
            for item in contenedor.span['class']:
                if item.startswith("ja-jlpt"):
                    level_jlpt = item.replace("ja-jlpt_", "")
                    break
        if theword and kanji and kana:
            # The word div nests the furigana inside the kanji markup.
            # Strip the kana part to recover the plain kanji spelling, and
            # strip the kanji part to recover the plain kana spelling.
            theword_nofuri = str(theword).replace(str(kana), "")
            microsoup = BeautifulSoup(theword_nofuri, 'html.parser')
            theword_nofuri = "".join(microsoup.stripped_strings)
            theword_nokanji = str(theword).replace(str(kanji), "")
            microsoup = BeautifulSoup(theword_nokanji, 'html.parser')
            theword_nokanji = "".join(microsoup.stripped_strings)
            kanji = theword_nofuri
            kana = theword_nokanji
        theword = theword.a.string if theword else ""
        if (not kanji and not kana) and theword:
            # Entry with no separate kanji/furigana markup (e.g. kana-only
            # words): use the word itself for both columns.
            datos['kanji'] = theword
            datos['kana'] = theword
        else:
            datos['kanji'] = kanji
            datos['kana'] = kana
        # BUGFIX: extract the text of the definition div; the original
        # stored the Tag object, which serialized as raw HTML in the CSV.
        datos['definition'] = definition.get_text(" ", strip=True) if definition else ""
        datos['level_jlpt'] = level_jlpt
        datos['level_kanshudo'] = level_kanshudo
        datos['routledge_rango'] = slug_anterior + " - " + slug
        listado_palabras.append(datos)
    slug_anterior = slug
# Write the collected words to disk. encoding='utf-8' is passed explicitly:
# the rows contain Japanese text and the platform default encoding (e.g.
# cp1252 on Windows) would raise UnicodeEncodeError.
if listado_palabras:
    with open('routledge-kanshudo.csv', 'w', newline='', encoding='utf-8') as csvfile:
        # All rows share the same keys, so the first row defines the header.
        csv_headers = listado_palabras[0].keys()
        spamwriter = csv.DictWriter(csvfile, fieldnames=csv_headers)
        spamwriter.writeheader()
        spamwriter.writerows(listado_palabras)
    print("Listo!")
else:
    # Nothing scraped (network failure or site layout change): the original
    # code raised IndexError on listado_palabras[0] here.
    print("No hay palabras que guardar.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment