Skip to content

Instantly share code, notes, and snippets.

@aquinzi
Created Apr 11, 2021
Embed
What would you like to do?
Download routledge common list from kanshudo, extracting kanshudo frequency
# -*- coding: utf_8 -*-
#
# 2021-04-07
#
# Download the Routledge common-word list from Kanshudo, extracting the Kanshudo frequency of each word.
from bs4 import BeautifulSoup
import requests
import csv
# Common URL prefix; a page slug from ``slugs_pages`` is appended to it.
url_base = "https://www.kanshudo.com/collections/routledge/"

# Page slugs, one per page of the collection. Nearly all of them follow the
# pattern "RT-<100*k + 1>"; the four entries below are the irregular ones in
# this list (presumably mirroring the site's own URLs -- verify if a page
# ever fails to load).
_irregular_starts = {1001: 1000, 2001: 2002, 3901: 3902, 4801: 4802}
slugs_pages = tuple(
    "RT-" + str(_irregular_starts.get(start, start))
    for start in range(1, 5000, 100)
)
def _tag_string(tag):
    """Return ``tag.string`` as a plain ``str``.

    Returns ``""`` when *tag* is ``None`` or when ``tag.string`` is ``None``
    (the tag does not resolve to a single string), so callers can rely on
    simple truthiness checks and never keep references into the parse tree.
    """
    if tag is None or tag.string is None:
        return ""
    return str(tag.string)


def _level_from_container(container, prefix):
    """Extract a level value from the CSS classes of ``container.span``.

    Finds the first class that starts with *prefix* (e.g. ``"ja-ufn"``) and
    returns it with ``prefix + "_"`` removed. Returns ``""`` when the
    container, its ``<span>``, or a matching class is missing, instead of
    raising or leaking a Tag object into the output row.
    """
    if container is None or container.span is None:
        return ""
    for css_class in container.span.get('class', []):
        if css_class.startswith(prefix):
            return css_class.replace(prefix + "_", "")
    return ""


def _text_without(tag, css_class):
    """Return the concatenated text of *tag* minus ``<div class=css_class>``.

    The tag is re-parsed from its own markup so the page's tree is left
    untouched. Every matching div is removed structurally, which avoids the
    pitfalls of deleting text from serialized HTML (only the first chunk
    handled, accidental matches elsewhere in the markup).
    """
    fragment = BeautifulSoup(str(tag), 'html.parser')
    for unwanted in fragment.find_all('div', class_=css_class):
        unwanted.decompose()
    return "".join(fragment.stripped_strings)


# One dict per scraped word, in page order.
listado_palabras = []
slug_anterior = ""

for slug in slugs_pages:
    print("procesando " + url_base + slug)
    # A timeout keeps a stalled connection from hanging the whole run, and
    # raise_for_status() stops on an HTTP error instead of silently parsing
    # an error page into zero rows.
    response = requests.get(url_base + slug, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    for palabra in soup.find_all('div', class_="jukugorow"):
        theword = palabra.find('div', class_="jukugo")
        kanji = _tag_string(palabra.find('div', class_="f_kanji"))
        kana = _tag_string(palabra.find('div', class_="furigana"))
        definition = palabra.find('div', class_="jukugo_reading")

        if theword is not None and kanji and kana:
            # The word carries both a kanji part and a furigana part: build
            # the written form by dropping the furigana divs and the reading
            # by dropping the kanji divs.
            # NOTE(review): presumably a word with several kanji chunks has
            # one furigana/f_kanji pair per chunk -- verify against the
            # live HTML.
            kanji = _text_without(theword, "furigana")
            kana = _text_without(theword, "f_kanji")

        # Plain link text, used for words that have no kanji/furigana markup.
        link = theword.a if theword is not None else None
        plain_word = _tag_string(link)
        if not kanji and not kana and plain_word:
            kanji = kana = plain_word

        listado_palabras.append({
            'kanji': kanji,
            'kana': kana,
            # Store text rather than the Tag itself, so the CSV gets readable
            # text instead of raw HTML and the parsed page can be freed.
            'definition': (
                definition.get_text(" ", strip=True)
                if definition is not None else ""
            ),
            'level_jlpt': _level_from_container(
                palabra.find('div', class_="jlpt_container"), "ja-jlpt"),
            'level_kanshudo': _level_from_container(
                palabra.find('div', class_="ufn_container"), "ja-ufn"),
            # NOTE(review): the label is "<previous slug> - <current slug>",
            # so rows from the first page get " - RT-1"; confirm this is the
            # intended range label.
            'routledge_rango': slug_anterior + " - " + slug,
        })

    slug_anterior = slug
# Write every collected row to a CSV file; the column order follows the key
# order of the first row.
if listado_palabras:
    # An explicit UTF-8 encoding is required: the rows contain Japanese text,
    # and the platform default encoding (e.g. cp1252 on Windows) would raise
    # UnicodeEncodeError. newline='' lets the csv module control line endings.
    with open('routledge-kanshudo.csv', 'w', newline='', encoding='utf-8') as csvfile:
        csv_headers = list(listado_palabras[0].keys())
        spamwriter = csv.DictWriter(csvfile, fieldnames=csv_headers)
        spamwriter.writeheader()
        spamwriter.writerows(listado_palabras)
    print("Listo!")
else:
    # Nothing was scraped: report it instead of truncating the output file
    # and failing with IndexError on listado_palabras[0].
    print("No se encontraron palabras; no se escribió el archivo.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment