Download Hirogaru articles to cleaner HTML/TXT. Only JP version
# -*- coding: utf-8 -*-
# Download Hirogaru articles to cleaner HTML/TXT. Only the JP version unless it's "watashi"!
# Notes:
#   Saves into the current working dir!
#   The HTML keeps links to the audio
#   also saves a TXT version (ruby readings in parentheses, no audio)
#   Only handles articles (e.g. /article/ in the url), Shoku (/shoku/) and watashi (/watashi/)
# created around 2018-03-27
#-------------------------------------------------------------------------------
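# Usage sketch (illustrative only; the script file name below is a placeholder, not part of
# the original source):
#   python hirogaru_download.py urls.txt    <- one article url per line
#   python hirogaru_download.py urls.csv    <- you will be asked which column holds the url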
import requests
from bs4 import BeautifulSoup
import csv
from collections import namedtuple
import os
import sys

DEBUG = False  # True = use a hard-coded string in the script instead of reading a file

print(" This script downloads Hirogaru articles from a csv/txt -check source for instructions-, ")
print(" cleans them and saves them (in html and txt) to the current working dir.")

Article = namedtuple('Article', ['url', 'slug', 'category_romaji', 'type'])
URLS = list()
UPDATE_CSV = ""
def html_parse_get_category(html_soup):
    # the breadcrumb ("pankuzu") text doubles as the category label
    return html_soup.find(id="pankuzu").text.strip()


def html_parse_remove_figures(html_soup):
    # drop every <figure> (images are not kept in the cleaned output)
    local = html_soup
    while local.figure is not None:
        local.figure.extract()
    return local
def html_parse_remove_extras(html_soup):
    # strip the quiz, pagination and "other posts" blocks when present
    local = html_soup
    try:
        local.find(class_="quiz").extract()
    except AttributeError:  # find() returned None
        pass
    try:
        local.find(class_="change_page").extract()
    except AttributeError:
        pass
    try:
        local.find(id="other_post").extract()
    except AttributeError:
        pass
    return local
def html_parse_add_rubies_parenthesis(html_soup):
    local = html_soup
    # add optional () around ruby readings, so plain-text rendering shows 漢字(かんじ)
    rubies = local.find_all("rt")
    for ruby in rubies:
        tag_open = local.new_tag("rp")
        tag_close = local.new_tag("rp")
        tag_open.string = "("
        tag_close.string = ")"
        ruby.insert_before(tag_open)
        ruby.insert_after(tag_close)
    return local
def html_parse_remove_videos(html_soup):
    local = html_soup
    # if there are videos, remove them but print a message
    videos = local.find_all("div", class_="eq_movie_article")
    videos_rating = local.find_all("div", class_="eq_good")
    if videos:
        print(" !!! article has video/s")
    for video in videos:
        video.extract()
    for rating in videos_rating:
        rating.extract()
    return local
def process_article(html_original):
    ''' Process an /article/ page using Beautiful Soup
    :gets: text/string
    :return: text/string
    '''
    soup = BeautifulSoup(html_original, "html.parser")
    category = soup.find(id="pankuzu").text.strip()
    article = soup.find(id="topics_detail").div
    # dirty way to get the title's content while keeping child tags (ruby etc.)
    tmp = article.h2.extract()
    title = "<h1>"
    for child in tmp.children:
        title += str(child)
    title += "</h1>"
    title = BeautifulSoup(title, "html.parser")
    # remove figures
    while article.figure is not None:
        article.figure.extract()
    # remove quiz and pagination blocks when present
    try:
        article.find(class_="quiz").extract()
    except AttributeError:  # find() returned None
        pass
    try:
        article.find(class_="change_page").extract()
    except AttributeError:
        pass
    # add controls to the audio players and drop the original wrapper
    audio_tags = article.find_all("audio")
    for audio_tag in audio_tags:
        source = audio_tag.find("source")
        new_tag = soup.new_tag("audio")
        new_tag["controls"] = ""
        new_tag["preload"] = "auto"
        audio_tag.unwrap()
        source.wrap(new_tag)
    # if there are videos, remove them but print a message
    videos = article.find_all("div", class_="eq_movie_article")
    videos_rating = article.find_all("div", class_="eq_good")
    if videos:
        print(" !!! article has video/s")
    for video in videos:
        video.extract()
    for rating in videos_rating:
        rating.extract()
    article_english_site = False  # not implemented!
    if article_english_site:
        # glossaries: remove tracking and change ul to dl
        glossaries = article.find_all("aside", class_="glossary")
        for glossary in glossaries:
            del glossary.p["onclick"]
            items = glossary.find_all("li")
            for li in items:
                dt = soup.new_tag("dt")
                dd = soup.new_tag("dd")
                tmp = li.contents
                dt.string = tmp[0].replace(" : ", "")
                dd.string = tmp[1].string
                li.insert_before(dt)
                li.insert_after(dd)
                li.extract()
            dl = soup.new_tag("dl")
            glossary.ul.wrap(dl)
            glossary.ul.unwrap()
    # change the voice dls (their dd content) into plain paragraphs
    dl_tags = article.find_all("dl", class_="voice_text_wrap")
    for dl_tag in dl_tags:
        new_tag = soup.new_tag("p")
        for child in list(dl_tag.find("dd").contents):
            new_tag.append(child)
        dl_tag.insert_after(new_tag)
        dl_tag.extract()
final_html = BeautifulSoup("<html><body><p>Category: " + category + "</p></body></html>", "html.parser") | |
final_html.p.insert_after(article) | |
final_html.p.insert_before(title) | |
# don't know what happens, but parse again to make it work | |
final_html = BeautifulSoup(final_html.prettify().replace('<div class="inner">',"<article>").replace("</div>","</article>"), "html.parser") | |
# add optional () to ruby | |
rubies = final_html.find_all("rt") | |
for ruby in rubies: | |
tag_open = soup.new_tag("rp") | |
tag_close = soup.new_tag("rp") | |
tag_open.string = "(" | |
tag_close.string = ")" | |
ruby.insert_before(tag_open) | |
ruby.insert_after(tag_close) | |
return final_html.prettify() | |
def process_shoku(html_original, slug):
    soup = BeautifulSoup(html_original, "html.parser")
    category = html_parse_get_category(soup)
    article = soup.find(id="shoku_detail")
    article = html_parse_remove_figures(article)
    article = html_parse_remove_extras(article)
    title = "食 - " + slug
    final_html = BeautifulSoup("<html><body><h1>" + title + "</h1><p>Category: " + category + "</p></body></html>", "html.parser")
    final_html.p.insert_after(article)
    # not sure why, but re-parsing is needed to make it work
    final_html = BeautifulSoup(final_html.prettify().replace('<div class="inner">', "<article>").replace("</div>", "</article>"), "html.parser")
    final_html = html_parse_add_rubies_parenthesis(final_html)
    return final_html.prettify()
def process_watashi(html_original, slug):
    title = "トピックと私 - " + slug
    soup = BeautifulSoup(html_original, "html.parser")
    category = html_parse_get_category(soup)
    article = soup.find(id="hanashi_detail").div
    article = html_parse_remove_extras(article)
    article = html_parse_remove_videos(article)
    final_html = BeautifulSoup("<html><body><h1>" + title + "</h1><p>Category: " + category + "</p></body></html>", "html.parser")
    final_html.p.insert_after(article)
    # not sure why, but re-parsing is needed to make it work
    final_html = BeautifulSoup(final_html.prettify().replace('<div class="inner">', "<article>").replace("</div>", "</article>"), "html.parser")
    final_html = html_parse_add_rubies_parenthesis(final_html)
    return final_html.prettify()
def html_to_txt(article_html):
    ''' convert the cleaned HTML to txt
    :gets: text/string
    :return: text/string
    '''
    soup = BeautifulSoup(article_html, "html.parser")
    title = ""
    for string in soup.h1.stripped_strings:
        title += string
    category = soup.p.string.strip()
    text = list()
    paragraphs = soup.article.find_all("p")
    for paragraph in paragraphs:
        tmp = ""
        for string in paragraph.stripped_strings:
            tmp += string
        text.append(tmp)
    text = (os.linesep + os.linesep).join(text)
    final_text = '''{title}
================
{category}
{text}
'''
    final_text = final_text.format(title=title, category=category, text=text)
    return final_text
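# For reference, the TXT produced by html_to_txt has roughly this shape (the Japanese
# text below is illustrative only, not taken from a real article):
#   記事(きじ)のタイトル
#   ================
#   Category: せいかつ
#   最初(さいしょ)の段落(だんらく)。
#
#   次(つぎ)の段落(だんらく)。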
if not DEBUG:
    if len(sys.argv) > 1:
        if not os.path.exists(sys.argv[1]):
            print(" File doesn't exist. Why are you so mean?")
            exit()
        if not sys.argv[1].endswith(".csv") and not sys.argv[1].endswith(".txt"):
            print(" Not a valid file to read. Must be CSV/TXT")
            exit()
        UPDATE_CSV = sys.argv[1]
    else:
        print(" Give me the csv/txt file with the urls as the first argument.")
        exit()
# read the csv/txt that has the urls
# CSV format:
# 0 category
# 1 title
# 2 url
# .... etc
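# Hypothetical example input (not from the original source), assuming the layout above:
#   CSV row:  せいかつ,タイトル,https://hirogaru-nihongo.jp/seikatsu/article/some-slug/
#   TXT line: https://hirogaru-nihongo.jp/seikatsu/article/some-slug/
# For the CSV row you would answer 2 when asked for the url column.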
with open(UPDATE_CSV, 'r', encoding="utf-8") as tmp_file:
    url_column = False
    if UPDATE_CSV.endswith(".csv"):
        tmp_rows = csv.reader(tmp_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        while not url_column:
            url_column = int(input(" Column for url? "))
    if UPDATE_CSV.endswith(".txt"):
        tmp_rows = tmp_file.readlines()
    for line in tmp_rows:
        url = line
        if url_column:
            url = line[url_column]
        # strip the site prefix plus any trailing newline/slash
        url = url.strip().replace("https://hirogaru-nihongo.jp/", "").strip("/")
        url_parts = url.split("/")
        if len(url_parts) < 3:
            continue
        slug = url_parts[-1]
        doc_type = ""
        category = url_parts[0]
        if "article" in url_parts and "en" not in url_parts:
            doc_type = "article"
        if "shoku" in url_parts and "en" not in url_parts:
            doc_type = "shoku"
        if "watashi-en" in url_parts:  # brief translation
            category = url_parts[1]
            doc_type = "watashi"
        if doc_type:
            URLS.append(Article("https://hirogaru-nihongo.jp/" + url + "/", slug, category, doc_type))
# download each page with requests, clean it, and write the HTML and TXT files
for item in URLS:
    r = requests.get(item.url)
    name = item.category_romaji + " > " + item.slug + "(" + item.type + ")"
    if not r:  # a requests Response is falsy when the status code is an error (>= 400)
        print(" couldn't get page for: " + name)
        continue
    print("processing: " + name)
    filename = item.category_romaji + "-" + item.type + "-" + item.slug
    if item.type == "article":
        final_html = process_article(r.text)
    elif item.type == "shoku":
        final_html = process_shoku(r.text, item.slug)
    elif item.type == "watashi":
        final_html = process_watashi(r.text, item.slug)
    final_txt = html_to_txt(final_html)
    with open(os.path.join(os.getcwd(), filename + ".html"), 'w', encoding="utf-8") as tmp_file:
        tmp_file.write(final_html)
    with open(os.path.join(os.getcwd(), filename + ".txt"), 'w', encoding="utf-8") as tmp_file:
        tmp_file.write(final_txt)