Download Hirogaru articles to cleaner HTML/TXT. Only JP version
# -*- coding: utf-8 -*-
# Download Hirogaru articles to cleaner HTML/TXT. Only JP version unless "watashi"!
# Notes:
# Saves in current working dir!
# In HTML, links to audio
# saves TXT version (ruby in ()s, no audio)
# Only articles (eg. has /article/ in url), Shoku (/shoku/), watashi (/watashi/)
# created around 2018-03-27
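# Usage (a sketch; the script filename is just whatever you saved this file as):
#   python hirogaru_download.py urls.csv   -> asks which column holds the url
#   python hirogaru_download.py urls.txt   -> expects one url per line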
#-------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import csv
from collections import namedtuple
import os
import sys
DEBUG = False # true = use a string in script
print(" This scripts downloads Hirogaru articles from a csv/txt -check source for instructions-, ")
print(" cleans them and save them (in html and txt) to current working dir.")
Article = namedtuple('Article', ['url', 'slug', 'category_romaji', 'type'])
URLS = list()
UPDATE_CSV = ""

def html_parse_get_category(html_soup):
    return html_soup.find(id="pankuzu").text.strip()

def html_parse_remove_figures(html_soup):
    local = html_soup
    while (1):
        if "<figure>" in str(local):
            local.figure.extract()
        else:
            return local

def html_parse_remove_extras(html_soup):
    local = html_soup
    # each of these elements may be missing; find() then returns None and extract() raises
    try:
        local.find(class_="quiz").extract()
    except AttributeError:
        pass
    try:
        local.find(class_="change_page").extract()
    except AttributeError:
        pass
    try:
        local.find(id="other_post").extract()
    except AttributeError:
        pass
    return local

def html_parse_add_rubies_parenthesis(html_soup):
    local = html_soup
    # add optional () to ruby
    rubies = local.find_all("rt")
    for ruby in rubies:
        tag_open = local.new_tag("rp")
        tag_close = local.new_tag("rp")
        tag_open.string = "("
        tag_close.string = ")"
        ruby.insert_before(tag_open)
        ruby.insert_after(tag_close)
    return local
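
# For example (illustrative markup, not copied from the site), a ruby like
#   <ruby>漢字<rt>かんじ</rt></ruby>
# comes out as
#   <ruby>漢字<rp>(</rp><rt>かんじ</rt><rp>)</rp></ruby>
# so browsers without ruby support (and the txt export) show the reading in ().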

def html_parse_remove_videos(html_soup):
    local = html_soup
    # if video, remove them but print message
    videos = local.find_all("div", class_="eq_movie_article")
    videos_rating = local.find_all("div", class_="eq_good")
    if videos:
        print(" !!! article has video/s")
    for video in videos:
        video.extract()
    for rating in videos_rating:
        rating.extract()
    return local

def process_article(html_original):
    ''' Process using Beautiful Soup
    :gets: text/string
    :return: text/string
    '''
    soup = BeautifulSoup(html_original, "html.parser")
    category = soup.find(id="pankuzu").text.strip()
    article = soup.find(id="topics_detail").div

    # dirty way to have title's content
    tmp = article.h2.extract()
    title = "<h1>"
    for child in tmp.children:
        title += str(child)
    title += "</h1>"
    title = BeautifulSoup(title, "html.parser")

    while (1):
        if "<figure>" in str(article):
            article.figure.extract()
        else:
            break

    try:
        article.find(class_="quiz").extract()
    except AttributeError:
        pass
    try:
        article.find(class_="change_page").extract()
    except AttributeError:
        pass

    # add controls to audio
    audio_tags = article.find_all("audio")
    for audio_tag in audio_tags:
        source = audio_tag.find("source")
        new_tag = soup.new_tag("audio")
        new_tag["controls"] = ""
        new_tag["preload"] = "auto"
        audio_tag.unwrap()
        source.wrap(new_tag)

    # if video, remove them but print message
    videos = article.find_all("div", class_="eq_movie_article")
    videos_rating = article.find_all("div", class_="eq_good")
    if videos:
        print(" !!! article has video/s")
    for video in videos:
        video.extract()
    for rating in videos_rating:
        rating.extract()

    article_english_site = False  # not implemented!
    if article_english_site:
        # glossaries: remove tracking and change ul to dl
        glossaries = article.find_all("aside", class_="glossary")
        for glossary in glossaries:
            del glossary.p["onclick"]
            items = glossary.find_all("li")
            for li in items:
                dt = soup.new_tag("dt")
                dd = soup.new_tag("dd")
                tmp = li.contents
                dt.string = tmp[0].replace(" : ", "")
                dd.string = tmp[1].string
                li.insert_before(dt)
                li.insert_after(dd)
                li.extract()
            dl = soup.new_tag("dl")
            glossary.ul.wrap(dl)
            glossary.ul.unwrap()

    # change dls (dd) to paragraphs
    dl_tags = article.find_all("dl", class_="voice_text_wrap")
    for dl_tag in dl_tags:
        new_tag = soup.new_tag("p")
        new_tag.contents = dl_tag.find("dd").contents
        dl_tag.insert_after(new_tag)
        dl_tag.extract()

    final_html = BeautifulSoup("<html><body><p>Category: " + category + "</p></body></html>", "html.parser")
    final_html.p.insert_after(article)
    final_html.p.insert_before(title)
    # don't know what happens, but parse again to make it work
    final_html = BeautifulSoup(final_html.prettify().replace('<div class="inner">', "<article>").replace("</div>", "</article>"), "html.parser")

    # add optional () to ruby
    rubies = final_html.find_all("rt")
    for ruby in rubies:
        tag_open = soup.new_tag("rp")
        tag_close = soup.new_tag("rp")
        tag_open.string = "("
        tag_close.string = ")"
        ruby.insert_before(tag_open)
        ruby.insert_after(tag_close)

    return final_html.prettify()
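
# Rough shape of what process_article() returns (illustrative, real contents vary):
#   <h1>article title</h1>
#   <p>Category: breadcrumb text</p>
#   <article>cleaned body: audio players with controls, ruby wrapped in <rp>, no figures/quiz/videos</article>
# html_to_txt() further down relies on exactly this h1 / p / article layout.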

def process_shoku(html_original, slug):
    soup = BeautifulSoup(html_original, "html.parser")
    category = html_parse_get_category(soup)
    article = soup.find(id="shoku_detail")
    article = html_parse_remove_figures(article)
    article = html_parse_remove_extras(article)

    title = "食 - " + slug
    final_html = BeautifulSoup("<html><body><h1>" + title + "</h1><p>Category: " + category + "</p></body></html>", "html.parser")
    final_html.p.insert_after(article)
    # don't know what happens, but parse again to make it work
    final_html = BeautifulSoup(final_html.prettify().replace('<div class="inner">', "<article>").replace("</div>", "</article>"), "html.parser")
    final_html = html_parse_add_rubies_parenthesis(final_html)
    return final_html.prettify()

def process_watashi(html_original, slug):
    title = "トピックと私 - " + slug
    soup = BeautifulSoup(html_original, "html.parser")
    category = html_parse_get_category(soup)
    article = soup.find(id="hanashi_detail").div
    article = html_parse_remove_extras(article)
    article = html_parse_remove_videos(article)

    final_html = BeautifulSoup("<html><body><h1>" + title + "</h1><p>Category: " + category + "</p></body></html>", "html.parser")
    final_html.p.insert_after(article)
    # don't know what happens, but parse again to make it work
    final_html = BeautifulSoup(final_html.prettify().replace('<div class="inner">', "<article>").replace("</div>", "</article>"), "html.parser")
    final_html = html_parse_add_rubies_parenthesis(final_html)
    return final_html.prettify()

def html_to_txt(article_html):
    ''' convert HTML to txt
    :gets: text/string
    :return: text/string
    '''
    soup = BeautifulSoup(article_html, "html.parser")

    title = ""
    for string in soup.h1.stripped_strings:
        title += string
    category = soup.p.string.strip()

    text = list()
    paragraphs = soup.article.find_all("p")
    for paragraph in paragraphs:
        tmp = ""
        for string in paragraph.stripped_strings:
            tmp += string
        text.append(tmp)
    text = (os.linesep + os.linesep).join(text)

    final_text = '''{title}
================
{category}
{text}
'''
    final_text = final_text.format(title=title, category=category, text=text)
    return final_text
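
# The txt version then roughly looks like (illustrative):
#   article title
#   ================
#   Category: breadcrumb text
#   first paragraph (ruby readings in parentheses)
#
#   second paragraph ...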

if not DEBUG:
    if len(sys.argv) > 1:
        if not os.path.exists(sys.argv[1]):
            print(" File doesn't exist. Why are you so mean?")
            exit()
        if not sys.argv[1].endswith(".csv") and not sys.argv[1].endswith(".txt"):
            print(" Not a valid file to read. Must be CSV/TXT")
            exit()
        UPDATE_CSV = sys.argv[1]
    else:
        print(" No input file given. Pass the csv/txt with the urls as the first argument.")
        exit()
# read csv that has urls
# Format:
# 0 category
# 1 title
# 2 url
# .... etc
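# Example row for the csv case (placeholder values; url in column 2 as in the layout above):
#   "category name","article title","https://hirogaru-nihongo.jp/<category>/article/<slug>/"
# For the txt case, just put one url per line.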
with open(UPDATE_CSV, 'r', encoding="utf-8") as tmp_file:
    url_column = False
    if UPDATE_CSV.endswith(".csv"):
        tmp_rows = csv.reader(tmp_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        while (not int(url_column)):
            url_column = int(input(" Column for url? "))
    if UPDATE_CSV.endswith(".txt"):
        tmp_rows = tmp_file.readlines()

    for line in tmp_rows:
        url = line
        if url_column:
            url = line[url_column]
        url = url[0:-1].replace("https://hirogaru-nihongo.jp/", "")  # [0:-1] to remove last /
        url_parts = url.split("/")
        if len(url_parts) < 3:
            continue

        slug = url_parts[-1]
        doc_type = ""
        category = url_parts[0]
        if "article" in url_parts and not "en" in url_parts:
            doc_type = "article"
        if "shoku" in url_parts and not "en" in url_parts:
            doc_type = "shoku"
        if "watashi-en" in url_parts:  # brief translation
            category = url_parts[1]
            doc_type = "watashi"

        if doc_type:
            URLS.append(Article("https://hirogaru-nihongo.jp/" + url + "/", slug, category, doc_type))
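
# e.g. a url path like "<category>/article/<slug>" (placeholders) ends up as
#   Article(url="https://hirogaru-nihongo.jp/<category>/article/<slug>/",
#           slug="<slug>", category_romaji="<category>", type="article")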

# download HTML using requests
for item in URLS:
    r = requests.get(item.url)
    name = item.category_romaji + " > " + item.slug + "(" + item.type + ")"
    if not r:
        print(" couldn't get page for: " + name)
        continue
    print("processing: " + name)

    filename = item.category_romaji + "-" + item.type + "-" + item.slug
    if item.type == "article":
        final_html = process_article(r.text)
    elif item.type == "shoku":
        final_html = process_shoku(r.text, item.slug)
    elif item.type == "watashi":
        final_html = process_watashi(r.text, item.slug)
    final_txt = html_to_txt(final_html)

    with open(os.path.join(os.getcwd(), filename + ".html"), 'w', encoding="utf-8") as tmp_file:
        tmp_file.write(final_html)
    with open(os.path.join(os.getcwd(), filename + ".txt"), 'w', encoding="utf-8") as tmp_file:
        tmp_file.write(final_txt)