aquinzi/hirogaru-get-articles.py Secret

## hirogaru-get-articles.py
# -*- coding: utf-8 -*-

# Download Hirogaru articles to cleaner HTML/TXT. Only JP version unless "watashi"!
# Notes:
#    Saves in current working dir!
#    In HTML, links to audio
#    saves TXT version (ruby in ()s, no audio)
#    Only articles (eg. has /article/ in url), Shoku (/shoku/), watashi (/watashi/)

# created around 2018-03-27


#-------------------------------------------------------------------------------

import requests
from bs4 import BeautifulSoup
import csv
from collections import namedtuple
import os
import sys


DEBUG = False # true = use a string in script


print(" This scripts downloads Hirogaru articles from a csv/txt -check source for instructions-, ")
print(" cleans them and save them (in html and txt) to current working dir.")


Article = namedtuple('Article', ['url', 'slug', 'category_romaji', 'type'])

URLS = list()
UPDATE_CSV = ""


def html_parse_get_category(html_soup):
    return html_soup.find(id="pankuzu").text.strip()

def html_parse_remove_figures(html_soup):
	local = html_soup
	while (1):
		  if "<figure>" in str(html_soup):
		  	 local.figure.extract()
		  else:
		  	   return html_soup

def html_parse_remove_extras(html_soup):
	local = html_soup
	try:
		local.find(class_="quiz").extract()
	except:
		   pass

	try:
		local.find(class_="change_page").extract()
	except:
		   pass

	try:
		local.find(id="other_post").extract()
	except:
		   pass

	return local

def html_parse_add_rubies_parenthesis(html_soup):
	local = html_soup
	# add optional () to ruby
	rubies = local.find_all("rt")

	for ruby in rubies:
		tag_open = local.new_tag("rp")
		tag_close = local.new_tag("rp")
		tag_open.string = "("
		tag_close.string = ")"

		ruby.insert_before(tag_open)
		ruby.insert_after(tag_close)
	return local


def html_parse_remove_videos(html_soup):
	local = html_soup
	# if video, remove them but print message
	videos = local.find_all("div",class_="eq_movie_article")
	videos_rating = local.find_all("div",class_="eq_good")
	if videos:
	   print("   !!! article has video/s")
	   for video in videos:
		   video.extract()
	   for rating in videos_rating:
		   rating.extract()
	return local


def process_article(html_original):
	''' Process using Beautiful Soup
	:gets: text/string
	:return: text/string
	'''

	soup = BeautifulSoup(html_original, "html.parser")
	category = soup.find(id="pankuzu").text.strip()
	article = soup.find(id="topics_detail").div
	# dirty way to have title's content
	tmp = article.h2.extract()
	title = "<h1>"
	for child in tmp.children:
		title += str(child)
	title += "</h1>"
	title = BeautifulSoup(title, "html.parser")

	while (1):
		  if "<figure>" in str(article):
		  	 article.figure.extract()
		  else:
			   break

	try:
		article.find(class_="quiz").extract()
	except:
		   pass

	try:
		article.find(class_="change_page").extract()
	except:
		   pass

	#add controls to audio
	audio_tags = article.find_all("audio")
	for audio_tag in audio_tags:
		source = audio_tag.find("source")
		new_tag = soup.new_tag("audio")
		new_tag["controls"] = ""
		new_tag["preload"]  = "auto"
		audio_tag.unwrap()
		source.wrap(new_tag)

	# if video, remove them but print message
	videos = article.find_all("div",class_="eq_movie_article")
	videos_rating = article.find_all("div",class_="eq_good")
	if videos:
	   print("   !!! article has video/s")
	   for video in videos:
		   video.extract()
	   for rating in videos_rating:
		   rating.extract()

	article_english_site = False # not implemented!
	if article_english_site:
		# glossaries: remove tracking and change ul to dl
		glossaries = article.find_all("aside",class_="glossary")
		for glossary in glossaries:
			del glossary.p["onclick"]
			items = glossary.find_all("li")

			for li in items:
				dt = soup.new_tag("dt")
				dd = soup.new_tag("dd")

				tmp = li.contents
				dt.string = tmp[0].replace(" : ","")
				dd.string = tmp[1].string

				li.insert_before(dt)
				li.insert_after(dd)
				li.extract()

			dl = soup.new_tag("dl")
			glossary.ul.wrap(dl)
			glossary.ul.unwrap()


	# change dls (dd) to paragraphs
	dl_tags = article.find_all("dl", class_="voice_text_wrap")

	for dl_tag in dl_tags:
		new_tag = soup.new_tag("p")
		new_tag.contents = dl_tag.find("dd").contents
		dl_tag.insert_after(new_tag)
		dl_tag.extract()

	final_html = BeautifulSoup("<html><body><p>Category: " + category + "</p></body></html>", "html.parser")
	final_html.p.insert_after(article)
	final_html.p.insert_before(title)

	# don't know what happens, but parse again to make it work
	final_html = BeautifulSoup(final_html.prettify().replace('<div class="inner">',"<article>").replace("</div>","</article>"), "html.parser")

	# add optional () to ruby
	rubies = final_html.find_all("rt")

	for ruby in rubies:
		tag_open = soup.new_tag("rp")
		tag_close = soup.new_tag("rp")
		tag_open.string = "("
		tag_close.string = ")"

		ruby.insert_before(tag_open)
		ruby.insert_after(tag_close)


	return final_html.prettify()


def process_shoku(html_original,slug):

	soup = BeautifulSoup(html_original, "html.parser")

	category = html_parse_get_category(soup)

	article = soup.find(id="shoku_detail")

	article = html_parse_remove_figures(article)
	article = html_parse_remove_extras(article)

	title = "食 - " + slug


	final_html = BeautifulSoup("<html><body><h1>" + title + "</h1><p>Category: " + category + "</p></body></html>", "html.parser")
	final_html.p.insert_after(article)

	# don't know what happens, but parse again to make it work
	final_html = BeautifulSoup(final_html.prettify().replace('<div class="inner">',"<article>").replace("</div>","</article>"), "html.parser")

	final_html = html_parse_add_rubies_parenthesis(final_html)

	return final_html.prettify()


def process_watashi(html_original,slug):

	title = "トピックと私 - " + slug
	soup = BeautifulSoup(html_original, "html.parser")
	category = html_parse_get_category(soup)

	article = soup.find(id="hanashi_detail").div
	article = html_parse_remove_extras(article)

	article = html_parse_remove_videos(article)


	final_html = BeautifulSoup("<html><body><h1>" + title + "</h1><p>Category: " + category + "</p></body></html>", "html.parser")
	final_html.p.insert_after(article)

	# don't know what happens, but parse again to make it work
	final_html = BeautifulSoup(final_html.prettify().replace('<div class="inner">',"<article>").replace("</div>","</article>"), "html.parser")

	final_html = html_parse_add_rubies_parenthesis(final_html)

	return final_html.prettify()


def html_to_txt(article_html):
	''' convert HTML to txt
	:gets: text/string
	:return: text/string
	'''

	soup = BeautifulSoup(article_html, "html.parser")

	title = ""
	for string in soup.h1.stripped_strings:
	    title += string

	category = soup.p.string.strip()

	text = list()
	paragraphs  = soup.article.find_all("p")

	for paragraph in paragraphs:
		tmp = ""
		for string in paragraph.stripped_strings:
			tmp += string
		text.append(tmp)
	text = (os.linesep + os.linesep).join(text)

	final_text = '''{title}
================
{category}


{text}
'''
	final_text = final_text.format(title=title,category=category,text=text)
	return final_text


if not DEBUG:
	if len(sys.argv) > 1:
		if not os.path.exists(sys.argv[1]):
			print(" File doesnt exist. Why are you so mean?")
			exit()
		if not sys.argv[1].endswith(".csv") and not sys.argv[1].endswith(".txt"):
			print(" Not a valid file to read. Must be CSV/TXT")
			exit()

		UPDATE_CSV = sys.argv[1]

# read csv that has urls
# Format:
# 0 category
# 1 title
# 2 url
# .... etc


with open(UPDATE_CSV, 'r', encoding="utf-8") as tmp_file:
	 url_column = False
	 if UPDATE_CSV.endswith(".csv"):
	 	tmp_rows = csv.reader(tmp_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
	 	while (not int(url_column)):
		 	   url_column = int(input(" Column for url? "))
	 if UPDATE_CSV.endswith(".txt"):
	 	tmp_rows = tmp_file.readlines()

	 for line in tmp_rows:
		 url = line
		 if url_column:
		 	url = line[url_column]

		 url = url[0:-1].replace("https://hirogaru-nihongo.jp/","") #[0:-1] to remove last /
		 url_parts = url.split("/")

		 if len(url_parts) < 3:
		 	continue

		 slug = url_parts[-1]
		 doc_type = ""
		 category = url_parts[0]


		 if "article" in url_parts and not "en" in url_parts:
		 	doc_type = "article"

		 if "shoku" in url_parts and not "en" in url_parts:
		 	doc_type = "shoku"

		 if "watashi-en" in url_parts:    #brief translation
		 	category = url_parts[1]
		 	doc_type = "watashi"

		 if doc_type:
		 	URLS.append(Article("https://hirogaru-nihongo.jp/" + url + "/", slug, category, doc_type))


# download HTML using requests

for item in URLS:
	r = requests.get(item.url)
	name = item.category_romaji + " > " + item.slug + "(" + item.type + ")"
	if not r:
	   print(" couldn't get page for: " + name)
	   continue

	print("processing: " + name)
	filename = item.category_romaji + "-" + item.type + "-" + item.slug

	if item.type == "article":
	   final_html = process_article(r.text)
	elif item.type == "shoku":
		 final_html = process_shoku(r.text, item.slug)
	elif item.type == "watashi":
		 final_html = process_watashi(r.text, item.slug)

	final_txt = html_to_txt(final_html)

	with open(os.path.join(os.getcwd(), filename + ".html"), 'w', encoding="utf-8") as tmp_file:
		 tmp_file.write(final_html)

	with open(os.path.join(os.getcwd(), filename + ".txt"), 'w', encoding="utf-8") as tmp_file:
		 tmp_file.write(final_txt)
	# -- coding: utf-8 --

	# Download Hirogaru articles to cleaner HTML/TXT. Only JP version unless "watashi"!
	# Notes:
	# Saves in current working dir!
	# In HTML, links to audio
	# saves TXT version (ruby in ()s, no audio)
	# Only articles (eg. has /article/ in url), Shoku (/shoku/), watashi (/watashi/)

	# created around 2018-03-27


	#-------------------------------------------------------------------------------

	import requests
	from bs4 import BeautifulSoup
	import csv
	from collections import namedtuple
	import os
	import sys


	DEBUG = False # true = use a string in script




	print(" This scripts downloads Hirogaru articles from a csv/txt -check source for instructions-, ")
	print(" cleans them and save them (in html and txt) to current working dir.")


	Article = namedtuple('Article', ['url', 'slug', 'category_romaji', 'type'])

	URLS = list()
	UPDATE_CSV = ""


	def html_parse_get_category(html_soup):
	return html_soup.find(id="pankuzu").text.strip()

	def html_parse_remove_figures(html_soup):
	local = html_soup
	while (1):
	if "<figure>" in str(html_soup):
	local.figure.extract()
	else:
	return html_soup

	def html_parse_remove_extras(html_soup):
	local = html_soup
	try:
	local.find(class_="quiz").extract()
	except:
	pass

	try:
	local.find(class_="change_page").extract()
	except:
	pass

	try:
	local.find(id="other_post").extract()
	except:
	pass

	return local

	def html_parse_add_rubies_parenthesis(html_soup):
	local = html_soup
	# add optional () to ruby
	rubies = local.find_all("rt")

	for ruby in rubies:
	tag_open = local.new_tag("rp")
	tag_close = local.new_tag("rp")
	tag_open.string = "("
	tag_close.string = ")"

	ruby.insert_before(tag_open)
	ruby.insert_after(tag_close)
	return local


	def html_parse_remove_videos(html_soup):
	local = html_soup
	# if video, remove them but print message
	videos = local.find_all("div",class_="eq_movie_article")
	videos_rating = local.find_all("div",class_="eq_good")
	if videos:
	print(" !!! article has video/s")
	for video in videos:
	video.extract()
	for rating in videos_rating:
	rating.extract()
	return local


	def process_article(html_original):
	''' Process using Beautiful Soup
	:gets: text/string
	:return: text/string
	'''

	soup = BeautifulSoup(html_original, "html.parser")
	category = soup.find(id="pankuzu").text.strip()
	article = soup.find(id="topics_detail").div
	# dirty way to have title's content
	tmp = article.h2.extract()
	title = "<h1>"
	for child in tmp.children:
	title += str(child)
	title += "</h1>"
	title = BeautifulSoup(title, "html.parser")

	while (1):
	if "<figure>" in str(article):
	article.figure.extract()
	else:
	break

	try:
	article.find(class_="quiz").extract()
	except:
	pass

	try:
	article.find(class_="change_page").extract()
	except:
	pass

	#add controls to audio
	audio_tags = article.find_all("audio")
	for audio_tag in audio_tags:
	source = audio_tag.find("source")
	new_tag = soup.new_tag("audio")
	new_tag["controls"] = ""
	new_tag["preload"] = "auto"
	audio_tag.unwrap()
	source.wrap(new_tag)

	# if video, remove them but print message
	videos = article.find_all("div",class_="eq_movie_article")
	videos_rating = article.find_all("div",class_="eq_good")
	if videos:
	print(" !!! article has video/s")
	for video in videos:
	video.extract()
	for rating in videos_rating:
	rating.extract()

	article_english_site = False # not implemented!
	if article_english_site:
	# glossaries: remove tracking and change ul to dl
	glossaries = article.find_all("aside",class_="glossary")
	for glossary in glossaries:
	del glossary.p["onclick"]
	items = glossary.find_all("li")

	for li in items:
	dt = soup.new_tag("dt")
	dd = soup.new_tag("dd")

	tmp = li.contents
	dt.string = tmp[0].replace(" : ","")
	dd.string = tmp[1].string

	li.insert_before(dt)
	li.insert_after(dd)
	li.extract()

	dl = soup.new_tag("dl")
	glossary.ul.wrap(dl)
	glossary.ul.unwrap()


	# change dls (dd) to paragraphs
	dl_tags = article.find_all("dl", class_="voice_text_wrap")

	for dl_tag in dl_tags:
	new_tag = soup.new_tag("p")
	new_tag.contents = dl_tag.find("dd").contents
	dl_tag.insert_after(new_tag)
	dl_tag.extract()

	final_html = BeautifulSoup("<html><body><p>Category: " + category + "</p></body></html>", "html.parser")
	final_html.p.insert_after(article)
	final_html.p.insert_before(title)

	# don't know what happens, but parse again to make it work
	final_html = BeautifulSoup(final_html.prettify().replace('<div class="inner">',"<article>").replace("</div>","</article>"), "html.parser")

	# add optional () to ruby
	rubies = final_html.find_all("rt")

	for ruby in rubies:
	tag_open = soup.new_tag("rp")
	tag_close = soup.new_tag("rp")
	tag_open.string = "("
	tag_close.string = ")"

	ruby.insert_before(tag_open)
	ruby.insert_after(tag_close)


	return final_html.prettify()


	def process_shoku(html_original,slug):

	soup = BeautifulSoup(html_original, "html.parser")

	category = html_parse_get_category(soup)

	article = soup.find(id="shoku_detail")

	article = html_parse_remove_figures(article)
	article = html_parse_remove_extras(article)

	title = "食 - " + slug


	final_html = BeautifulSoup("<html><body><h1>" + title + "</h1><p>Category: " + category + "</p></body></html>", "html.parser")
	final_html.p.insert_after(article)

	# don't know what happens, but parse again to make it work
	final_html = BeautifulSoup(final_html.prettify().replace('<div class="inner">',"<article>").replace("</div>","</article>"), "html.parser")

	final_html = html_parse_add_rubies_parenthesis(final_html)

	return final_html.prettify()



	def process_watashi(html_original,slug):

	title = "トピックと私 - " + slug
	soup = BeautifulSoup(html_original, "html.parser")
	category = html_parse_get_category(soup)

	article = soup.find(id="hanashi_detail").div
	article = html_parse_remove_extras(article)

	article = html_parse_remove_videos(article)


	final_html = BeautifulSoup("<html><body><h1>" + title + "</h1><p>Category: " + category + "</p></body></html>", "html.parser")
	final_html.p.insert_after(article)

	# don't know what happens, but parse again to make it work
	final_html = BeautifulSoup(final_html.prettify().replace('<div class="inner">',"<article>").replace("</div>","</article>"), "html.parser")

	final_html = html_parse_add_rubies_parenthesis(final_html)

	return final_html.prettify()



	def html_to_txt(article_html):
	''' convert HTML to txt
	:gets: text/string
	:return: text/string
	'''

	soup = BeautifulSoup(article_html, "html.parser")

	title = ""
	for string in soup.h1.stripped_strings:
	title += string

	category = soup.p.string.strip()

	text = list()
	paragraphs = soup.article.find_all("p")

	for paragraph in paragraphs:
	tmp = ""
	for string in paragraph.stripped_strings:
	tmp += string
	text.append(tmp)
	text = (os.linesep + os.linesep).join(text)

	final_text = '''{title}
	================
	{category}


	{text}
	'''
	final_text = final_text.format(title=title,category=category,text=text)
	return final_text



	if not DEBUG:
	if len(sys.argv) > 1:
	if not os.path.exists(sys.argv[1]):
	print(" File doesnt exist. Why are you so mean?")
	exit()
	if not sys.argv[1].endswith(".csv") and not sys.argv[1].endswith(".txt"):
	print(" Not a valid file to read. Must be CSV/TXT")
	exit()

	UPDATE_CSV = sys.argv[1]

	# read csv that has urls
	# Format:
	# 0 category
	# 1 title
	# 2 url
	# .... etc





	with open(UPDATE_CSV, 'r', encoding="utf-8") as tmp_file:
	url_column = False
	if UPDATE_CSV.endswith(".csv"):
	tmp_rows = csv.reader(tmp_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
	while (not int(url_column)):
	url_column = int(input(" Column for url? "))
	if UPDATE_CSV.endswith(".txt"):
	tmp_rows = tmp_file.readlines()

	for line in tmp_rows:
	url = line
	if url_column:
	url = line[url_column]

	url = url[0:-1].replace("https://hirogaru-nihongo.jp/","") #[0:-1] to remove last /
	url_parts = url.split("/")

	if len(url_parts) < 3:
	continue

	slug = url_parts[-1]
	doc_type = ""
	category = url_parts[0]


	if "article" in url_parts and not "en" in url_parts:
	doc_type = "article"

	if "shoku" in url_parts and not "en" in url_parts:
	doc_type = "shoku"

	if "watashi-en" in url_parts: #brief translation
	category = url_parts[1]
	doc_type = "watashi"

	if doc_type:
	URLS.append(Article("https://hirogaru-nihongo.jp/" + url + "/", slug, category, doc_type))


	# download HTML using requests

	for item in URLS:
	r = requests.get(item.url)
	name = item.category_romaji + " > " + item.slug + "(" + item.type + ")"
	if not r:
	print(" couldn't get page for: " + name)
	continue

	print("processing: " + name)
	filename = item.category_romaji + "-" + item.type + "-" + item.slug

	if item.type == "article":
	final_html = process_article(r.text)
	elif item.type == "shoku":
	final_html = process_shoku(r.text, item.slug)
	elif item.type == "watashi":
	final_html = process_watashi(r.text, item.slug)

	final_txt = html_to_txt(final_html)

	with open(os.path.join(os.getcwd(), filename + ".html"), 'w', encoding="utf-8") as tmp_file:
	tmp_file.write(final_html)

	with open(os.path.join(os.getcwd(), filename + ".txt"), 'w', encoding="utf-8") as tmp_file:
	tmp_file.write(final_txt)