Created
October 2, 2015 17:49
-
-
Save kogecoo/2e7db09e52ddb4fe011d to your computer and use it in GitHub Desktop.
super tiny Aozora Bunko html2text converter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -* | |
import argparse
import io

import requests
from bs4 import BeautifulSoup as BS
def convert(url, out, midashi, need_title, need_author):
    """Fetch an Aozora Bunko HTML page and save its plain text as <out>.txt.

    Args:
        url: URL of the Aozora Bunko HTML page.
        out: output file name, without the ".txt" suffix (appended here).
        midashi: keep heading ("midashi") elements when True; strip them
            when False.
        need_title: prepend the book title (element with class "title").
        need_author: prepend the author name (element with class "author").

    Raises:
        ValueError: if the page has no <div class="main_text"> element.
    """
    req = requests.get(url)
    soup = BS(req.content, "html.parser")

    title = ""
    author = ""
    if need_title:
        title = soup.find(class_="title").getText() + "\n"
    if need_author:
        author = soup.find(class_="author").getText() + "\n"

    main_text = soup.find("div", class_="main_text")
    if main_text is None:
        raise ValueError("no <div class='main_text'> found at {0}".format(url))

    # Drop ruby annotations: <rt> holds the furigana reading, <rp> the
    # fallback parentheses; extracting them leaves only the base text.
    for tag in main_text.findAll(["rt", "rp"]):
        tag.extract()

    # Drop headings unless the caller asked to keep them.
    if not midashi:
        for tag in main_text.findAll(class_=["o-midashi", "naka-midashi"]):
            tag.extract()

    txt = title + author + main_text.getText()

    # Collapse runs of consecutive newlines down to single newlines.
    while "\n\n" in txt:
        txt = txt.replace("\n\n", "\n")

    # io.open writes unicode with an explicit encoding on both
    # Python 2 and Python 3 (the old open+encode pattern was py2-only).
    with io.open("{0}.txt".format(out), "w", encoding="utf-8") as fw:
        fw.write(txt)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Aozora html to text converter')
    parser.add_argument('--url', metavar='URL', nargs='+', type=str, required=True,
                        help='url(s) of aozora bunko html page')
    parser.add_argument('--out', metavar='OUT.txt', nargs='+', type=str, required=True,
                        help='name(s) of output text (need to be the same num of urls)')
    parser.add_argument('--midashi', default=False, action="store_true",
                        help='need midashi')
    parser.add_argument('--title', default=False, action="store_true",
                        help='need title')
    parser.add_argument('--author', default=False, action="store_true",
                        help='need author')
    args = parser.parse_args()

    # Bug fix: the original used `is not` (identity, not value comparison)
    # and, even on a mismatch, fell through into the zip() loop below, which
    # would silently truncate to the shorter list. parser.error() prints the
    # message to stderr and exits with status 2.
    if len(args.url) != len(args.out):
        parser.error("urls and outs must be the same length")

    for url, out in zip(args.url, args.out):
        convert(url, out, args.midashi, args.title, args.author)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
prerequisite:
usage:
You can obtain tag removed text file named
output_filename.txt
with the following command. Or you can specify multiple urls (this requires the same number of output filenames to be specified).
If you need midashi, title, or author texts, you can obtain them by appending the following arg(s).
Real Example
The following example downloads 'The old man and the sea' and 'Kokomo' and saves them to 'oldmansea.txt' and 'kokomo.txt'. These texts include the book's title because the command has the --title flag.