Created
October 2, 2015 17:49
-
-
Save kogecoo/2e7db09e52ddb4fe011d to your computer and use it in GitHub Desktop.
super tiny Aozora Bunko html2text converter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -* | |
import argparse
import io

import requests
from bs4 import BeautifulSoup as BS
def convert(url, out, midashi, need_title, need_author):
    """Fetch an Aozora Bunko HTML page and save its plain text as <out>.txt.

    Args:
        url: URL of the Aozora Bunko HTML page.
        out: output file name, without the ".txt" suffix (appended here).
        midashi: keep heading ("midashi") elements when True; strip them
            when False.
        need_title: prepend the book title (element with class "title").
        need_author: prepend the author name (element with class "author").

    Raises:
        ValueError: if the page has no <div class="main_text"> element.
    """
    req = requests.get(url)
    soup = BS(req.content, "html.parser")

    title = ""
    author = ""
    if need_title:
        title = soup.find(class_="title").getText() + "\n"
    if need_author:
        author = soup.find(class_="author").getText() + "\n"

    main_text = soup.find("div", class_="main_text")
    if main_text is None:
        raise ValueError("no <div class='main_text'> found at {0}".format(url))

    # Drop ruby annotations: <rt> holds the furigana reading, <rp> the
    # fallback parentheses; extracting them leaves only the base text.
    for tag in main_text.findAll(["rt", "rp"]):
        tag.extract()

    # Drop headings unless the caller asked to keep them.
    if not midashi:
        for tag in main_text.findAll(class_=["o-midashi", "naka-midashi"]):
            tag.extract()

    txt = title + author + main_text.getText()

    # Collapse runs of consecutive newlines down to single newlines.
    while "\n\n" in txt:
        txt = txt.replace("\n\n", "\n")

    # io.open writes unicode with an explicit encoding on both
    # Python 2 and Python 3 (the old open+encode pattern was py2-only).
    with io.open("{0}.txt".format(out), "w", encoding="utf-8") as fw:
        fw.write(txt)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Aozora html to text converter')
    parser.add_argument('--url', metavar='URL', nargs='+', type=str, required=True,
                        help='url(s) of aozora bunko html page')
    parser.add_argument('--out', metavar='OUT.txt', nargs='+', type=str, required=True,
                        help='name(s) of output text (need to be the same num of urls)')
    parser.add_argument('--midashi', default=False, action="store_true",
                        help='need midashi')
    parser.add_argument('--title', default=False, action="store_true",
                        help='need title')
    parser.add_argument('--author', default=False, action="store_true",
                        help='need author')
    args = parser.parse_args()

    # Bug fix: the original used `is not` (identity, not value comparison)
    # and, even on a mismatch, fell through into the zip() loop below, which
    # would silently truncate to the shorter list. parser.error() prints the
    # message to stderr and exits with status 2.
    if len(args.url) != len(args.out):
        parser.error("urls and outs must be the same length")

    for url, out in zip(args.url, args.out):
        convert(url, out, args.midashi, args.title, args.author)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
prerequisite:
usage:
You can obtain tag removed text file named
output_filename.txt
with the following command. Or you can specify multiple urls (this requires the same number of output filenames to be specified).
If you need midashi, title, or author texts, you can obtain them by appending the following arg(s).
Real Example
The following example downloads 'The old man and the sea' and 'Kokomo' and saves them to 'oldmansea.txt' and 'kokomo.txt'. These texts include the book's title because the command has the --title flag.