@esehara
Created July 23, 2011 05:29
Aozora Author to Epub
# -*- coding:utf-8 -*-
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
import re
import os
import zipfile
"""
使い方:
青空文庫の著者カードを下のURLに記述してください。
そのあとにスクリプトを起動すると、その作者で公開されている作品のXHTMLファイルを
全てダウンロードして、epubにまとめてくれます。
注意:
デフォルトの作業デフェクトリは "./temp/"になっています。
もしスクリプトが置いてあるフォルダに、
フォルダが存在しない場合は、"./temp/"を作成してください。
"""
author_url = 'http://www.aozora.gr.jp/index_pages/person281.html'
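
# Overview of the flow below: Aozora_to_epub drives three helpers --
# Aozora_author_url scrapes the author card for links to works, Aozora_works_get
# downloads each work's XHTML into the workspace, and Epub_Make writes the
# OPF/NCX metadata and zips everything into an .epub.
#
# Illustrative sketch (not part of the original script): the "./temp/" workspace
# mentioned in the usage note above could be created up front like this, so the
# script does not fail when the folder is missing:
#
#     if not os.path.isdir('./temp/'):
#         os.mkdir('./temp/')
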
class Aozora_to_epub(object):
    def __init__(self, url):
        self.url = url
        self.epub = Epub_Make()
        self.author = Aozora_author_url()

    def get_content(self):
        self.author.get_url(self.url)
        self.epub.create_init(self.author.analyzer(self.epub.workspace))
        self.author.get_html()

    def make_epub(self):
        self.epub.make_opf(self.url)
        self.epub.make_ncx(self.url)
        self.epub.make_zip()

class Aozora_author_url(object):
    def __init__(self):
        self.opener = urllib2.build_opener()

    def get_url(self, url):
        html = self.opener.open(url).read()
        self.mainsoup = BeautifulSoup(html)

    def analyzer(self, workspace):
        self.worklist = []
        subsoup = self.mainsoup.find('ol')
        self.author = self.mainsoup.find('font').text
        for item in subsoup.findAll('a'):
            print item.text
            #print item['href']
            self.worklist.append({'title': item.text, 'url': 'http://www.aozora.gr.jp/index_pages/' + item['href']})
        self.works_get = Aozora_works_get(workspace + self.author)
        return self.author

    def get_html(self):
        for item in self.worklist:
            self.works_get.get_url(item['url'])
            self.works_get.analyzer()
            self.works_get.download()

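# Illustrative example (assumed, not from the original gist): each entry appended
# to self.worklist above has the shape
#
#     {'title': u'Some work title',
#      'url': 'http://www.aozora.gr.jp/index_pages/' + href}
#
# where href is whatever the <ol> list of works on the author card page links to.
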
class Aozora_works_get(object):
    def __init__(self, dir='./temp/'):
        self.opener = urllib2.build_opener()
        self.re_html = re.compile('.*html')
        self.save_dir = dir
        self.encode = System_charset()

    def get_url(self, url):
        html = self.opener.open(url).read()
        self.mainsoup = BeautifulSoup(html)
        url = url.split('/')
        url.pop()
        self.base_url = '/'.join(url)

    def analyzer(self):
        try:
            subsoup = self.mainsoup.findAll('table')[5]
            linksoup = subsoup.findAll('a')
            self.author_data = self.mainsoup.find('font').text
            for itemsoup in linksoup:
                if self.re_html.match(itemsoup['href']) is not None:
                    #print u"[DEBUG]Download XHTML => " + itemsoup['href']
                    self.download_que = itemsoup['href']
                    return
        except:
            print '[DEBUG] Sorry, there is a problem with this file...'
        self.download_que = False

    def download(self):
        if self.download_que:
            #print u"[DEBUG]self.base_url => " + self.base_url
            #print u"[DEBUG]self.download_que => " + self.download_que
            print u"[DEBUG]get url => " + self.base_url + '/' + self.download_que
            urllib.urlretrieve(self.base_url + '/' + self.download_que, self.save_dir + u'/text/' + self.author_data + u".xhtml")
            self.encode.work(self.save_dir + '/text/' + self.author_data + ".xhtml")

class System_charset(object):
    """
    Reference: http://www.aozora.gr.jp/index_pages/person281.html
    """
    def guess_charset(self, data):
        f = lambda d, enc: d.decode(enc) and enc
        try: return f(data, 'utf-8')
        except: pass
        try: return f(data, 'shift-jis')
        except: pass
        try: return f(data, 'euc-jp')
        except: pass
        try: return f(data, 'iso2022-jp')
        except: pass
        return None

    def conv(self, data):
        charset = self.guess_charset(data)
        u = data.decode(charset)
        return u.encode('utf-8')

    def work(self, data):
        print "[DEBUG] Convert data UTF-8 Start :" + data
        convert_data = file(data, 'rU')
        work_data = convert_data.read()
        work_data = re.sub('Shift_JIS', 'UTF-8', work_data)
        convert_data.close()
        try:
            work_data = self.conv(work_data)
        except:
            print "[DEBUG]", data, "-> skip"
        convert_data = file(data, 'w')
        convert_data.write(work_data)
        convert_data.close()
        print "[DEBUG]", data, '-> Convert UTF-8'

class Epub_Make(object):
    def __init__(self, workspace='./temp/'):
        self.workspace = workspace
        self.author = 'hogehoge'

    def create_init(self, author):
        workdir = self.workspace + author + '/'
        self.author = author
        os.mkdir(workdir)
        os.mkdir(workdir + 'text')
        os.mkdir(workdir + 'META-INF')
        mimetype = open(workdir + 'mimetype', 'w')
        mimetype.write('application/epub+zip\n')
        mimetype.close()
        container_xml = open(workdir + 'META-INF/container.xml', 'w')
        make_xml = """<?xml version='1.0' encoding="UTF-8"?>
<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
<rootfiles>
<rootfile full-path="content.opf" media-type="application/oebps-package+xml" />
</rootfiles>
</container>
"""
        container_xml.write(make_xml)
        container_xml.close()
        self.workdir = workdir
    def make_opf(self, url):
        xml_header = u"""<?xml version="1.0" encoding="UTF-8"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
<dc:title>%s 全部</dc:title>
<dc:creator opf:role="aut">%s</dc:creator>
<dc:language>ja</dc:language>
<dc:publisher>Aozora Bunko to Epub (青空文庫) </dc:publisher>
<dc:identifier id="BookId">%s</dc:identifier>
</metadata>
""" % (self.author, self.author, url)
        xml_manifest_header = u"""
<manifest>
<item id="ncx" href="toc.ncx" media-type="text/xml" />
"""
        xml_manifest_body = u""
        filelist = os.listdir(self.workdir + '/text/')
        for number, file in enumerate(filelist):
            xml_manifest_body += u"<item id='file%d' href='text/%s' media-type='application/xhtml+xml' /> " % (number + 1, str(number + 1) + '.xhtml')
        xml_manifest_body += u'</manifest>'
        xml_footer = u"""
<spine toc="ncx">
<itemref idref="file1" />
</spine>
</package>
"""
        print "[DEBUG] ---- Output OPF File ----"
        opffile = open(self.workdir + 'content.opf', 'w')
        print xml_header
        opffile.write(xml_header.encode('utf-8'))
        print xml_manifest_header
        opffile.write(xml_manifest_header.encode('utf-8'))
        print xml_manifest_body
        opffile.write(xml_manifest_body.encode('utf-8'))
        print xml_footer
        opffile.write(xml_footer.encode('utf-8'))
        opffile.close()
        self.filelist = filelist
    def make_ncx(self, url):
        xml_header = u"""<?xml version="1.0" encoding="UTF-8" ?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name='dtb:uid' content="%s" />
<meta name='dtb:depth' content="1" />
<meta name='dtb:totalPageCount' content="0" />
<meta name='dtb:maxPageNumber' content="0" />
</head>
<docTitle>
<text>%s 全部</text>
</docTitle>
<docAuthor>
<text>%s</text>
</docAuthor>
""" % (url, self.author, self.author)
        xml_navMap = u"<navMap>"
        for number, file in enumerate(self.filelist):
            xml_navMap += u"""<navPoint id='file%d' playOrder='%d'>
<navLabel>
<text>%s</text>
</navLabel>
<content src="text/%s" />
</navPoint>
""" % (number + 1, number + 1, file.split('.')[0], str(number + 1) + '.xhtml')
            os.rename(self.workdir + '/text/' + file, self.workdir + '/text/' + str(number + 1) + '.xhtml')
        xml_navMap += u"</navMap>"
        xml_navMap += u"</ncx>"
        ncxfile = open(self.workdir + 'toc.ncx', 'w')
        print "[DEBUG] ---- Output Ncx File ----"
        print xml_header
        ncxfile.write(xml_header.encode('utf-8'))
        print xml_navMap
        ncxfile.write(xml_navMap.encode('utf-8'))
        ncxfile.close()
    def make_zip(self):
        epub_zip = zipfile.ZipFile(re.sub(' ', '', self.author) + '.epub', 'w', zipfile.ZIP_DEFLATED)
        filelist = os.listdir(self.workdir)
        for file in filelist:
            epub_zip.write(self.workdir + file, file)
        filelist = os.listdir(self.workdir + 'META-INF')
        for file in filelist:
            epub_zip.write(self.workdir + 'META-INF/' + file, './META-INF/' + file)
        filelist = os.listdir(self.workdir + 'text')
        for file in filelist:
            epub_zip.write(self.workdir + 'text/' + file, './text/' + file)
        epub_zip.close()

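# The archive produced above contains mimetype, META-INF/container.xml,
# content.opf, toc.ncx and the numbered XHTML files under text/. Note that the
# EPUB OCF spec expects "mimetype" to be the first zip entry and to be stored
# uncompressed; this script adds it deflated like everything else, which lenient
# readers accept. A possible tightening (illustrative, not in the original)
# would be to write it first with:
#
#     epub_zip.writestr(zipfile.ZipInfo('mimetype'), 'application/epub+zip')
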
def test():
    global author_url
    test_def = Aozora_to_epub(author_url)
    test_def.get_content()
    test_def.make_epub()

if __name__ == '__main__':
    test()
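
# Usage example (illustrative; the filename is hypothetical): point author_url
# at any Aozora Bunko author card of the form
# 'http://www.aozora.gr.jp/index_pages/personNNN.html', then run
#
#     $ python aozora_to_epub.py
#
# which downloads every listed work and writes <author>.epub in the current
# working directory.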
@headbaker
Hi! Is this still working?
