esehara/gist:1327644

## gistfile1.py
# -*- coding: utf-8 -*-
import os
import urllib2
import BeautifulSoup
import re
import zipfile
class Epub_maker(object):
    """Build a minimal EPUB 2 package from chapter files placed in ``OEBPS/``.

    Workflow: instantiate (creates the working tree), write the chapter
    files into ``self.workdir + "OEBPS/"``, then call ``make_opf()``,
    ``make_ncx()`` and ``make_zip()``.
    """

    def __init__(self, filename, title, author, identi, workspace='./temp/'):
        """Create the working tree and the fixed container files.

        Layout produced::

            <workspace><filename>/mimetype
            <workspace><filename>/META-INF/container.xml
            <workspace><filename>/OEBPS/

        filename  -- name of the working directory; with spaces removed it
                     is also the basename of the ``.epub`` archive.
        title, author, identi -- metadata written to the OPF and NCX files.
        workspace -- prefix of the working directory; it is concatenated
                     as-is, so it has to end with a path separator.
        """
        self.filename = filename
        self.title = title
        self.author = author
        self.identi = identi
        # Plain concatenation with a trailing slash is kept on purpose:
        # ProjectEular.gen builds paths as ``epub.workdir + "OEBPS/" + ...``.
        self.workdir = workspace + filename + "/"

        # Create the tree including missing parents (the default ./temp/
        # usually does not exist yet); an already existing tree is reused.
        for directory in (self.workdir,
                          self.workdir + "OEBPS",
                          self.workdir + "META-INF"):
            if not os.path.isdir(directory):
                os.makedirs(directory)

        # The container format wants the mimetype file to hold exactly this
        # string, so no trailing newline is written.
        self._write_text(self.workdir + "mimetype", u"application/epub+zip")

        container_xml = u"\n".join([
            u'<?xml version="1.0" encoding="UTF-8"?>',
            u'<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">',
            u'  <rootfiles>',
            u'    <rootfile full-path="content.opf" media-type="application/oebps-package+xml" />',
            u'  </rootfiles>',
            u'</container>',
            u'',
        ])
        self._write_text(self.workdir + 'META-INF/container.xml', container_xml)

    @staticmethod
    def _write_text(path, text):
        """Write the unicode string *text* to *path* as UTF-8."""
        # Binary mode: the data is already encoded and must not go through
        # platform newline translation.
        with open(path, "wb") as handle:
            handle.write(text.encode("utf-8"))

    @staticmethod
    def _xml_text(value):
        """Return *value* as text escaped for XML character data."""
        from xml.sax.saxutils import escape
        return escape(u"%s" % (value,))

    @staticmethod
    def _xml_attr(value):
        """Return *value* as a quoted, escaped XML attribute value."""
        from xml.sax.saxutils import quoteattr
        return quoteattr(u"%s" % (value,))

    @staticmethod
    def _natural_key(name):
        """Sort key ordering embedded numbers numerically (2 before 10)."""
        # re.split with a capturing group alternates text / digits, so the
        # odd positions are always the digit runs and same-typed values
        # are compared position by position.
        parts = re.split(r'(\d+)', name)
        return [int(part) if index % 2 else part
                for index, part in enumerate(parts)]

    @staticmethod
    def _visible_files(directory):
        """Return the sorted names of regular, non-hidden files in *directory*."""
        return sorted(name for name in os.listdir(directory)
                      if not name.startswith(".")
                      and os.path.isfile(os.path.join(directory, name)))

    def _chapter_files(self):
        """Return the chapter file names found in ``OEBPS/`` in reading order."""
        return sorted(self._visible_files(self.workdir + 'OEBPS/'),
                      key=self._natural_key)

    def make_opf(self):
        """Write ``content.opf`` (metadata, manifest, spine).

        Every visible file in ``OEBPS/`` becomes one manifest item and one
        spine entry, in natural (numeric-aware) name order.  The list is
        stored in ``self.filelist`` for ``make_ncx``.
        """
        filelist = self._chapter_files()

        lines = [
            u'<?xml version="1.0" encoding="UTF-8"?>',
            u'<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">',
            u'  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">',
            u'    <dc:title>%s</dc:title>' % self._xml_text(self.title),
            u'    <dc:creator opf:role="aut">%s</dc:creator>' % self._xml_text(self.author),
            u'    <dc:language>ja</dc:language>',
            u'    <dc:publisher>Epub Maker (Python) </dc:publisher>',
            u'    <dc:identifier id="BookId">%s</dc:identifier>' % self._xml_text(self.identi),
            u'  </metadata>',
            u'  <manifest>',
            u'    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />',
        ]
        for number, name in enumerate(filelist):
            # hrefs are relative to content.opf, which sits next to OEBPS/,
            # and must name the files that really exist on disk.
            lines.append(
                u'    <item id="file%d" href=%s media-type="application/xhtml+xml" />'
                % (number + 1, self._xml_attr(u"OEBPS/%s" % (name,))))
        lines.append(u'  </manifest>')
        lines.append(u'  <spine toc="ncx">')
        for number in range(len(filelist)):
            lines.append(u'    <itemref idref="file%d" />' % (number + 1))
        lines.append(u'  </spine>')
        lines.append(u'</package>')
        lines.append(u'')
        document = u"\n".join(lines)

        print("[DEBUG] ---- Output OPF File ----")
        print(document)
        self._write_text(self.workdir + 'content.opf', document)
        self.filelist = filelist

    def make_ncx(self):
        """Write ``toc.ncx`` with one navPoint per chapter.

        Uses the list stored by ``make_opf``; when ``make_opf`` has not run
        yet the chapter list is read from ``OEBPS/`` directly.  Each entry
        is labelled with its 1-based position.
        """
        filelist = getattr(self, 'filelist', None)
        if filelist is None:
            filelist = self._chapter_files()

        lines = [
            u'<?xml version="1.0" encoding="UTF-8" ?>',
            u'<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">',
            u'  <head>',
            u'    <meta name="dtb:uid" content=%s />' % self._xml_attr(self.identi),
            u'    <meta name="dtb:depth" content="1" />',
            u'    <meta name="dtb:totalPageCount" content="0" />',
            u'    <meta name="dtb:maxPageNumber" content="0" />',
            u'  </head>',
            u'  <docTitle>',
            u'    <text>%s</text>' % self._xml_text(self.title),
            u'  </docTitle>',
            u'  <docAuthor>',
            u'    <text>%s</text>' % self._xml_text(self.author),
            u'  </docAuthor>',
            u'  <navMap>',
        ]
        for number, name in enumerate(filelist):
            position = number + 1
            lines.append(u'    <navPoint id="file%d" playOrder="%d">' % (position, position))
            lines.append(u'      <navLabel>')
            lines.append(u'        <text>%d</text>' % position)
            lines.append(u'      </navLabel>')
            lines.append(u'      <content src=%s />' % self._xml_attr(u"OEBPS/%s" % (name,)))
            lines.append(u'    </navPoint>')
        lines.append(u'  </navMap>')
        lines.append(u'</ncx>')
        lines.append(u'')
        document = u"\n".join(lines)

        print("[DEBUG] ---- Output Ncx File ----")
        print(document)
        self._write_text(self.workdir + 'toc.ncx', document)

    def make_zip(self):
        """Pack the working tree into ``<filename without spaces>.epub``.

        The archive is created in the current working directory.  The
        ``mimetype`` entry is written first and uncompressed; everything
        else is deflated.  Hidden files and sub-directories are skipped.
        """
        archive_name = re.sub(' ', '', self.filename) + '.epub'
        epub_zip = zipfile.ZipFile(archive_name, 'w', zipfile.ZIP_DEFLATED)
        try:
            # Readers identify an EPUB by this leading, stored entry.
            epub_zip.write(self.workdir + 'mimetype', 'mimetype',
                           zipfile.ZIP_STORED)
            for name in self._visible_files(self.workdir):
                if name != 'mimetype':
                    epub_zip.write(self.workdir + name, name)
            for folder in ('META-INF', 'OEBPS'):
                for name in self._visible_files(self.workdir + folder):
                    epub_zip.write(self.workdir + folder + '/' + name,
                                   folder + '/' + name)
        finally:
            epub_zip.close()

class ProjectEular(object):
    """Download the Japanese Project Euler mirror and package it as an EPUB."""

    # Index page listing the problems; also used as the book identifier.
    INDEX_URL = "http://odz.sakura.ne.jp/projecteuler/index.php"

    @staticmethod
    def _fetch(url):
        """Return the raw body of *url*, closing the response afterwards."""
        response = urllib2.build_opener().open(url)
        try:
            return response.read()
        finally:
            response.close()

    def gen(self):
        """Fetch every linked problem page and build ``peuler.epub``.

        The links are taken from the third ``<ul class="list1">`` of the
        index page.  Page number N (0-based) is stored as ``OEBPS/N.xhtml``
        in the Epub_maker working directory before the OPF, NCX and zip
        archive are generated.
        """
        epub = Epub_maker(filename="peuler",
                          title="Project Euler",
                          author="Project Euler",
                          identi=self.INDEX_URL)
        mainsoup = BeautifulSoup.BeautifulSoup(self._fetch(self.INDEX_URL))
        links = mainsoup.findAll("ul", {"class": "list1"})[2].findAll("a")
        for num, link in enumerate(links):
            print(link.text)
            print(link["href"])
            # Render first, open afterwards: a failed download must not
            # leave an empty chapter file (or an open handle) behind.
            page = self.gen_xml(link["href"])
            with open(epub.workdir + "OEBPS/" + str(num) + ".xhtml", "wb") as gen_file:
                gen_file.write(page.encode("utf-8"))
        epub.make_opf()
        epub.make_ncx()
        epub.make_zip()

    def gen_xml(self, page):
        """Return an XHTML document built from the page at URL *page*.

        The first ``<h1>`` supplies the title and heading; the text of every
        ``<p>`` except the last one becomes a paragraph of the body.
        """
        soup = BeautifulSoup.BeautifulSoup(self._fetch(page))
        heading = soup.find("h1")
        # A page without <h1> gets a placeholder instead of aborting the book.
        title = heading.text if heading is not None else u"(untitled)"

        # Collected once instead of re-querying the tree on every iteration.
        # The last <p> is dropped as before - presumably a page footer;
        # TODO confirm against the site.
        paragraphs = soup.findAll("p")
        # NOTE(review): .text is inserted without XML escaping; presumably
        # BeautifulSoup 3 leaves the source entities unconverted - confirm.
        body_str = u"".join(u"<p>%s</p>\n" % paragraph.text
                            for paragraph in paragraphs[:-1])

        # The public identifier of the XHTML 1.1 DTD ends in //EN; it names
        # the language of the DTD, not of the document.
        return (
            u'<?xml version="1.0" encoding="UTF-8"?>\n'
            u'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
            u'<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="ja" lang="ja">\n'
            u'<head>\n'
            u'<title>%s</title>\n'
            u'</head>\n'
            u'<body>\n'
            u'<h1>%s</h1>\n'
            u'%s'
            u'</body>\n'
            u'</html>\n'
        ) % (title, title, body_str)

if __name__ == "__main__":
    # Script entry point: scrape the site and build the EPUB.
    ProjectEular().gen()
	# -*- coding: utf-8 -*-
	import os
	import urllib2
	import BeautifulSoup
	import re
	import zipfile
	class Epub_maker(object):
	    """Build a minimal EPUB 2 package from chapter files placed in ``OEBPS/``.

	    Workflow: instantiate (creates the working tree), write the chapter
	    files into ``self.workdir + "OEBPS/"``, then call ``make_opf()``,
	    ``make_ncx()`` and ``make_zip()``.
	    """

	    def __init__(self, filename, title, author, identi, workspace='./temp/'):
	        """Create the working tree and the fixed container files.

	        Layout produced::

	            <workspace><filename>/mimetype
	            <workspace><filename>/META-INF/container.xml
	            <workspace><filename>/OEBPS/

	        filename  -- name of the working directory; with spaces removed it
	                     is also the basename of the ``.epub`` archive.
	        title, author, identi -- metadata written to the OPF and NCX files.
	        workspace -- prefix of the working directory; it is concatenated
	                     as-is, so it has to end with a path separator.
	        """
	        self.filename = filename
	        self.title = title
	        self.author = author
	        self.identi = identi
	        # Plain concatenation with a trailing slash is kept on purpose:
	        # ProjectEular.gen builds paths as ``epub.workdir + "OEBPS/" + ...``.
	        self.workdir = workspace + filename + "/"

	        # Create the tree including missing parents (the default ./temp/
	        # usually does not exist yet); an already existing tree is reused.
	        for directory in (self.workdir,
	                          self.workdir + "OEBPS",
	                          self.workdir + "META-INF"):
	            if not os.path.isdir(directory):
	                os.makedirs(directory)

	        # The container format wants the mimetype file to hold exactly this
	        # string, so no trailing newline is written.
	        self._write_text(self.workdir + "mimetype", u"application/epub+zip")

	        container_xml = u"\n".join([
	            u'<?xml version="1.0" encoding="UTF-8"?>',
	            u'<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">',
	            u'  <rootfiles>',
	            u'    <rootfile full-path="content.opf" media-type="application/oebps-package+xml" />',
	            u'  </rootfiles>',
	            u'</container>',
	            u'',
	        ])
	        self._write_text(self.workdir + 'META-INF/container.xml', container_xml)

	    @staticmethod
	    def _write_text(path, text):
	        """Write the unicode string *text* to *path* as UTF-8."""
	        # Binary mode: the data is already encoded and must not go through
	        # platform newline translation.
	        with open(path, "wb") as handle:
	            handle.write(text.encode("utf-8"))

	    @staticmethod
	    def _xml_text(value):
	        """Return *value* as text escaped for XML character data."""
	        from xml.sax.saxutils import escape
	        return escape(u"%s" % (value,))

	    @staticmethod
	    def _xml_attr(value):
	        """Return *value* as a quoted, escaped XML attribute value."""
	        from xml.sax.saxutils import quoteattr
	        return quoteattr(u"%s" % (value,))

	    @staticmethod
	    def _natural_key(name):
	        """Sort key ordering embedded numbers numerically (2 before 10)."""
	        # re.split with a capturing group alternates text / digits, so the
	        # odd positions are always the digit runs and same-typed values
	        # are compared position by position.
	        parts = re.split(r'(\d+)', name)
	        return [int(part) if index % 2 else part
	                for index, part in enumerate(parts)]

	    @staticmethod
	    def _visible_files(directory):
	        """Return the sorted names of regular, non-hidden files in *directory*."""
	        return sorted(name for name in os.listdir(directory)
	                      if not name.startswith(".")
	                      and os.path.isfile(os.path.join(directory, name)))

	    def _chapter_files(self):
	        """Return the chapter file names found in ``OEBPS/`` in reading order."""
	        return sorted(self._visible_files(self.workdir + 'OEBPS/'),
	                      key=self._natural_key)

	    def make_opf(self):
	        """Write ``content.opf`` (metadata, manifest, spine).

	        Every visible file in ``OEBPS/`` becomes one manifest item and one
	        spine entry, in natural (numeric-aware) name order.  The list is
	        stored in ``self.filelist`` for ``make_ncx``.
	        """
	        filelist = self._chapter_files()

	        lines = [
	            u'<?xml version="1.0" encoding="UTF-8"?>',
	            u'<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">',
	            u'  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">',
	            u'    <dc:title>%s</dc:title>' % self._xml_text(self.title),
	            u'    <dc:creator opf:role="aut">%s</dc:creator>' % self._xml_text(self.author),
	            u'    <dc:language>ja</dc:language>',
	            u'    <dc:publisher>Epub Maker (Python) </dc:publisher>',
	            u'    <dc:identifier id="BookId">%s</dc:identifier>' % self._xml_text(self.identi),
	            u'  </metadata>',
	            u'  <manifest>',
	            u'    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />',
	        ]
	        for number, name in enumerate(filelist):
	            # hrefs are relative to content.opf, which sits next to OEBPS/,
	            # and must name the files that really exist on disk.
	            lines.append(
	                u'    <item id="file%d" href=%s media-type="application/xhtml+xml" />'
	                % (number + 1, self._xml_attr(u"OEBPS/%s" % (name,))))
	        lines.append(u'  </manifest>')
	        lines.append(u'  <spine toc="ncx">')
	        for number in range(len(filelist)):
	            lines.append(u'    <itemref idref="file%d" />' % (number + 1))
	        lines.append(u'  </spine>')
	        lines.append(u'</package>')
	        lines.append(u'')
	        document = u"\n".join(lines)

	        print("[DEBUG] ---- Output OPF File ----")
	        print(document)
	        self._write_text(self.workdir + 'content.opf', document)
	        self.filelist = filelist

	    def make_ncx(self):
	        """Write ``toc.ncx`` with one navPoint per chapter.

	        Uses the list stored by ``make_opf``; when ``make_opf`` has not run
	        yet the chapter list is read from ``OEBPS/`` directly.  Each entry
	        is labelled with its 1-based position.
	        """
	        filelist = getattr(self, 'filelist', None)
	        if filelist is None:
	            filelist = self._chapter_files()

	        lines = [
	            u'<?xml version="1.0" encoding="UTF-8" ?>',
	            u'<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">',
	            u'  <head>',
	            u'    <meta name="dtb:uid" content=%s />' % self._xml_attr(self.identi),
	            u'    <meta name="dtb:depth" content="1" />',
	            u'    <meta name="dtb:totalPageCount" content="0" />',
	            u'    <meta name="dtb:maxPageNumber" content="0" />',
	            u'  </head>',
	            u'  <docTitle>',
	            u'    <text>%s</text>' % self._xml_text(self.title),
	            u'  </docTitle>',
	            u'  <docAuthor>',
	            u'    <text>%s</text>' % self._xml_text(self.author),
	            u'  </docAuthor>',
	            u'  <navMap>',
	        ]
	        for number, name in enumerate(filelist):
	            position = number + 1
	            lines.append(u'    <navPoint id="file%d" playOrder="%d">' % (position, position))
	            lines.append(u'      <navLabel>')
	            lines.append(u'        <text>%d</text>' % position)
	            lines.append(u'      </navLabel>')
	            lines.append(u'      <content src=%s />' % self._xml_attr(u"OEBPS/%s" % (name,)))
	            lines.append(u'    </navPoint>')
	        lines.append(u'  </navMap>')
	        lines.append(u'</ncx>')
	        lines.append(u'')
	        document = u"\n".join(lines)

	        print("[DEBUG] ---- Output Ncx File ----")
	        print(document)
	        self._write_text(self.workdir + 'toc.ncx', document)

	    def make_zip(self):
	        """Pack the working tree into ``<filename without spaces>.epub``.

	        The archive is created in the current working directory.  The
	        ``mimetype`` entry is written first and uncompressed; everything
	        else is deflated.  Hidden files and sub-directories are skipped.
	        """
	        archive_name = re.sub(' ', '', self.filename) + '.epub'
	        epub_zip = zipfile.ZipFile(archive_name, 'w', zipfile.ZIP_DEFLATED)
	        try:
	            # Readers identify an EPUB by this leading, stored entry.
	            epub_zip.write(self.workdir + 'mimetype', 'mimetype',
	                           zipfile.ZIP_STORED)
	            for name in self._visible_files(self.workdir):
	                if name != 'mimetype':
	                    epub_zip.write(self.workdir + name, name)
	            for folder in ('META-INF', 'OEBPS'):
	                for name in self._visible_files(self.workdir + folder):
	                    epub_zip.write(self.workdir + folder + '/' + name,
	                                   folder + '/' + name)
	        finally:
	            epub_zip.close()

	class ProjectEular(object):
	    """Download the Japanese Project Euler mirror and package it as an EPUB."""

	    # Index page listing the problems; also used as the book identifier.
	    INDEX_URL = "http://odz.sakura.ne.jp/projecteuler/index.php"

	    @staticmethod
	    def _fetch(url):
	        """Return the raw body of *url*, closing the response afterwards."""
	        response = urllib2.build_opener().open(url)
	        try:
	            return response.read()
	        finally:
	            response.close()

	    def gen(self):
	        """Fetch every linked problem page and build ``peuler.epub``.

	        The links are taken from the third ``<ul class="list1">`` of the
	        index page.  Page number N (0-based) is stored as ``OEBPS/N.xhtml``
	        in the Epub_maker working directory before the OPF, NCX and zip
	        archive are generated.
	        """
	        epub = Epub_maker(filename="peuler",
	                          title="Project Euler",
	                          author="Project Euler",
	                          identi=self.INDEX_URL)
	        mainsoup = BeautifulSoup.BeautifulSoup(self._fetch(self.INDEX_URL))
	        links = mainsoup.findAll("ul", {"class": "list1"})[2].findAll("a")
	        for num, link in enumerate(links):
	            print(link.text)
	            print(link["href"])
	            # Render first, open afterwards: a failed download must not
	            # leave an empty chapter file (or an open handle) behind.
	            page = self.gen_xml(link["href"])
	            with open(epub.workdir + "OEBPS/" + str(num) + ".xhtml", "wb") as gen_file:
	                gen_file.write(page.encode("utf-8"))
	        epub.make_opf()
	        epub.make_ncx()
	        epub.make_zip()

	    def gen_xml(self, page):
	        """Return an XHTML document built from the page at URL *page*.

	        The first ``<h1>`` supplies the title and heading; the text of every
	        ``<p>`` except the last one becomes a paragraph of the body.
	        """
	        soup = BeautifulSoup.BeautifulSoup(self._fetch(page))
	        heading = soup.find("h1")
	        # A page without <h1> gets a placeholder instead of aborting the book.
	        title = heading.text if heading is not None else u"(untitled)"

	        # Collected once instead of re-querying the tree on every iteration.
	        # The last <p> is dropped as before - presumably a page footer;
	        # TODO confirm against the site.
	        paragraphs = soup.findAll("p")
	        # NOTE(review): .text is inserted without XML escaping; presumably
	        # BeautifulSoup 3 leaves the source entities unconverted - confirm.
	        body_str = u"".join(u"<p>%s</p>\n" % paragraph.text
	                            for paragraph in paragraphs[:-1])

	        # The public identifier of the XHTML 1.1 DTD ends in //EN; it names
	        # the language of the DTD, not of the document.
	        return (
	            u'<?xml version="1.0" encoding="UTF-8"?>\n'
	            u'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
	            u'<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="ja" lang="ja">\n'
	            u'<head>\n'
	            u'<title>%s</title>\n'
	            u'</head>\n'
	            u'<body>\n'
	            u'<h1>%s</h1>\n'
	            u'%s'
	            u'</body>\n'
	            u'</html>\n'
	        ) % (title, title, body_str)

	if __name__ == "__main__":
	    # Script entry point: scrape the site and build the EPUB.
	    ProjectEular().gen()