Skip to content

Instantly share code, notes, and snippets.

@esehara
Created October 31, 2011 14:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save esehara/1327644 to your computer and use it in GitHub Desktop.
Save esehara/1327644 to your computer and use it in GitHub Desktop.
Project Euler (Japanese Translation) -> Epub
# -*- coding: utf-8 -*-
import os
import urllib2
import BeautifulSoup
import re
import zipfile
class Epub_maker(object):
    """Assemble a minimal EPUB 2 package from XHTML files placed on disk.

    The constructor creates this working tree::

        <workspace>/<filename>/mimetype
        <workspace>/<filename>/META-INF/container.xml
        <workspace>/<filename>/OEBPS/

    Callers drop ``*.xhtml`` content files into ``OEBPS/`` and then call
    ``make_opf()``, ``make_ncx()`` and ``make_zip()`` in that order.
    ``content.opf`` and ``toc.ncx`` are written at the root of the working
    tree (where container.xml points), so content hrefs carry an ``OEBPS/``
    prefix.
    """

    def __init__(self, filename, title, author, identi, workspace='./temp/',):
        """Create the working directory tree plus the fixed EPUB files.

        filename  -- book name; used for the working sub-directory and, with
                     spaces removed, for the output ``.epub`` file name.
        title     -- book title written to the OPF and NCX.
        author    -- author written to the OPF and NCX.
        identi    -- unique book identifier written to the OPF and NCX.
        workspace -- parent directory of the working tree; created if missing.

        An already existing working tree is reused rather than treated as an
        error, so files left in it by an earlier run are kept.
        """
        self.filename = filename
        self.title = title
        self.author = author
        self.identi = identi
        # The trailing "" keeps a trailing separator on workdir: both this
        # class and its callers build paths with `workdir + "OEBPS/..."`.
        workdir = os.path.join(workspace, filename, "")
        # makedirs also creates missing parents (e.g. ./temp/).
        for sub in ("OEBPS", "META-INF"):
            path = workdir + sub
            if not os.path.isdir(path):
                os.makedirs(path)
        # The OCF mimetype file must hold exactly this ASCII string,
        # without a trailing newline.
        with open(workdir + "mimetype", "wb") as mimetype:
            mimetype.write(b"application/epub+zip")
        make_xml = u"""<?xml version='1.0' encoding="UTF-8"?>
<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
<rootfiles>
<rootfile full-path="content.opf" media-type="application/oebps-package+xml" />
</rootfiles>
</container>
"""
        with open(workdir + 'META-INF/container.xml', 'wb') as container_xml:
            container_xml.write(make_xml.encode('utf-8'))
        self.workdir = workdir

    @staticmethod
    def _escape(text):
        """Return *text* escaped for XML element content or a quoted attribute."""
        from xml.sax.saxutils import escape
        # u"%s" stringifies non-string values the same way the templates did.
        return escape(u"%s" % (text,), {'"': "&quot;", "'": "&apos;"})

    def _content_files(self):
        """Return the ``*.xhtml`` names in OEBPS/ in numeric-aware order."""
        def natural_key(name):
            # re.split with a capture group alternates text/digits, so the
            # odd positions are always the digit runs: "10.xhtml" sorts
            # after "2.xhtml".
            parts = re.split(r"(\d+)", name)
            parts[1::2] = [int(digits) for digits in parts[1::2]]
            return parts

        names = [name for name in os.listdir(self.workdir + "OEBPS")
                 if name.endswith(".xhtml")]
        names.sort(key=natural_key)
        return names

    def make_opf(self):
        """Write ``content.opf`` listing every XHTML file found in OEBPS/.

        Manifest and spine entries use the real file names, in numeric-aware
        order, with ids ``file1``, ``file2``, ...  The ordered name list is
        stored in ``self.filelist`` for ``make_ncx``.  The generated XML is
        also echoed to stdout as debug output.
        """
        esc = self._escape
        filelist = self._content_files()
        xml_header = u"""<?xml version="1.0" encoding="UTF-8"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
<dc:title>%s</dc:title>
<dc:creator opf:role="aut">%s</dc:creator>
<dc:language>ja</dc:language>
<dc:publisher>Epub Maker (Python) </dc:publisher>
<dc:identifier id="BookId">%s</dc:identifier>
</metadata>
""" % (esc(self.title), esc(self.author), esc(self.identi))
        xml_manifest_header = u"""
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
"""
        # hrefs are relative to content.opf, which sits one level above OEBPS/.
        xml_manifest_body = u"".join(
            u"<item id='file%d' href='%s' media-type='application/xhtml+xml' /> "
            % (number, esc("OEBPS/" + name))
            for number, name in enumerate(filelist, 1)
        ) + u'</manifest>'
        xml_fooder = u"""
<spine toc="ncx">
"""
        xml_fooder += u"".join(
            u"""<itemref idref="file%d" />""" % number
            for number in range(1, len(filelist) + 1)
        )
        xml_fooder += u"""
</spine>
</package>
"""
        print("[DEBUG] ---- Output OPF File ----")
        with open(self.workdir + 'content.opf', 'wb') as opffile:
            for chunk in (xml_header, xml_manifest_header,
                          xml_manifest_body, xml_fooder):
                print(chunk)
                opffile.write(chunk.encode('utf-8'))
        self.filelist = filelist

    def make_ncx(self):
        """Write ``toc.ncx`` with one navPoint per content file.

        Uses ``self.filelist`` from ``make_opf`` when available, otherwise
        scans OEBPS/ itself.  Entries are labelled 1, 2, 3, ... in spine
        order.  The generated XML is also echoed to stdout as debug output.
        """
        esc = self._escape
        filelist = getattr(self, 'filelist', None)
        if filelist is None:
            filelist = self._content_files()
        xml_header = u"""<?xml version="1.0" encoding="UTF-8" ?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name='dtb:uid' content="%s" />
<meta name='dtb:depth' content="1" />
<meta name='dtb:totalPageCount' content="0" />
<meta name='dtb:maxPageNumber' content="0" />
</head>
<docTitle>
<text>%s</text>
</docTitle>
<docAuthor>
<text>%s</text>
</docAuthor>
""" % (esc(self.identi), esc(self.title), esc(self.author))
        nav_points = [
            u"""<navPoint id='file%d' playOrder='%d'>
<navLabel>
<text>%s</text>
</navLabel>
<content src="%s" />
</navPoint>
""" % (number, number, number, esc("OEBPS/" + name))
            for number, name in enumerate(filelist, 1)
        ]
        xml_navMap = u"<navMap>" + u"".join(nav_points) + u"</navMap>" + u"</ncx>"
        print("[DEBUG] ---- Output Ncx File ----")
        with open(self.workdir + 'toc.ncx', 'wb') as ncxfile:
            for chunk in (xml_header, xml_navMap):
                print(chunk)
                ncxfile.write(chunk.encode('utf-8'))

    def make_zip(self):
        """Pack the working tree into ``<filename without spaces>.epub``.

        The archive is created in the current working directory.  The
        ``mimetype`` entry is written first and stored uncompressed, as the
        OCF container format requires; every other file in the working tree
        follows, deflated, under its path relative to the tree root.
        """
        epub_zip = zipfile.ZipFile(re.sub(' ', '', self.filename) + '.epub',
                                   'w', zipfile.ZIP_DEFLATED)
        try:
            epub_zip.write(self.workdir + 'mimetype', 'mimetype',
                           compress_type=zipfile.ZIP_STORED)
            for root, dirs, files in os.walk(self.workdir):
                dirs.sort()  # deterministic traversal order
                rel_root = os.path.relpath(root, self.workdir)
                for name in sorted(files):
                    if rel_root == os.curdir:
                        arcname = name
                    else:
                        # Zip member names always use forward slashes.
                        arcname = '/'.join(rel_root.split(os.sep) + [name])
                    if arcname == 'mimetype':
                        continue  # already written as the first entry
                    epub_zip.write(os.path.join(root, name), arcname)
        finally:
            epub_zip.close()
class ProjectEular(object):
    """Scrape the Japanese Project Euler translation and build an EPUB."""

    # Index page listing the problem pages; also used as the book identifier.
    INDEX_URL = "http://odz.sakura.ne.jp/projecteuler/index.php"

    def gen(self):
        """Download every linked page and package them as ``peuler.epub``.

        Each link found in the index becomes ``OEBPS/<n>.xhtml`` (n counted
        from 0 in link order) inside the Epub_maker working tree; the OPF,
        NCX and zip archive are generated afterwards.
        """
        opener = urllib2.build_opener()
        epub = Epub_maker(filename="peuler",
                          title="Project Euler",
                          author="Project Euler",
                          identi=self.INDEX_URL)
        response = opener.open(self.INDEX_URL)
        try:
            html = response.read()
        finally:
            response.close()
        mainsoup = BeautifulSoup.BeautifulSoup(html)
        # The links are taken from the third <ul class="list1"> on the page
        # (presumably the problem list -- confirm against the site markup).
        subsoup = mainsoup.findAll("ul", {"class": "list1"})[2].findAll("a")
        for num, soup in enumerate(subsoup):
            print(soup.text)
            print(soup["href"])
            # Fetch and render before opening the output file, so a failed
            # download does not leave an empty .xhtml behind.
            page_xml = self.gen_xml(soup["href"]).encode("utf-8")
            target = epub.workdir + "OEBPS/" + str(num) + ".xhtml"
            with open(target, "wb") as gen_file:
                gen_file.write(page_xml)
        epub.make_opf()
        epub.make_ncx()
        epub.make_zip()

    def gen_xml(self, page):
        """Fetch *page* and return it re-rendered as a small XHTML document.

        The first ``<h1>`` supplies the title and heading (empty when the
        page has none).  Every ``<p>`` except the last one is copied, text
        only, into the body.

        page -- URL of the page to fetch.
        Returns the XHTML document as a string.
        """
        opener = urllib2.build_opener()
        # XHTML 1.1 has a single public identifier, ending in //EN; the
        # document language is declared through xml:lang instead.
        header = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="ja" lang="ja">
<head>
'''
        response = opener.open(page)
        try:
            target_page = response.read()
        finally:
            response.close()
        soup = BeautifulSoup.BeautifulSoup(target_page)
        heading = soup.find("h1")
        title = heading.text if heading is not None else u""
        # Look the paragraphs up once; the final <p> is deliberately skipped.
        paragraphs = soup.findAll("p")
        body_str = "".join("<p>%s</p>\n" % body_soup.text
                           for body_soup in paragraphs[:-1])
        header += '''
<title>%s</title>
</head>
<body>
''' % title
        body = '''
<h1>%s</h1>
%s
''' % (title, body_str)
        fooder = '''
</body>
</html>
'''
        return (header + body + fooder)
if __name__ == "__main__":
    # Run the scraper only when executed as a script, not when imported.
    scraper = ProjectEular()
    scraper.gen()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment