Skip to content

Instantly share code, notes, and snippets.

@avinassh
Forked from anqxyr/archived
Created October 27, 2015 05:38
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save avinassh/75ce4734d91493ff8298 to your computer and use it in GitHub Desktop.
Save avinassh/75ce4734d91493ff8298 to your computer and use it in GitHub Desktop.
Create EPUB files with Python
<?xml version='1.0' encoding='UTF-8'?>
<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
<rootfiles>
<rootfile media-type="application/oebps-package+xml" full-path="content.opf"/>
</rootfiles>
</container>
<?xml version='1.0' encoding='UTF-8'?>
<package xmlns="http://www.idpf.org/2007/opf" prefix="rendition: http://www.idpf.org/vocab/rendition/#" unique-identifier="uuid_id" version="3.0">
<metadata xmlns:opf="http://www.idpf.org/2007/opf" xmlns:dc="http://purl.org/dc/elements/1.1/">
<meta property="dcterms:modified"/>
<meta name="cover" content="cover"/>
<dc:title/>
<dc:creator/>
<dc:date/>
<dc:identifier id="uuid_id" opf:scheme="uuid"/>
<dc:language/>
</metadata>
<manifest>
<item href="stylesheet.css" id="stylesheet" media-type="text/css"/>
<item href="toc.ncx" id="toc" media-type="application/x-dtbncx+xml"/>
<item href="cover.png" id="cover" media-type="image/png" properties="cover-image"/>
</manifest>
<spine toc="toc"/>
</package>
#!/usr/bin/env python3
"""
Create Epub files.
This code was designed to provide a very simple and straight-forward API for
creating epub files, by sacrificing most of the versatility of the format.
Example usage:
>>> book = Book(title='Example Book', author='John Doe')
>>> with open('cover.png', 'br') as file:
...     book.add_cover(file.read())
>>> with open('style.css') as file:
...     book.add_stylesheet(file.read())
>>> book.add_page(title='First Page', content='some text')
>>> chapter = book.add_page(title='First Chapter', content='more text')
>>> book.add_page(
...     title='Sub-Page 1',
...     content='first subpage of the chapter',
...     parent=chapter)
>>> with open('image.jpg', 'br') as file:
...     book.add_image('image.jpg', file.read())
>>> book.save('example.epub')
"""
###############################################################################
# Module Imports
###############################################################################
import arrow
import collections
import itertools
import logging
import lxml.etree
import lxml.html
import pathlib
import pkgutil
import tempfile
import uuid
import zipfile
###############################################################################
log = logging.getLogger(__name__)
###############################################################################
class ETreeWrapper:
    """Convenience wrapper around xml trees.

    Holds an ``lxml.etree.ElementTree`` together with a prefix -> URI
    namespace map. Calling the wrapper looks up the first matching element;
    any attribute not defined here is forwarded to the wrapped tree.
    """

    def __init__(self, *args, namespaces, **kwargs):
        """Build the wrapped tree.

        Positional and extra keyword arguments are passed straight to
        ``lxml.etree.ElementTree``; ``namespaces`` is the keyword-only
        prefix map used for lookups.
        """
        self.tree = lxml.etree.ElementTree(*args, **kwargs)
        self.namespaces = namespaces

    def __call__(self, tag='*', **kwargs):
        """Return the first descendant matching ``tag`` and attributes.

        Each keyword argument becomes an ``[@key="value"]`` predicate on the
        search path. Returns whatever the tree's ``find`` returns (``None``
        when nothing matches).
        """
        path = './/{}'.format(tag)
        for key, value in kwargs.items():
            path += '[@{}="{}"]'.format(key, value)
        return self.tree.find(path, namespaces=self.namespaces)

    def __getattr__(self, name):
        """Forward unknown attribute lookups to the wrapped tree."""
        # __getattr__ only runs when normal lookup fails. If 'tree' itself
        # is missing (instance made via __new__, copy or unpickling), then
        # touching self.tree below would re-enter this method forever.
        if name == 'tree':
            raise AttributeError(name)
        return getattr(self.tree, name)

    def write(self, path):
        """Serialize the tree to ``path`` as pretty-printed UTF-8 XML."""
        self.tree.write(str(path), xml_declaration=True,
                        encoding='UTF-8', pretty_print=True)
def template(name):
    """Get file template.

    Reads the XML file at ``name``, parses it with insignificant whitespace
    removed, and returns it wrapped in an ``ETreeWrapper`` that knows the
    ``opf``, ``dc``, ``xhtml`` and ``ncx`` namespace prefixes.
    """
    # Read bytes, not text: the templates start with an
    # <?xml ... encoding='UTF-8'?> declaration, and lxml raises ValueError
    # when a str carrying an encoding declaration is passed to fromstring().
    with open(name, 'rb') as file:
        raw = file.read()
    parser = lxml.etree.XMLParser(remove_blank_text=True)
    return ETreeWrapper(
        lxml.etree.fromstring(raw, parser),
        namespaces=dict(
            opf='http://www.idpf.org/2007/opf',
            dc='http://purl.org/dc/elements/1.1/',
            xhtml='http://www.w3.org/1999/xhtml',
            ncx='http://www.daisy.org/z3986/2005/ncx/'))
def flatten(tree):
    """Yield every node of ``tree`` depth-first, parents before children.

    ``tree`` is an iterable of nodes; each node exposes its sub-nodes
    through a ``children`` attribute.
    """
    # Stack of partially consumed iterators, one per nesting level.
    pending = [iter(tree)]
    while pending:
        for node in pending[-1]:
            yield node
            # Descend into this node before continuing with its siblings.
            pending.append(iter(node.children))
            break
        else:
            # Current level exhausted: go back up one level.
            pending.pop()
###############################################################################
# Lightweight records: a page in the book's tree, and an embedded image.
Page = collections.namedtuple('Page', ['uid', 'title', 'children'])
Image = collections.namedtuple('Image', ['name', 'type'])
class Book:
    """Wrapper around an epub archive.

    The book is assembled as loose files inside a temporary directory and
    zipped into the final archive by ``save``.

    Constructor keyword arguments (all optional):
        title: book title, defaults to ``'Untitled'``.
        language: language code, defaults to ``'en'``.
        author: author name, defaults to ``'Unknown Author'``.

    The ``content.opf`` template always lists ``cover.png`` and
    ``stylesheet.css`` in its manifest, so call ``add_cover`` and
    ``add_stylesheet`` before saving.
    """

    # File extensions accepted by add_image, mapped to their media types.
    _MEDIA_TYPES = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.svg': 'image/svg+xml',
    }

    def __init__(self, **kwargs):
        self.tempdir = tempfile.TemporaryDirectory()
        self.root = []
        self.images = []
        # Zero-padded sequential ids: '0001', '0002', ...
        self.uid_generator = map('{:04}'.format, itertools.count(1))
        self.path = pathlib.Path(self.tempdir.name).resolve()
        (self.path / 'pages').mkdir()
        (self.path / 'images').mkdir()
        self.title = kwargs.get('title', 'Untitled')
        self.language = kwargs.get('language', 'en')
        self.author = kwargs.get('author', 'Unknown Author')
        # Generated once so content.opf and toc.ncx carry the same
        # identifier, and repeated saves describe the same publication.
        self.identifier = str(uuid.uuid4())

    def add_page(self, title, content, parent=None):
        """Add a new page/chapter to the book and return it.

        The page is appended to the top level, or to the children of
        ``parent`` when a previously returned page is given. ``content`` is
        parsed as HTML and placed in the page body.
        """
        log.info('New page: %s', title)
        page = Page(next(self.uid_generator), title, [])
        siblings = self.root if parent is None else parent.children
        siblings.append(page)
        file = template('page.xhtml')
        file('xhtml:title').text = title
        file('xhtml:body').append(lxml.html.fromstring(content))
        file.write(self.path / 'pages' / (page.uid + '.xhtml'))
        return page

    def add_image(self, name, data):
        """Store image bytes ``data`` under ``name`` and register it.

        Raises:
            ValueError: if the extension of ``name`` is not one of the
                supported image types.
        """
        # Resolve the media type first so that an unsupported file is
        # rejected before anything is logged, registered or written.
        lowered = name.lower()
        media_type = next(
            (mime for ext, mime in self._MEDIA_TYPES.items()
             if lowered.endswith(ext)),
            None)
        if media_type is None:
            raise ValueError(
                'Unsupported image type: {!r} (expected one of {})'.format(
                    name, ', '.join(sorted(self._MEDIA_TYPES))))
        log.info('New image: %s', name)
        self.images.append(Image(name, media_type))
        with open(str(self.path / 'images' / name), 'wb') as file:
            file.write(data)

    def add_cover(self, data):
        """Write the cover image bytes to ``cover.png``."""
        with open(str(self.path / 'cover.png'), 'wb') as file:
            file.write(data)

    def add_stylesheet(self, data):
        """Write the stylesheet text to ``stylesheet.css``."""
        # Explicit encoding: the platform default is not always UTF-8.
        with open(str(self.path / 'stylesheet.css'), 'w',
                  encoding='utf-8') as file:
            file.write(data)

    def save(self, filename):
        """Write the metadata files and zip everything into ``filename``."""
        self._write_spine()
        self._write_container()
        self._write_toc()
        with open(str(self.path / 'mimetype'), 'w') as file:
            file.write('application/epub+zip')
        with zipfile.ZipFile(filename, 'w') as archive:
            # 'mimetype' is added first and stored uncompressed.
            archive.write(
                str(self.path / 'mimetype'), 'mimetype',
                compress_type=zipfile.ZIP_STORED)
            # '*.*' does not match 'mimetype' (no dot), so it is not
            # added a second time here.
            for file in self.path.rglob('*.*'):
                archive.write(
                    str(file), str(file.relative_to(self.path)),
                    compress_type=zipfile.ZIP_DEFLATED)
        log.info('Book saved: %s', self.title)

    def _write_spine(self):
        """Fill in the ``content.opf`` template and write it out."""
        spine = template('content.opf')
        # Times are UTC; EPUB 3 requires dcterms:modified to end in 'Z'.
        now = arrow.utcnow().format('YYYY-MM-DDTHH:mm:ss') + 'Z'
        spine(property='dcterms:modified').text = now
        spine('dc:date').text = now
        spine('dc:title').text = self.title
        spine('dc:creator').text = self.author
        spine('dc:language').text = self.language
        spine(id='uuid_id').text = self.identifier
        manifest = spine('opf:manifest')
        itemrefs = spine('opf:spine')
        for page in flatten(self.root):
            lxml.etree.SubElement(
                manifest, 'item',
                href='pages/{}.xhtml'.format(page.uid), id=page.uid,
                **{'media-type': 'application/xhtml+xml'})
            lxml.etree.SubElement(itemrefs, 'itemref', idref=page.uid)
        for number, image in enumerate(self.images, start=1):
            lxml.etree.SubElement(
                manifest,
                'item',
                href='images/' + image.name,
                id='img{:03}'.format(number),
                **{'media-type': image.type})
        spine.write(self.path / 'content.opf')

    def _write_container(self):
        """Write ``META-INF/container.xml`` from its template."""
        container = template('container.xml')
        meta_inf = self.path / 'META-INF'
        # exist_ok lets save() be called more than once on the same book.
        meta_inf.mkdir(exist_ok=True)
        container.write(meta_inf / 'container.xml')

    def _write_toc(self):
        """Fill in the ``toc.ncx`` template and write it out."""
        toc = template('toc.ncx')
        # The NCX uid must match the package's unique identifier.
        toc(name='dtb:uid').set('content', self.identifier)
        toc('ncx:text').text = self.title
        nav_map = toc('ncx:navMap')
        for page in self.root:
            self._page_to_toc(page, nav_map)
        toc.write(self.path / 'toc.ncx')

    def _page_to_toc(self, page, node):
        """Append a navPoint for ``page`` and its children under ``node``."""
        navpoint = lxml.etree.SubElement(
            node, 'navPoint', id=page.uid, playOrder=page.uid.lstrip('0'))
        navlabel = lxml.etree.SubElement(navpoint, 'navLabel')
        lxml.etree.SubElement(navlabel, 'text').text = page.title
        lxml.etree.SubElement(
            navpoint, 'content', src='pages/{}.xhtml'.format(page.uid))
        for child in page.children:
            self._page_to_toc(child, navpoint)
<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/#" lang="en" xml:lang="en">
<head>
<title/>
<link href="../stylesheet.css" rel="stylesheet" type="text/css"/>
</head>
<body/>
</html>
<?xml version='1.0' encoding='UTF-8'?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta content="" name="dtb:uid"/>
<meta content="0" name="dtb:depth"/>
<meta content="0" name="dtb:totalPageCount"/>
<meta content="0" name="dtb:maxPageNumber"/>
</head>
<docTitle>
<text/>
</docTitle>
<navMap/>
</ncx>
@CreativeMK
Copy link

lxml.etree.XMLParser(remove_blank_text=True)),

File "src/lxml/etree.pyx", line 3213, in lxml.etree.fromstring
File "src/lxml/parser.pxi", line 1872, in lxml.etree._parseMemoryDocument
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment