Skip to content

Instantly share code, notes, and snippets.

@jamesoutterside
Created January 21, 2013 13:43
Show Gist options
  • Save jamesoutterside/4586153 to your computer and use it in GitHub Desktop.
Save jamesoutterside/4586153 to your computer and use it in GitHub Desktop.
Experimental code to look at an epub file and return metadata and a list of chapters
import os
import posixpath
import zipfile
def get_epub_info(fname):
    """Read an EPUB archive and return its metadata and chapter list.

    This function relies on two names it does not define itself: ``etree``
    (an lxml-style API offering ``fromstring`` and
    ``xpath(..., namespaces=...)``) and ``NS``, a prefix-to-URI mapping that
    must provide the ``n``, ``pkg``, ``ncx`` and ``dc`` prefixes used in the
    XPath expressions below.

    Args:
        fname: Anything ``zipfile.ZipFile`` accepts as its first argument.

    Returns:
        A ``(res, chapters)`` tuple.

        ``res`` is a dict containing whichever of ``title``, ``language``,
        ``creator``, ``date``, ``identifier``, ``rights`` and ``publisher``
        are present in the package metadata; absent fields are omitted.

        ``chapters`` is a list of ``[src, title]`` pairs taken from the NCX
        table of contents (each top-level nav point followed by its direct
        children).  When the NCX cannot be located or parsed, it is instead
        ``[href, href]`` for every XHTML item in the manifest.

    Raises:
        KeyError: If ``META-INF/container.xml`` or the package file it
            points to is missing from the archive.
        IndexError: If the container names no rootfile, or the package
            document has no metadata or manifest element.
    """
    with zipfile.ZipFile(fname) as epub:
        # find the contents metafile
        container = etree.fromstring(epub.read('META-INF/container.xml'))
        cfname = container.xpath('n:rootfiles/n:rootfile/@full-path', namespaces=NS)[0]

        # grab the metadata block and the manifest from the contents metafile
        tree = etree.fromstring(epub.read(cfname))
        metadata_p = tree.xpath('/pkg:package/pkg:metadata', namespaces=NS)[0]
        manifest_p = tree.xpath('/pkg:package/pkg:manifest', namespaces=NS)[0]

        # try for the ncx file, used to build a chapter list with actual names
        try:
            ncx_href = tree.xpath('/pkg:package/pkg:manifest/pkg:item[@id ="ncx"]/@href', namespaces=NS)[0]
            # Manifest hrefs are relative to the package file, while zip
            # member names are relative to the archive root.
            ncx_member = posixpath.normpath(
                posixpath.join(posixpath.dirname(cfname), ncx_href)
            )
            if ncx_member not in epub.namelist():
                # Tolerate archives that wrote the href relative to the root.
                ncx_member = ncx_href
            ncx_tree = etree.fromstring(epub.read(ncx_member))

            chapters = []
            for nav_point in ncx_tree.xpath('ncx:navMap/ncx:navPoint', namespaces=NS):
                chapters.append(_ncx_chapter_entry(nav_point))
                # Only one level of nesting is flattened into the list.
                for child in nav_point.xpath('ncx:navPoint', namespaces=NS):
                    chapters.append(_ncx_chapter_entry(child))
        except Exception:
            # Deliberate best-effort: any problem with the NCX (missing
            # item, missing member, malformed XML, incomplete nav point)
            # falls back to listing the XHTML manifest items by href.
            items = manifest_p.xpath('/pkg:package/pkg:manifest/pkg:item[@media-type="application/xhtml+xml"]', namespaces=NS)
            chapters = [[i.attrib['href'], i.attrib['href']] for i in items]

    # repackage the metadata
    res = {}
    for s in ['title', 'language', 'creator', 'date', 'identifier', 'rights', 'publisher']:
        try:
            res[s] = metadata_p.xpath('dc:%s/text()' % (s), namespaces=NS)[0]
        except IndexError:
            # The field is absent (or has no text); leave it out of the result.
            pass
    return res, chapters


def _ncx_chapter_entry(nav_point):
    """Return ``[src, title]`` for a single NCX ``navPoint`` element."""
    title = nav_point.xpath('ncx:navLabel/ncx:text', namespaces=NS)[0].text
    content = nav_point.xpath('ncx:content/@src', namespaces=NS)[0]
    return [content, title]
def get_epub_images(file_name_and_path, extract_to='extract from path'):
    """Extract the image files contained in an EPUB archive.

    A member counts as an image when its extension, compared
    case-insensitively, is ``.jpg``, ``.jpeg``, ``.png`` or ``.gif``.

    Args:
        file_name_and_path: Anything ``zipfile.ZipFile`` accepts as its
            first argument.
        extract_to: Directory the images are extracted into, keeping their
            paths inside the archive.  The default is the literal directory
            name the function has always used, kept for backward
            compatibility.

    Returns:
        A list with the path of each extracted file, in archive order.
    """
    extensions = {".jpg", ".jpeg", ".png", ".gif"}
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(file_name_and_path) as epub:
        images = [
            name for name in epub.namelist()
            if os.path.splitext(name)[1].lower() in extensions
        ]
        return [epub.extract(name, extract_to) for name in images]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment