# jamesoutterside/epub_inspect.py

## epub_inspect.py
import posixpath
import xml.etree.ElementTree as ET
import zipfile

def get_epub_info(fname):
    """Read the metadata and the chapter list of an EPUB file.

    The OPF package document is located through ``META-INF/container.xml``.
    Chapter titles come from the NCX table of contents when one can be read;
    otherwise every XHTML manifest item is listed with its href as title.

    Args:
        fname: Path to the EPUB, or a binary file-like object, as accepted
            by ``zipfile.ZipFile``.

    Returns:
        A ``(metadata, chapters)`` tuple. ``metadata`` maps each Dublin Core
        field found among title, language, creator, date, identifier, rights
        and publisher to its first text value. ``chapters`` is a list of
        ``[src, title]`` pairs in table-of-contents order, with nested
        entries (at any depth) listed right after their parent.

    Raises:
        zipfile.BadZipFile: If ``fname`` is not a zip archive.
        KeyError: If ``META-INF/container.xml`` or the OPF file it names is
            missing from the archive.
        IndexError: If the container names no rootfile, or the OPF has no
            metadata or manifest element.
    """
    ns = {
        'n': 'urn:oasis:names:tc:opendocument:xmlns:container',
        'pkg': 'http://www.idpf.org/2007/opf',
        'dc': 'http://purl.org/dc/elements/1.1/',
        'ncx': 'http://www.daisy.org/z3986/2005/ncx/',
    }
    xhtml_type = 'application/xhtml+xml'
    ncx_type = 'application/x-dtbncx+xml'

    # The context manager closes the archive even when parsing fails.
    with zipfile.ZipFile(fname) as archive:
        # find the contents metafile
        container = ET.fromstring(archive.read('META-INF/container.xml'))
        cfname = [
            rootfile.attrib['full-path']
            for rootfile in container.findall('n:rootfiles/n:rootfile', ns)
            if 'full-path' in rootfile.attrib
        ][0]

        # grab the metadata and manifest blocks from the contents metafile
        package = ET.fromstring(archive.read(cfname))
        metadata_p = package.findall('pkg:metadata', ns)[0]
        manifest_p = package.findall('pkg:manifest', ns)[0]
        manifest_items = manifest_p.findall('pkg:item', ns)

        # try for the ncx file, used to build a chapter list with real names
        try:
            # Prefer the conventional id="ncx"; otherwise accept any item
            # declared with the NCX media type.
            ncx_candidates = (
                [i for i in manifest_items if i.get('id') == 'ncx']
                or [i for i in manifest_items
                    if i.get('media-type') == ncx_type]
            )
            ncx_href = ncx_candidates[0].attrib['href']

            # Manifest hrefs are relative to the OPF file, not the zip root.
            ncx_path = posixpath.normpath(
                posixpath.join(posixpath.dirname(cfname), ncx_href))
            try:
                ncx_bytes = archive.read(ncx_path)
            except KeyError:
                # Tolerate archives whose href is relative to the zip root.
                ncx_bytes = archive.read(ncx_href)
            ncx_root = ET.fromstring(ncx_bytes)

            chapters = []
            # '//' walks navPoints of every depth in document order, so each
            # nested entry follows its parent.
            for nav_point in ncx_root.findall('ncx:navMap//ncx:navPoint', ns):
                title = nav_point.findall('ncx:navLabel/ncx:text', ns)[0].text
                src = nav_point.findall('ncx:content', ns)[0].attrib['src']
                chapters.append([src, title])
        except Exception:
            # Deliberate best-effort: a missing or malformed NCX must not
            # prevent listing the book, so fall back to the manifest.
            chapters = [
                [item.attrib['href'], item.attrib['href']]
                for item in manifest_items
                if item.get('media-type') == xhtml_type
            ]

    # repackage the metadata; fields absent from the OPF are simply omitted
    res = {}
    for field in ('title', 'language', 'creator', 'date', 'identifier',
                  'rights', 'publisher'):
        values = [
            element.text
            for element in metadata_p.findall('dc:%s' % field, ns)
            if element.text
        ]
        if values:
            res[field] = values[0]
    return res, chapters


def get_epub_images(file_name_and_path, extract_to='extract from path'):
    """Extract every JPEG, PNG and GIF image stored in an EPUB archive.

    Args:
        file_name_and_path: Path to the EPUB, or a binary file-like object,
            as accepted by ``zipfile.ZipFile``.
        extract_to: Directory that receives the images; the folder layout
            inside the archive is recreated beneath it. The default keeps
            the destination this function has always written to.

    Returns:
        The list of file paths created, in archive order.

    Raises:
        zipfile.BadZipFile: If the input is not a zip archive.
    """
    image_suffixes = {'.jpg', '.jpeg', '.png', '.gif'}
    extracted = []
    # The context manager closes the archive even if an extraction fails.
    with zipfile.ZipFile(file_name_and_path) as archive:
        for member in archive.namelist():
            # Zip member names always use '/', hence posixpath; lower() so
            # that names such as 'COVER.JPG' are not skipped.
            suffix = posixpath.splitext(member)[1].lower()
            if suffix in image_suffixes:
                extracted.append(archive.extract(member, extract_to))
    return extracted
	import zipfile

	def get_epub_info(fname):
		"""Read the metadata and the chapter list of an EPUB file.

		The OPF package document is located through ``META-INF/container.xml``.
		Chapter titles come from the NCX table of contents when one can be read;
		otherwise every XHTML manifest item is listed with its href as title.

		Args:
			fname: Path to the EPUB, or a binary file-like object, as accepted
				by ``zipfile.ZipFile``.

		Returns:
			A ``(metadata, chapters)`` tuple. ``metadata`` maps each Dublin Core
			field found among title, language, creator, date, identifier, rights
			and publisher to its first text value. ``chapters`` is a list of
			``[src, title]`` pairs in table-of-contents order, with nested
			entries (at any depth) listed right after their parent.

		Raises:
			zipfile.BadZipFile: If ``fname`` is not a zip archive.
			KeyError: If ``META-INF/container.xml`` or the OPF file it names is
				missing from the archive.
			IndexError: If the container names no rootfile, or the OPF has no
				metadata or manifest element.
		"""
		ns = {
			'n': 'urn:oasis:names:tc:opendocument:xmlns:container',
			'pkg': 'http://www.idpf.org/2007/opf',
			'dc': 'http://purl.org/dc/elements/1.1/',
			'ncx': 'http://www.daisy.org/z3986/2005/ncx/',
		}
		xhtml_type = 'application/xhtml+xml'
		ncx_type = 'application/x-dtbncx+xml'

		# The context manager closes the archive even when parsing fails.
		with zipfile.ZipFile(fname) as archive:
			# find the contents metafile
			container = ET.fromstring(archive.read('META-INF/container.xml'))
			cfname = [
				rootfile.attrib['full-path']
				for rootfile in container.findall('n:rootfiles/n:rootfile', ns)
				if 'full-path' in rootfile.attrib
			][0]

			# grab the metadata and manifest blocks from the contents metafile
			package = ET.fromstring(archive.read(cfname))
			metadata_p = package.findall('pkg:metadata', ns)[0]
			manifest_p = package.findall('pkg:manifest', ns)[0]
			manifest_items = manifest_p.findall('pkg:item', ns)

			# try for the ncx file, used to build a chapter list with real names
			try:
				# Prefer the conventional id="ncx"; otherwise accept any item
				# declared with the NCX media type.
				ncx_candidates = (
					[i for i in manifest_items if i.get('id') == 'ncx']
					or [i for i in manifest_items
						if i.get('media-type') == ncx_type]
				)
				ncx_href = ncx_candidates[0].attrib['href']

				# Manifest hrefs are relative to the OPF file, not the zip root.
				ncx_path = posixpath.normpath(
					posixpath.join(posixpath.dirname(cfname), ncx_href))
				try:
					ncx_bytes = archive.read(ncx_path)
				except KeyError:
					# Tolerate archives whose href is relative to the zip root.
					ncx_bytes = archive.read(ncx_href)
				ncx_root = ET.fromstring(ncx_bytes)

				chapters = []
				# '//' walks navPoints of every depth in document order, so each
				# nested entry follows its parent.
				for nav_point in ncx_root.findall('ncx:navMap//ncx:navPoint', ns):
					title = nav_point.findall('ncx:navLabel/ncx:text', ns)[0].text
					src = nav_point.findall('ncx:content', ns)[0].attrib['src']
					chapters.append([src, title])
			except Exception:
				# Deliberate best-effort: a missing or malformed NCX must not
				# prevent listing the book, so fall back to the manifest.
				chapters = [
					[item.attrib['href'], item.attrib['href']]
					for item in manifest_items
					if item.get('media-type') == xhtml_type
				]

		# repackage the metadata; fields absent from the OPF are simply omitted
		res = {}
		for field in ('title', 'language', 'creator', 'date', 'identifier',
				'rights', 'publisher'):
			values = [
				element.text
				for element in metadata_p.findall('dc:%s' % field, ns)
				if element.text
			]
			if values:
				res[field] = values[0]
		return res, chapters


	def get_epub_images(file_name_and_path, extract_to='extract from path'):
		"""Extract every JPEG, PNG and GIF image stored in an EPUB archive.

		Args:
			file_name_and_path: Path to the EPUB, or a binary file-like object,
				as accepted by ``zipfile.ZipFile``.
			extract_to: Directory that receives the images; the folder layout
				inside the archive is recreated beneath it. The default keeps
				the destination this function has always written to.

		Returns:
			The list of file paths created, in archive order.

		Raises:
			zipfile.BadZipFile: If the input is not a zip archive.
		"""
		image_suffixes = {'.jpg', '.jpeg', '.png', '.gif'}
		extracted = []
		# The context manager closes the archive even if an extraction fails.
		with zipfile.ZipFile(file_name_and_path) as archive:
			for member in archive.namelist():
				# Zip member names always use '/', hence posixpath; lower() so
				# that names such as 'COVER.JPG' are not skipped.
				suffix = posixpath.splitext(member)[1].lower()
				if suffix in image_suffixes:
					extracted.append(archive.extract(member, extract_to))
		return extracted