Skip to content

Instantly share code, notes, and snippets.

Last active December 21, 2015 04:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gabalese/6249780 to your computer and use it in GitHub Desktop.
Save gabalese/6249780 to your computer and use it in GitHub Desktop.
List the alt attribute of every img tag in an EPUB's (X)HTML files and print the list to stdout
#! /usr/bin/env python
# file:
# List the alt attribute of every img tag in an EPUB's (X)HTML files and print the list to stdout
# Usage: from the command line, python <this_script.py> <book.epub>
from __future__ import print_function
import os
import sys
import zipfile as ZIP
# Prefer lxml when it is installed; otherwise fall back to the standard
# library's ElementTree, which exposes the same fromstring()/find()/iter() API.
try:
    from lxml import etree as ET
except ImportError:
    from xml.etree import ElementTree as ET

# Module-level state shared between the functions below and the script driver.
lista = []        # message buffer iterated by the driver at the end of the run
new_list = []     # second message buffer iterated by the driver
filelist = []     # buffer of content documents the driver walks
root_folder = ""  # directory of the OPF inside the archive; set by parseInfo()

# Namespace prefixes in ElementTree's "{uri}" form, ready to be glued in front
# of a tag name, e.g. "{0}spine".format(namespaces["opf"]).
namespaces = {
    "opf": "{http://www.idpf.org/2007/opf}",
    "dc": "{http://purl.org/dc/elements/1.1/}",
}
def altlist(infile):
    """Record the alt text of every ``img`` element in one content document.

    The document is read from the EPUB given as ``sys.argv[1]``; ``infile`` is
    its path relative to the module-level ``root_folder``. For each ``img``
    that has an ``alt`` attribute a descriptive line is appended to the
    module-level ``lista``; for each ``img`` without one a warning is printed.

    NOTE(review): the extracted source built the descriptive string but showed
    no statement storing it; appending to ``lista`` is assumed because the
    script driver iterates ``lista``.
    """
    # Zip member names always use "/" and must not start with one, so only
    # prepend the folder when there is one (OPF at the archive root -> "").
    member = root_folder + "/" + infile if root_folder else infile
    with ZIP.ZipFile(sys.argv[1]) as archive:
        markup = archive.read(member)

    # init a tolerant parser; only lxml provides HTMLParser, so fall back to
    # the default XML parser when running on the stdlib ElementTree.
    parser_factory = getattr(ET, "HTMLParser", None)
    if parser_factory is not None:
        html = ET.fromstring(markup, parser_factory())
    else:
        html = ET.fromstring(markup)

    for element in html.iter():
        tag = element.tag
        # lxml comments / processing instructions carry a non-string tag, and
        # the XML-parser fallback yields "{namespace}img"; compare local names.
        if not isinstance(tag, str) or tag.rsplit("}", 1)[-1] != "img":
            continue
        alt = element.get("alt")
        # get() returns None for a missing attribute; it never raises.
        if alt is None:
            print("img tag in '{}' does not have a alt attribute!".format(infile))
        else:
            lista.append(
                "{} tag in {} contains alt text '{}'".format(element, infile, alt)
            )
def parseInfo(filename):
    """Locate the OPF package document and the NCX inside an EPUB archive.

    Reads ``META-INF/container.xml`` from the zip at ``filename``, follows its
    first rootfile's ``full-path`` to the OPF, and looks up the manifest item
    referenced by the spine's ``toc`` attribute.

    Side effect: sets the module-level ``root_folder`` to the directory of the
    OPF inside the archive ("" when the OPF sits at the archive root).

    Returns a dict with the keys ``path_to_opf``, ``ncx_name`` and
    ``path_to_ncx``. The two NCX entries are ``None`` when the spine has no
    ``toc`` attribute or the referenced item cannot be found.

    If the archive has no ``META-INF/container.xml`` a message is printed and
    the process exits with status 1.
    NOTE(review): the extracted source shows only the message in that branch;
    exiting is assumed, since carrying on would hit an unbound variable.
    """
    global root_folder
    info = {}

    # Open the archive once and make sure the handle is released.
    with ZIP.ZipFile(filename) as archive:
        try:
            container_xml = archive.read("META-INF/container.xml")
        except KeyError:
            print("The %s file is not a valid OCF." % str(filename))
            sys.exit(1)
        # container -> rootfiles -> rootfile: the first rootfile names the OPF.
        container = ET.fromstring(container_xml)
        info["path_to_opf"] = container[0][0].get("full-path")
        opf = ET.fromstring(archive.read(info["path_to_opf"]))

    root_folder = os.path.dirname(info["path_to_opf"])

    # The spine's "toc" attribute holds the manifest id of the NCX; it is
    # optional, so every step of the lookup may legitimately come up empty.
    spine = opf.find("{0}spine".format(namespaces["opf"]))
    toc_id = spine.get("toc") if spine is not None else None
    toc_item = None
    if toc_id is not None:
        toc_item = opf.find(".//*[@id='%s']" % toc_id)
    ncx_name = toc_item.get("href") if toc_item is not None else None

    info["ncx_name"] = ncx_name
    if ncx_name is None:
        info["path_to_ncx"] = None
    elif root_folder:
        info["path_to_ncx"] = root_folder + "/" + ncx_name
    else:
        # No folder component: a leading "/" would not match any zip member.
        info["path_to_ncx"] = ncx_name
    return info
def parseOPF(filename):
    """Return the parsed root element of the EPUB's OPF package document.

    The OPF location inside the zip at ``filename`` is taken from
    ``parseInfo(filename)["path_to_opf"]``.
    """
    opf_path = parseInfo(filename)["path_to_opf"]
    # Context manager so the archive handle is closed once the OPF is read.
    with ZIP.ZipFile(filename) as archive:
        return ET.fromstring(archive.read(opf_path))
# Script driver: list the alt text of every image in the EPUB given on the
# command line.
# NOTE(review): the extracted source kept only the loop headers; the loop
# bodies and the exit after the usage message are reconstructed from the
# names each loop iterates and should be confirmed against the original.
if __name__ == "__main__":
    if len(sys.argv) != 2:
        # Name the script in the usage line and stop; without an argument
        # sys.argv[1] below would raise IndexError.
        print("USAGE: ./{} <epubfile>".format(os.path.basename(sys.argv[0])))
        sys.exit(1)

    epub_path = sys.argv[1]
    opf = parseOPF(epub_path)

    # Collect the href of every XHTML content document declared in the OPF.
    for item in opf.iter():
        if item.get("media-type") == "application/xhtml+xml":
            href = item.get("href")
            if href:
                filelist.append(href)

    # Scan each content document; altlist() fills the message buffers.
    for href in filelist:
        altlist(href)

    for line in lista:
        print(line)
    for line in new_list:
        print(line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment