Skip to content

Instantly share code, notes, and snippets.

@gabalese
Last active December 21, 2015 04:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gabalese/6249780 to your computer and use it in GitHub Desktop.
Save gabalese/6249780 to your computer and use it in GitHub Desktop.
List the alt attribute of every img tag in the XHTML documents of an EPUB and print the list to stdout
#! /usr/bin/env python
# file: attributealtfield.py
# List the alt attribute of every img tag in the XHTML documents of an EPUB and print the list to stdout
# Usage: from the command line, python attributealtfield.py <epub.epub>
from __future__ import print_function
import os
import sys
import zipfile as ZIP
# Prefer lxml: it provides the tolerant HTMLParser that altlist() asks for.
# The standard-library ElementTree is only a fallback and has no HTMLParser.
try:
    from lxml import etree as ET
except ImportError:
    from xml.etree import ElementTree as ET

# Report lines appended by altlist(), one per <img> element found.
lista = []
# Copy of ``lista`` filled by the script entry point just before printing.
new_list = []
# hrefs of the OPF items whose media-type is application/xhtml+xml.
filelist = []
# Directory of the OPF file inside the archive; assigned by parseInfo().
root_folder = ""
# Namespace URIs in "{uri}" form, ready to be prefixed to a tag name when
# searching with ElementTree-style find().
namespaces = {
    "opf": "{http://www.idpf.org/2007/opf}",
    "dc": "{http://purl.org/dc/elements/1.1/}",
}
def _local_name(tag):
    """Return *tag* without its ``{namespace}`` prefix.

    lxml yields comments and processing instructions whose ``tag`` is a
    function rather than a string; those map to the empty string.
    """
    if callable(tag):
        return ""
    return tag.rpartition("}")[2]


def altlist(infile):
    """Record the alt text of every ``img`` element found in *infile*.

    *infile* is an href relative to the module-level ``root_folder``; the
    document is read from the archive named by ``sys.argv[1]``.

    For each ``img`` element that has an ``alt`` attribute, one line is
    appended to the module-level ``lista``.  For each ``img`` element
    without one, a warning is printed and nothing is appended.

    Raises ``KeyError`` (from ``zipfile``) when the member is not in the
    archive.
    """
    # An empty root_folder means the OPF sits at the archive root; joining
    # with a bare "/" would produce "/name", which is not a valid member name.
    member = "/".join(part for part in (root_folder, infile) if part)
    with ZIP.ZipFile(sys.argv[1]) as archive:
        data = archive.read(member)

    # lxml offers a tolerant HTMLParser; the standard-library ElementTree
    # does not, so fall back to its (strict) XMLParser there.
    parser_factory = getattr(ET, "HTMLParser", None) or ET.XMLParser
    html = ET.fromstring(data, parser_factory())

    for element in html.iter():
        # An XML parser reports XHTML tags as "{namespace}img".
        if _local_name(element.tag) != "img":
            continue
        alt = element.get("alt")
        # get() returns None for a missing attribute; it never raises.
        if alt is None:
            print("img tag in '{}' does not have a alt attribute!".format(infile))
            continue
        lista.append(
            "{} tag in {} contains alt text '{}'".format(element, infile, alt)
        )
def parseInfo(filename):
    """Locate the OPF and NCX documents inside the EPUB *filename*.

    Returns a dict with:

    * ``"path_to_opf"`` - the ``full-path`` of the rootfile declared in
      ``META-INF/container.xml``;
    * ``"path_to_ncx"`` - archive path of the item referenced by the spine's
      ``toc`` attribute, or ``None`` when the OPF declares no such item.

    When ``META-INF/container.xml`` is missing or declares no rootfile, a
    message is printed and an empty dict is returned.

    Side effect: sets the module-level ``root_folder`` to the directory of
    the OPF file.

    Raises ``KeyError`` (from ``zipfile``) when the declared OPF path is not
    in the archive.
    """
    global root_folder
    info = {}

    with ZIP.ZipFile(filename) as archive:
        try:
            raw_container = archive.read("META-INF/container.xml")
        except KeyError:
            print("The %s file is not a valid OCF." % str(filename))
            return info

        container = ET.fromstring(raw_container)
        # Search by local name instead of indexing [0][0]: lxml exposes
        # comments as children, which would shift positional indexes.
        opf_path = None
        for element in container.iter():
            if callable(element.tag):
                continue
            if element.tag.rpartition("}")[2] == "rootfile":
                opf_path = element.get("full-path")
                break
        if not opf_path:
            print("The %s file is not a valid OCF." % str(filename))
            return info

        info["path_to_opf"] = opf_path
        root_folder = os.path.dirname(opf_path)
        opf = ET.fromstring(archive.read(opf_path))

    # Compare with None explicitly: childless elements are falsy.
    spine = opf.find("{0}spine".format(namespaces["opf"]))
    toc_id = spine.get("toc") if spine is not None else None
    ncx_item = opf.find(".//*[@id='%s']" % toc_id) if toc_id else None
    ncx_href = ncx_item.get("href") if ncx_item is not None else None

    if ncx_href:
        # Skip an empty root_folder so the path never starts with "/".
        info["path_to_ncx"] = "/".join(
            part for part in (root_folder, ncx_href) if part
        )
    else:
        info["path_to_ncx"] = None
    return info
def parseOPF(filename):
    """Return the parsed root element of the OPF document in *filename*.

    The OPF location is taken from ``parseInfo(filename)["path_to_opf"]``.

    Raises ``KeyError`` when ``parseInfo`` returns no ``"path_to_opf"`` entry
    or when that path is not a member of the archive.
    """
    opf_path = parseInfo(filename)["path_to_opf"]
    # The context manager closes the archive handle once the bytes are read.
    with ZIP.ZipFile(filename) as archive:
        return ET.fromstring(archive.read(opf_path))
# Script entry point: print one line per <img> alt text found in the EPUB
# given as the only command-line argument.
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("USAGE: ./attributealtfield.py <epubfile>")
        # A bare sys.exit() reports success (status 0); a usage error
        # should be visible to the calling shell.
        sys.exit(2)

    epub_path = sys.argv[1]
    opf = parseOPF(epub_path)

    # Collect the href of every OPF element declared as XHTML content.
    filelist.extend(
        item.get("href")
        for item in opf.iter()
        if item.get("media-type") == "application/xhtml+xml"
    )

    # altlist() appends its findings to the module-level ``lista``.
    for href in filelist:
        altlist(href)

    new_list.extend(lista)
    for message in new_list:
        print(message)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment