gabalese/attributealtfield.py

## attributealtfield.py
#! /usr/bin/env python
# file: attributealtfield.py
# List the alt attribute of every <img> element found in an EPUB's XHTML documents and print the list to stdout
# Usage: from the command line, python attributealtfield.py <epub.epub>

from __future__ import print_function
import os
import sys
import zipfile as ZIP

try:
    from lxml import etree as ET
except ImportError:
    from xml.etree import ElementTree as ET

# Report lines, one per <img> element; appended to by altlist().
lista = []
# Second list that the __main__ block fills from `lista` and then prints.
new_list = []
# hrefs of OPF items whose media-type is "application/xhtml+xml"; filled by the
# __main__ block and passed one by one to altlist().
filelist = []
# Directory part of the OPF path inside the archive; overwritten by parseInfo()
# and used as a path prefix by altlist() and parseInfo().
root_folder = ""
# Namespace URIs wrapped in braces so they can be prepended to a tag name when
# calling find(); only the "opf" entry is used by the code visible in this file.
namespaces = {"opf": "{http://www.idpf.org/2007/opf}", "dc": "{http://purl.org/dc/elements/1.1/}"}


def altlist(infile, epub_path=None):
    """Collect the alt text of every ``img`` element in one document of an EPUB.

    For each ``img`` element that carries an ``alt`` attribute, a line of the
    form ``"img tag in <infile> contains alt text '<alt>'"`` is appended to the
    module-level ``lista``.  For each ``img`` element without one, a warning is
    printed to stdout instead.

    Args:
        infile: href of the document, relative to the module-level
            ``root_folder`` (the directory that holds the OPF file).
        epub_path: path of the EPUB archive.  Defaults to ``sys.argv[1]``,
            which is what this function always used before the parameter
            existed.

    Raises:
        KeyError: if the document is not present in the archive.
    """
    if epub_path is None:
        epub_path = sys.argv[1]

    # Zip member names never start with "/", so the folder prefix is only
    # added when the OPF does not live at the archive root.
    member = "/".join(part for part in (root_folder, infile) if part)

    # Read inside a context manager so the archive handle is always closed.
    with ZIP.ZipFile(epub_path) as archive:
        data = archive.read(member)

    # lxml offers a tolerant HTML parser; the xml.etree fallback has no
    # HTMLParser attribute, so fall back to a strict XML parse there.
    html_parser_cls = getattr(ET, "HTMLParser", None)
    if html_parser_cls is not None:
        html = ET.fromstring(data, html_parser_cls())
    else:
        html = ET.fromstring(data)
    if html is None:
        # lxml's HTML parser can yield no root element for an empty document.
        return

    for element in html.iter():
        tag = element.tag
        if not isinstance(tag, str):
            # lxml yields comments / processing instructions whose tag is not
            # a string; they can never be an <img>.
            continue
        # A strict XML parse reports "{namespace}img"; compare the local name
        # so both parsers recognise the element.
        if tag.rpartition("}")[2] != "img":
            continue

        # Element.get() returns None for a missing attribute (it does not
        # raise), so the missing-alt case must be tested explicitly.
        alt = element.get("alt")
        if alt is None:
            print("img tag in '{}' does not have a alt attribute!".format(infile))
        else:
            # Format the tag name rather than the element object, whose repr
            # embeds a memory address.
            lista.append("{} tag in {} contains alt text '{}'".format("img", infile, alt))


def parseInfo(filename):
    """Locate the OPF package document (and the NCX, if any) inside an EPUB.

    Reads ``META-INF/container.xml`` to find the OPF path and stores the
    directory part of that path in the module-level ``root_folder``.

    Args:
        filename: path of the EPUB archive.

    Returns:
        A dict with the key ``"path_to_opf"`` and, when the spine names a
        ``toc`` item that exists and has an ``href``, ``"path_to_ncx"``.
        Both values are member paths inside the archive.

    Raises:
        KeyError: if the archive has no ``META-INF/container.xml``, if no
            element in it carries a ``full-path`` attribute, or if the OPF
            member it points to is missing.
    """
    global root_folder
    info = {}

    # One archive handle for both reads, closed on every exit path.
    with ZIP.ZipFile(filename) as archive:
        try:
            container_xml = archive.read("META-INF/container.xml")
        except KeyError:
            print("The %s file is not a valid OCF." % str(filename))
            # Stop here instead of falling through to an unrelated
            # KeyError on info["path_to_opf"].
            raise

        # Look the rootfile up by its attribute instead of by position, so
        # extra children or comments in container.xml cannot break it.
        opf_path = None
        for node in ET.fromstring(container_xml).iter():
            candidate = node.get("full-path")
            if candidate is not None:
                opf_path = candidate
                break
        if opf_path is None:
            print("The %s file is not a valid OCF." % str(filename))
            raise KeyError("full-path")

        info["path_to_opf"] = opf_path
        root_folder = os.path.dirname(opf_path)
        opf = ET.fromstring(archive.read(opf_path))

    # The spine's "toc" attribute (and hence the NCX) may be absent; treat it
    # as optional rather than crashing with AttributeError.
    spine = opf.find("{0}spine".format(namespaces["opf"]))
    toc_id = spine.get("toc") if spine is not None else None
    if toc_id is not None:
        ncx_item = opf.find(".//*[@id='%s']" % toc_id)
        ncx_name = ncx_item.get("href") if ncx_item is not None else None
        if ncx_name:
            # No leading "/" when the OPF sits at the archive root.
            info["path_to_ncx"] = "/".join(
                part for part in (root_folder, ncx_name) if part
            )
    return info


def parseOPF(filename):
    """Return the parsed root element of an EPUB's OPF package document.

    Args:
        filename: path of the EPUB archive.

    Returns:
        The root element produced by ``ET.fromstring`` for the OPF file whose
        path ``parseInfo(filename)`` reports as ``"path_to_opf"``.
    """
    opf_path = parseInfo(filename)["path_to_opf"]
    # Use a context manager so the archive handle is closed once the OPF
    # bytes have been read, instead of being leaked.
    with ZIP.ZipFile(filename) as archive:
        return ET.fromstring(archive.read(opf_path))


if __name__ == "__main__":
    # Command-line entry point: list the alt text of every <img> found in the
    # XHTML documents declared by the EPUB given as the single argument.
    if len(sys.argv) != 2:
        print("USAGE: ./attributealtfield.py <epubfile>")
        # Exit with a non-zero status so callers can detect the bad invocation.
        sys.exit(1)

    epub_path = sys.argv[1]
    package = parseOPF(epub_path)

    # Every OPF element declaring the XHTML media type is a content document.
    filelist.extend(
        entry.get("href")
        for entry in package.iter()
        if entry.get("media-type") == "application/xhtml+xml"
    )

    for href in filelist:
        altlist(href)

    # Keep new_list populated exactly as before, without the manual copy loop.
    new_list.extend(lista)

    for line in new_list:
        print(line)
	#! /usr/bin/env python
	# file: attributealtfield.py
	# Make a list of every img alt attribute in htmls and print to stdout
	# Usage: from the command line, python attributealtfield.py <epub.epub>

	from __future__ import print_function
	import os
	import sys
	import zipfile as ZIP

	try:
	from lxml import etree as ET
	except ImportError:
	from xml.etree import ElementTree as ET

	# NOTE(review): these assignments repeat the module-level state declared
	# earlier in this file; this second copy of the script is tab-prefixed and
	# has lost its nesting indentation.
	lista = []
	new_list = []
	filelist = []
	root_folder = ""
	namespaces = {"opf": "{http://www.idpf.org/2007/opf}", "dc": "{http://purl.org/dc/elements/1.1/}"}


	# NOTE(review): whitespace-flattened duplicate of the altlist definition that
	# appears earlier in this file.  The body is no longer indented under the
	# def, so this copy is not valid Python as written.
	# Purpose: read `infile` (prefixed with root_folder) from the archive named by
	# sys.argv[1] and append one report line per <img> element to `lista`.
	def altlist(infile):
	global lista
	global new_list
	# init a tolerant parser
	parser = ET.HTMLParser()
	html = ET.fromstring(ZIP.ZipFile(sys.argv[1]).read(root_folder + "/" + infile), parser)

	for i in html.iter():
	if i.tag == "img":
	try:
	string = "{} tag in {} contains alt text '{}'".format(i, infile, i.get("alt"))
	lista.append(string)
	# NOTE(review): Element.get() returns None for a missing attribute rather
	# than raising, so this handler looks unreachable -- confirm.
	except AttributeError:
	print("img tag in '{}' does not have a alt attribute!".format(infile))


	# NOTE(review): whitespace-flattened duplicate of the parseInfo definition
	# that appears earlier in this file; nesting indentation has been lost, so
	# this copy is not valid Python as written.
	# Purpose: read META-INF/container.xml from the archive, record the OPF path
	# under "path_to_opf", set the global root_folder to its directory, and
	# record the NCX path under "path_to_ncx".
	def parseInfo(filename):
	info = {}
	global root_folder
	try:
	f = ZIP.ZipFile(filename).read("META-INF/container.xml")
	except KeyError:
	print("The %s file is not a valid OCF." % str(filename))
	else:
	f = ET.fromstring(f)
	# first grandchild of the container root is taken to be the rootfile element
	info["path_to_opf"] = f[0][0].get("full-path")
	root_folder = os.path.dirname(info["path_to_opf"])
	opf = ET.fromstring(ZIP.ZipFile(filename).read(info["path_to_opf"]))

	# the spine's "toc" attribute holds the id of the item whose href is the NCX
	toc_id = opf.find("{0}spine".format(namespaces["opf"])).get("toc")
	expr = ".//*[@id='%s']" % toc_id
	info["ncx_name"] = opf.find(expr).get("href")
	info["path_to_ncx"] = root_folder + "/" + info["ncx_name"]
	# ncx_name was only a temporary; it is not part of the returned dict
	info.pop("ncx_name")
	return info


	# NOTE(review): whitespace-flattened duplicate of the parseOPF definition
	# that appears earlier in this file; not valid Python as written.
	# Purpose: parse and return the OPF document whose path parseInfo() reports.
	def parseOPF(filename):
	opf = ET.fromstring(ZIP.ZipFile(filename).read(parseInfo(filename)["path_to_opf"]))

	return opf


	# NOTE(review): whitespace-flattened duplicate of the __main__ block that
	# appears earlier in this file; nesting indentation has been lost, so this
	# copy is not valid Python as written.
	if __name__ == "__main__":
	if len(sys.argv) != 2:
	print("USAGE: ./attributealtfield.py <epubfile>")
	sys.exit()

	i = sys.argv[1]
	opf = parseOPF(i)
	# collect the href of every OPF element declaring the XHTML media type
	for item in opf.iter():
	if item.get("media-type") == "application/xhtml+xml":
	filelist.append(item.get("href"))

	for item in filelist:
	altlist(item)

	# copy the collected report lines into new_list, then print them
	for i in lista:
	new_list.append(i)

	for i in new_list:
	print(i)