## usernamenumber/process_html-3.py

## process_html-3.py
#!/usr/bin/python

##
##  This script does initial cleanup of an HTML export of the CSE book from gdocs
##  After cleanup, it will probably need to be imported back into gdocs for manual
##  tweaks.
##
##  Usage: just pass one or more HTML filenames as arguments.
##         Processed versions are output to `dstdir`, defined below.
##

import base64
import os, os.path
import lxml.etree
from lxml.cssselect import CSSSelector
import sys
# Directory that receives the processed copies of the input files.  It is a
# relative path, so it is resolved against the current working directory.
dstdir = "processed-3"
# Text/tail values that count as "no content".  lxml hands back
# entity-decoded text, so a non-breaking space arrives as U+00A0 and never as
# the literal markup "&nbsp;" / "&#160;"; the literal forms are kept so that
# nothing that matched before stops matching.
no_content = ["", "&nbsp;", "&#160;", u"\u00a0", " ", None]
# Tags that are always unwrapped (tag dropped, contents kept).
remove_tags = ["span", "hr"]
# Tags that are unwrapped only when they have no text and no children.
remove_tags_if_empty = ["p"]
# Attributes stripped from every element.
remove_attrs = ["class", "style"]
# tag -> value of the class attribute forced onto elements with that tag.
force_classes = {
    "h1": "text-center title-heading",
    "h2": "text-center title-subheading",
}
# old tag -> replacement tag.
tags_to_tags = {
    "h6": "h3",
}
# CSS selector -> replacement tag (consumed by convert_tags()).
classes_to_tags = {
    "span.c7": "em",
    "p.c37": "blockquote",
    "span.c59": "em",
    "span.c11": "em",
}

def _append_text_before(parent, previous, text):
    """Append *text* at the position just before a child being unwrapped.

    That position is the tail of *previous* (the preceding sibling) when
    there is one, otherwise the leading text of *parent*.
    """
    if previous is not None:
        if previous.tail in no_content:
            previous.tail = ""
        previous.tail += text
    else:
        if parent.text in no_content:
            parent.text = ""
        parent.text += text


def remove_tag(s):
    """Unwrap element *s*: delete the tag but keep its contents in place.

    The element's text, child elements and tail are re-attached to the
    surrounding tree in document order before *s* itself is removed from
    its parent.  Text or tail values listed in ``no_content`` are dropped
    rather than carried over.

    s: an lxml element that has a parent (``s.getparent()`` must not be
       None).
    Returns None; the tree is modified in place.
    """
    parent = s.getparent()
    children = list(s)
    previous = s.getprevious()
    if s.text not in no_content:
        _append_text_before(parent, previous, s.text)
    # Hoist the children so they sit where s used to be, in document order.
    for c in children:
        s.addprevious(c)
    if s.tail not in no_content:
        print("TAIL: %s" % s.tail)
        if children:
            # The tail follows the last hoisted child.
            last = children[-1]
            if last.tail in ("", "&nbsp;", None):
                last.tail = ""
            last.tail += s.tail
        else:
            # Nothing was hoisted, so the tail belongs right after whatever
            # precedes s inside the parent.  Appending it to parent.tail
            # would move the text outside the parent element.
            _append_text_before(parent, previous, s.tail)
    parent.remove(s)

def has_text(e):
    """Return True if *e* contains any text that is not in ``no_content``.

    Both the ``text`` of *e* and of every descendant, and the ``tail`` of
    every descendant, are inspected.  The tail of *e* itself is ignored
    because it lies outside the element.

    e: an element exposing ``iter()``, ``text`` and ``tail``.
    """
    for node in e.iter():
        if node.text not in no_content:
            return True
        # A descendant's tail is still inside e; e's own tail is not.
        if node is not e and node.tail not in no_content:
            return True
    return False

def is_empty(e):
    """Tell whether *e* has neither meaningful text nor child elements.

    Returns True only when ``e.text`` is one of the ``no_content`` values
    and *e* has no children; False otherwise.
    """
    return e.text in no_content and len(e.getchildren()) == 0

def clean(e):
    """Recursively normalise *e* and its descendants in place.

    For every child (after cleaning that child's own subtree):
      * tags in ``remove_tags`` are unwrapped;
      * tags in ``remove_tags_if_empty`` are unwrapped when empty;
      * tags in ``tags_to_tags`` are renamed.
    Then the attributes in ``remove_attrs`` are stripped from *e* and the
    class from ``force_classes`` is applied if *e*'s tag is listed there.

    e: an lxml element.
    Returns *e* (the same object).
    """
    # Snapshot the children: remove_tag() edits e's child list while we loop.
    for c in list(e):
        clean(c)
        if c.tag in remove_tags:
            remove_tag(c)
        elif c.tag in remove_tags_if_empty and is_empty(c):
            remove_tag(c)
        elif c.tag in tags_to_tags:
            c.tag = tags_to_tags[c.tag]
    for a in remove_attrs:
        if a in e.attrib:
            del e.attrib[a]
    if e.tag in force_classes:
        e.set("class", force_classes[e.tag])
    return e

def _append_inner_text(el, text):
    """Append *text* to the very end of *el*'s content."""
    if len(el):
        # Text after the last child lives in that child's tail.
        last = el[-1]
        last.tail = (last.tail or "") + text
    else:
        el.text = (el.text or "") + text


def convert_tags(e):
    """Turn elements matched by ``classes_to_tags`` selectors into new tags.

    Each match with meaningful text is either merged into an adjacent
    sibling that already has the target tag (when only ``no_content``
    separates them) or renamed to the target tag.  Matches whose text is in
    ``no_content`` are left untouched.

    e: the lxml element the CSS selectors are evaluated against.
    Returns None; the tree is modified in place.
    """
    for (c, t) in classes_to_tags.items():
        for m in CSSSelector(c)(e):
            if m.text in no_content:
                continue
            p = m.getprevious()
            n = m.getnext()
            # Merging copies only m.text, so an element that has child
            # elements is renamed instead; merging it would drop them.
            mergeable = len(m) == 0
            if mergeable and p is not None and p.tag == t and p.tail in no_content:
                _append_inner_text(p, m.text)
                if m.tail is not None:
                    p.tail = m.tail
                m.getparent().remove(m)
            elif mergeable and n is not None and n.tag == t and m.tail in no_content:
                # n.text is None when n starts with a child element.
                n.text = m.text + (n.text or "")
                m.getparent().remove(m)
            else:
                m.tag = t

def remove_page_numbers(h):
    """Remove paragraphs that consist of nothing but a page number.

    A paragraph qualifies when its only child element is a <span> and the
    paragraph's whole text content is digits (at least one) plus optional
    whitespace.  CSS classes are not used because they appear to be
    randomly named with each export.

    h: an lxml element; the XPath is absolute, so the whole document that
       contains *h* is searched.
    Returns None; the tree is modified in place.
    """
    # c.f. http://lxml.de/xpathxslt.html#regular-expressions-in-xpath
    regexpNS = "http://exslt.org/regular-expressions"
    # Testing string(.) of the <p> (not just the span) keeps paragraphs that
    # carry their own text next to a numeric span.
    page_number_paragraphs = h.xpath(
        r"//p[count(child::*) = 1 and child::span"
        r" and re:test(string(.), '^\s*[0-9][0-9\s]*$')]",
        namespaces={'re': regexpNS})
    for paragraph in page_number_paragraphs:
        paragraph.getparent().remove(paragraph)

def inline_images(h, base_dir=None):
    """Replace each <img> file reference with an embedded base64 data: URI.

    h: an lxml element; the XPath is absolute, so every <img> in the
       document that contains *h* is visited.
    base_dir: directory that ``src`` values are resolved against.  When
       omitted, the directory of the module-level ``fn`` (the input file
       currently being processed) is used.

    Images without a ``src`` and images that are already ``data:`` URIs are
    skipped.  An unreadable image file raises IOError/OSError.
    Returns None; the tree is modified in place.
    """
    import mimetypes  # only needed here

    if base_dir is None:
        base_dir = os.path.dirname(fn)
    for img in h.xpath("//img"):
        src = img.get("src")
        if not src or src.startswith("data:"):
            continue
        mime = mimetypes.guess_type(src)[0]
        if mime is None:
            # Unknown extension: fall back to the plain "image/<ext>" guess.
            ext = os.path.basename(src).split(os.path.extsep)[-1]
            mime = "image/%s" % ext
        # Binary mode: image bytes must not pass through text decoding.
        with open(os.path.join(base_dir, src), "rb") as image_file:
            payload = base64.b64encode(image_file.read())
        # b64encode returns bytes on Python 3; the attribute needs text.
        img.set("src", "data:%s;base64,%s" % (mime, payload.decode("ascii")))

def anchor_subsections(h,tag="h6"):
    """Put a numbered ``<a name="subsectionN">`` anchor before each heading.

    Every element matched by ``//<tag>`` gets an anchor as its immediately
    preceding sibling, numbered from 1 in match order.  If the preceding
    sibling is already an <a> whose name starts with "subsection", that
    element is reused and renumbered instead of creating a new one.
    """
    for number, heading in enumerate(h.xpath("//" + tag), 1):
        neighbour = heading.getprevious()
        reusable = (
            neighbour is not None
            and neighbour.tag == "a"
            and neighbour.get("name", "").startswith('subsection')
        )
        marker = neighbour if reusable else heading.makeelement("a")
        marker.set("name", "subsection%s" % number)
        heading.addprevious(marker)

# Process every file named on the command line.  The loop variable must stay
# a module-level name called `fn`: inline_images() reads it to locate images.
for fn in sys.argv[1:]:
    # Read the raw bytes and close the handle straight away.  Passing bytes
    # leaves encoding detection to the HTML parser instead of the platform's
    # default text decoding.
    with open(fn, "rb") as source_file:
        h = lxml.etree.HTML(source_file.read()).find("body")
    # Modify and clean up HTML
    remove_page_numbers(h)
    inline_images(h)
    anchor_subsections(h)
    #convert_tags(h)
    clean(h)
    # Add slick-carousel tags: <div class="page"><div class="page-content">
    outer = h.makeelement("div")
    outer.set("class", "page")
    h.tag = "div"
    h.set("class", "page-content")
    outer.append(h)
    # Write the whole thing out to a new file
    if not os.path.exists(dstdir):
        os.mkdir(dstdir)
    outpath = os.path.join(dstdir, os.path.basename(fn))
    print("Writing %s" % outpath)
    lxml.etree.ElementTree(outer).write(outpath, method="html", pretty_print=True)
	#!/usr/bin/python

	##
	## This script does initial cleanup of an HTML export of the CSE book from gdocs
	## After cleanup, it will probably need to be imported back into gdocs for manual
	## tweaks.
	##
	## Usage: just pass one or more HTML filenames as arguments.
	## Processed versions are output to `dstdir`, defined below.
	##

	import base64
	import os, os.path
	import lxml.etree
	from lxml.cssselect import CSSSelector
	import sys
	# Directory that receives the processed copies of the input files.  It is
	# a relative path, so it is resolved against the current working directory.
	dstdir = "processed-3"
	# Text/tail values that count as "no content".  The entries are written
	# with explicit escapes so they cannot be confused with one another: the
	# literal entity markup, a real non-breaking space (U+00A0, which is what
	# an entity-decoding parser returns) and a plain space.
	no_content = ["", "&nbsp;", "&#160;", u"\u00a0", " ", None]
	# Tags that are always unwrapped (tag dropped, contents kept).
	remove_tags = ["span", "hr"]
	# Tags that are unwrapped only when they have no text and no children.
	remove_tags_if_empty = ["p"]
	# Attributes stripped from every element.
	remove_attrs = ["class", "style"]
	# tag -> value of the class attribute forced onto elements with that tag.
	force_classes = {
	    "h1": "text-center title-heading",
	    "h2": "text-center title-subheading",
	}
	# old tag -> replacement tag.
	tags_to_tags = {
	    "h6": "h3",
	}
	# CSS selector -> replacement tag (consumed by convert_tags()).
	classes_to_tags = {
	    "span.c7": "em",
	    "p.c37": "blockquote",
	    "span.c59": "em",
	    "span.c11": "em",
	}

	def _append_text_before(parent, previous, text):
	    """Append *text* at the position just before a child being unwrapped.

	    That position is the tail of *previous* (the preceding sibling) when
	    there is one, otherwise the leading text of *parent*.
	    """
	    if previous is not None:
	        if previous.tail in no_content:
	            previous.tail = ""
	        previous.tail += text
	    else:
	        if parent.text in no_content:
	            parent.text = ""
	        parent.text += text


	def remove_tag(s):
	    """Unwrap element *s*: delete the tag but keep its contents in place.

	    The element's text, child elements and tail are re-attached to the
	    surrounding tree in document order before *s* itself is removed from
	    its parent.  Text or tail values listed in ``no_content`` are dropped
	    rather than carried over.

	    s: an lxml element that has a parent (``s.getparent()`` must not be
	       None).
	    Returns None; the tree is modified in place.
	    """
	    parent = s.getparent()
	    children = list(s)
	    previous = s.getprevious()
	    if s.text not in no_content:
	        _append_text_before(parent, previous, s.text)
	    # Hoist the children so they sit where s used to be, in document order.
	    for c in children:
	        s.addprevious(c)
	    if s.tail not in no_content:
	        print("TAIL: %s" % s.tail)
	        if children:
	            # The tail follows the last hoisted child.
	            last = children[-1]
	            if last.tail in ("", "&nbsp;", u"\u00a0", None):
	                last.tail = ""
	            last.tail += s.tail
	        else:
	            # Nothing was hoisted, so the tail belongs right after whatever
	            # precedes s inside the parent.  Appending it to parent.tail
	            # would move the text outside the parent element.
	            _append_text_before(parent, previous, s.tail)
	    parent.remove(s)

	def has_text(e):
	    """Return True if *e* contains any text that is not in ``no_content``.

	    Both the ``text`` of *e* and of every descendant, and the ``tail`` of
	    every descendant, are inspected.  The tail of *e* itself is ignored
	    because it lies outside the element.

	    e: an element exposing ``iter()``, ``text`` and ``tail``.
	    """
	    for node in e.iter():
	        if node.text not in no_content:
	            return True
	        # A descendant's tail is still inside e; e's own tail is not.
	        if node is not e and node.tail not in no_content:
	            return True
	    return False

	def is_empty(e):
	    """Tell whether *e* has neither meaningful text nor child elements.

	    Returns True only when ``e.text`` is one of the ``no_content`` values
	    and *e* has no children; False otherwise.
	    """
	    if e.text not in no_content:
	        return False
	    if len(e.getchildren()) > 0:
	        return False
	    return True

	def clean(e):
	    """Recursively normalise *e* and its descendants in place.

	    For every child (after cleaning that child's own subtree):
	      * tags in ``remove_tags`` are unwrapped;
	      * tags in ``remove_tags_if_empty`` are unwrapped when empty;
	      * tags in ``tags_to_tags`` are renamed.
	    Then the attributes in ``remove_attrs`` are stripped from *e* and the
	    class from ``force_classes`` is applied if *e*'s tag is listed there.

	    e: an lxml element.
	    Returns *e* (the same object).
	    """
	    # Snapshot the children: remove_tag() edits e's child list while we loop.
	    for c in list(e):
	        clean(c)
	        if c.tag in remove_tags:
	            remove_tag(c)
	        elif c.tag in remove_tags_if_empty and is_empty(c):
	            remove_tag(c)
	        elif c.tag in tags_to_tags:
	            c.tag = tags_to_tags[c.tag]
	    for a in remove_attrs:
	        if a in e.attrib:
	            del e.attrib[a]
	    if e.tag in force_classes:
	        e.set("class", force_classes[e.tag])
	    return e

	def _append_inner_text(el, text):
	    """Append *text* to the very end of *el*'s content."""
	    if len(el):
	        # Text after the last child lives in that child's tail.
	        last = el[-1]
	        last.tail = (last.tail or "") + text
	    else:
	        el.text = (el.text or "") + text


	def convert_tags(e):
	    """Turn elements matched by ``classes_to_tags`` selectors into new tags.

	    Each match with meaningful text is either merged into an adjacent
	    sibling that already has the target tag (when only ``no_content``
	    separates them) or renamed to the target tag.  Matches whose text is
	    in ``no_content`` are left untouched.

	    e: the lxml element the CSS selectors are evaluated against.
	    Returns None; the tree is modified in place.
	    """
	    for (c, t) in classes_to_tags.items():
	        for m in CSSSelector(c)(e):
	            if m.text in no_content:
	                continue
	            p = m.getprevious()
	            n = m.getnext()
	            # Merging copies only m.text, so an element that has child
	            # elements is renamed instead; merging it would drop them.
	            mergeable = len(m) == 0
	            if mergeable and p is not None and p.tag == t and p.tail in no_content:
	                _append_inner_text(p, m.text)
	                if m.tail is not None:
	                    p.tail = m.tail
	                m.getparent().remove(m)
	            elif mergeable and n is not None and n.tag == t and m.tail in no_content:
	                # n.text is None when n starts with a child element.
	                n.text = m.text + (n.text or "")
	                m.getparent().remove(m)
	            else:
	                m.tag = t

	def remove_page_numbers(h):
	    """Remove paragraphs that consist of nothing but a page number.

	    A paragraph qualifies when its only child element is a <span> and the
	    paragraph's whole text content is digits (at least one) plus optional
	    whitespace.  CSS classes are not used because they appear to be
	    randomly named with each export.

	    h: an lxml element; the XPath is absolute, so the whole document that
	       contains *h* is searched.
	    Returns None; the tree is modified in place.
	    """
	    # c.f. http://lxml.de/xpathxslt.html#regular-expressions-in-xpath
	    regexpNS = "http://exslt.org/regular-expressions"
	    # Testing string(.) of the <p> (not just the span) keeps paragraphs
	    # that carry their own text next to a numeric span.
	    page_number_paragraphs = h.xpath(
	        r"//p[count(child::*) = 1 and child::span"
	        r" and re:test(string(.), '^\s*[0-9][0-9\s]*$')]",
	        namespaces={'re': regexpNS})
	    for paragraph in page_number_paragraphs:
	        paragraph.getparent().remove(paragraph)

	def inline_images(h, base_dir=None):
	    """Replace each <img> file reference with an embedded base64 data: URI.

	    h: an lxml element; the XPath is absolute, so every <img> in the
	       document that contains *h* is visited.
	    base_dir: directory that ``src`` values are resolved against.  When
	       omitted, the directory of the module-level ``fn`` (the input file
	       currently being processed) is used.

	    Images without a ``src`` and images that are already ``data:`` URIs
	    are skipped.  An unreadable image file raises IOError/OSError.
	    Returns None; the tree is modified in place.
	    """
	    import mimetypes  # only needed here

	    if base_dir is None:
	        base_dir = os.path.dirname(fn)
	    for img in h.xpath("//img"):
	        src = img.get("src")
	        if not src or src.startswith("data:"):
	            continue
	        mime = mimetypes.guess_type(src)[0]
	        if mime is None:
	            # Unknown extension: fall back to the plain "image/<ext>" guess.
	            ext = os.path.basename(src).split(os.path.extsep)[-1]
	            mime = "image/%s" % ext
	        # Binary mode: image bytes must not pass through text decoding.
	        with open(os.path.join(base_dir, src), "rb") as image_file:
	            payload = base64.b64encode(image_file.read())
	        # b64encode returns bytes on Python 3; the attribute needs text.
	        img.set("src", "data:%s;base64,%s" % (mime, payload.decode("ascii")))

	def anchor_subsections(h,tag="h6"):
	    """Put a numbered ``<a name="subsectionN">`` anchor before each heading.

	    Every element matched by ``//<tag>`` gets an anchor as its immediately
	    preceding sibling, numbered from 1 in match order.  If the preceding
	    sibling is already an <a> whose name starts with "subsection", that
	    element is reused and renumbered instead of creating a new one.
	    """
	    subsection_num = 0
	    for e in h.xpath("//" + tag):
	        prev = e.getprevious()
	        if prev is not None and prev.tag == "a" and prev.get("name", "").startswith('subsection'):
	            anchor = prev
	        else:
	            anchor = e.makeelement("a")
	        subsection_num += 1
	        anchor.set("name", "subsection%s" % subsection_num)
	        e.addprevious(anchor)

	# Process every file named on the command line.  The loop variable must
	# stay a module-level name called `fn`: inline_images() reads it to
	# locate images.
	for fn in sys.argv[1:]:
	    # Read the raw bytes and close the handle straight away.  Passing
	    # bytes leaves encoding detection to the HTML parser instead of the
	    # platform's default text decoding.
	    with open(fn, "rb") as source_file:
	        h = lxml.etree.HTML(source_file.read()).find("body")
	    # Modify and clean up HTML
	    remove_page_numbers(h)
	    inline_images(h)
	    anchor_subsections(h)
	    #convert_tags(h)
	    clean(h)
	    # Add slick-carousel tags: <div class="page"><div class="page-content">
	    outer = h.makeelement("div")
	    outer.set("class", "page")
	    h.tag = "div"
	    h.set("class", "page-content")
	    outer.append(h)
	    # Write the whole thing out to a new file
	    if not os.path.exists(dstdir):
	        os.mkdir(dstdir)
	    outpath = os.path.join(dstdir, os.path.basename(fn))
	    print("Writing %s" % outpath)
	    lxml.etree.ElementTree(outer).write(outpath, method="html", pretty_print=True)