Skip to content

Instantly share code, notes, and snippets.

@usernamenumber
Last active May 14, 2016 01:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save usernamenumber/bf648d6e7d3ffe17b12759a4ae999aa6 to your computer and use it in GitHub Desktop.
Save usernamenumber/bf648d6e7d3ffe17b12759a4ae999aa6 to your computer and use it in GitHub Desktop.
Example script to convert external images into base64-encoded content embedded into the HTML, plus HTML cleanup and restructuring
#!/usr/bin/python
##
## This script does initial cleanup of an HTML export of the CSE book from gdocs
## After cleanup, it will probably need to be imported back into gdocs for manual
## tweaks.
##
## Usage: just pass one or more HTML filenames as arguments.
## Processed versions are output to `dstdir`, defined below.
##
import base64
import os, os.path
import lxml.etree
from lxml.cssselect import CSSSelector
import sys
# Directory the processed files are written to. The main loop at the bottom
# of this script creates it on demand and writes each output file there
# under the input file's base name.
dstdir = "processed-3"
no_content = [""," "," "," ",None]
# Tags that clean() always unwraps via remove_tag().
remove_tags = ["span","hr"]
# Tags that clean() unwraps only when is_empty() reports no text and no
# child elements.
remove_tags_if_empty = ["p"]
# Attributes that clean() deletes from every element it visits.
remove_attrs = ["class","style"]
# tag -> value of the "class" attribute that clean() sets on elements with
# that tag (applied after remove_attrs has been stripped).
force_classes = {
"h1": "text-center title-heading",
"h2": "text-center title-subheading",
}
# old tag -> new tag; clean() renames child elements accordingly (only
# children that were not removed by the rules above).
tags_to_tags = {
"h6": "h3",
}
# CSS selector -> replacement tag, consumed by convert_tags().
# NOTE: the class names are specific to one export (see the remark in
# remove_page_numbers() about classes being renamed with each export), and
# the call to convert_tags() in the main loop is currently commented out.
classes_to_tags = {
"span.c7": "em",
"p.c37": "blockquote",
"span.c59": "em",
"span.c11": "em",
}
def remove_tag(s):
    """Unwrap element `s`: drop the tag itself but keep its content in place.

    The leading text of `s` is appended to whatever precedes it (the tail of
    the previous sibling, or the parent's text when `s` is the first child),
    the children of `s` are moved in front of it, and the tail of `s` is
    appended to whatever now directly precedes it. Finally `s` is removed
    from its parent.

    Text that is listed in the module-level `no_content` is treated as
    absent and is not carried over.

    `s` must have a parent; calling this on a root element raises
    AttributeError.
    """
    parent = s.getparent()
    children = list(s)
    previous = s.getprevious()

    # Leading text of `s` goes right after the node that precedes `s`.
    if s.text not in no_content:
        if previous is not None:
            if previous.tail in no_content:
                previous.tail = ""
            previous.tail += s.text
        else:
            if parent.text in no_content:
                parent.text = ""
            parent.text += s.text

    # Hoist the children so they take the place of `s`.
    for c in children:
        s.addprevious(c)

    # lxml's remove() discards the tail of the removed element together with
    # the element, so the tail has to be re-homed before `s` is removed.
    if s.tail not in no_content:
        print("TAIL: %s" % s.tail)
        if children:
            # The tail follows the last hoisted child.
            target, slot = children[-1], "tail"
        elif previous is not None:
            # No children: the tail follows the previous sibling (whose tail
            # may already have received s.text above).
            target, slot = previous, "tail"
        else:
            # `s` was the first and childless node: the tail belongs to the
            # parent's leading text, NOT to the parent's own tail, which
            # would place it outside the parent element.
            target, slot = parent, "text"
        existing = getattr(target, slot)
        if existing in ("", " ", None):
            existing = ""
        setattr(target, slot, existing + s.tail)

    parent.remove(s)
def has_text(e):
    """Return True if `e` contains any meaningful text, at any depth.

    Both the leading text of `e` and of every descendant and the tail text
    of every descendant are inspected; text that follows a nested element
    (for example after a <br>) is still inside `e`. The tail of `e` itself
    lies outside the element and is ignored. Values listed in the
    module-level `no_content` do not count as text.
    """
    # iter() yields `e` itself first, then all descendants in document order.
    for node in e.iter():
        if node.text not in no_content:
            return True
        if node is not e and node.tail not in no_content:
            return True
    return False
def is_empty(e):
    """Return True when `e` has neither meaningful text nor child elements.

    Leading text listed in the module-level `no_content` counts as absent.
    Only direct children are considered; the tail of `e` is not inspected.
    """
    return e.text in no_content and not len(e)
def clean(e):
    """Recursively tidy the subtree rooted at `e` and return `e`.

    For every child (processed depth-first, so a child's own subtree is
    already clean when the child is examined):
      * tags in `remove_tags` are unwrapped with remove_tag();
      * tags in `remove_tags_if_empty` are unwrapped only if is_empty();
      * tags in `tags_to_tags` are renamed.
    Then, on `e` itself, every attribute in `remove_attrs` is deleted and,
    if its tag appears in `force_classes`, the "class" attribute is set to
    the configured value.
    """
    # Iterate over a snapshot: remove_tag() restructures e's child list.
    for c in list(e):
        clean(c)
        if c.tag in remove_tags:
            remove_tag(c)
        elif c.tag in remove_tags_if_empty and is_empty(c):
            remove_tag(c)
        elif c.tag in tags_to_tags:
            c.tag = tags_to_tags[c.tag]

    for a in remove_attrs:
        if a in e.attrib:
            del e.attrib[a]

    # Applied after the attribute purge so the forced class survives it.
    if e.tag in force_classes:
        e.set("class", force_classes[e.tag])
    return e
def convert_tags(e):
    """Turn elements matched by `classes_to_tags` selectors into semantic tags.

    For each (CSS selector, new tag) pair, every match below `e` whose
    leading text is meaningful (not in `no_content`) is handled as follows:
      * if the previous sibling already has the new tag and nothing
        meaningful separates the two, the match is merged into the end of
        that sibling;
      * else if the next sibling already has the new tag and the match has
        no meaningful tail, the match is merged into the start of that
        sibling;
      * otherwise the match is simply renamed to the new tag.
    Merging moves the match's text and its child elements, so no content of
    the match is dropped. Matches without meaningful leading text are left
    untouched.
    """
    for selector, new_tag in classes_to_tags.items():
        for m in CSSSelector(selector)(e):
            if m.text in no_content:
                continue
            p = m.getprevious()
            n = m.getnext()
            if p is not None and p.tag == new_tag and p.tail in no_content:
                # Append m's text after p's existing content: behind the
                # last child of p if there is one, otherwise to p.text.
                if len(p):
                    last = p[-1]
                    last.tail = (last.tail or "") + m.text
                else:
                    p.text = (p.text or "") + m.text
                # Moving an element to a new parent takes it out of m.
                for child in list(m):
                    p.append(child)
                if m.tail is not None:
                    p.tail = m.tail
                m.getparent().remove(m)
            elif n is not None and n.tag == new_tag and m.tail in no_content:
                moved = list(m)
                if moved:
                    # n's old leading text now follows m's last child.
                    moved[-1].tail = (moved[-1].tail or "") + (n.text or "")
                    n.text = m.text
                    for position, child in enumerate(moved):
                        n.insert(position, child)
                else:
                    # n.text may be None when n starts with a child element.
                    n.text = m.text + (n.text or "")
                m.getparent().remove(m)
            else:
                m.tag = new_tag
def remove_page_numbers(h):
    """Delete paragraphs that consist solely of a page number.

    A <p> is treated as a page number when it has exactly one child
    element, that child is a <span> containing only digits and whitespace,
    and the paragraph as a whole also contains only digits and whitespace.
    The last condition keeps paragraphs such as
    <p>See page <span>12</span></p>, which carry real text next to the
    span. (CSS classes cannot be relied on, since they appear to be
    randomly named with each export.)

    The query starts with "//", so it searches the whole document that `h`
    belongs to, not only the subtree below `h`.

    c.f. http://lxml.de/xpathxslt.html#regular-expressions-in-xpath
    """
    regexpNS = "http://exslt.org/regular-expressions"
    # Raw string so that "\s" reaches the regex engine without relying on
    # Python leaving an unknown escape sequence untouched.
    digits_only = r"^[0-9\s]+$"
    query = (
        "//p[count(child::*) = 1"
        " and child::span[re:test(., '%s')]"
        " and re:test(., '%s')]" % (digits_only, digits_only)
    )
    for paragraph in h.xpath(query, namespaces={'re': regexpNS}):
        container = paragraph.getparent()
        if container is None:
            # A parentless <p> cannot be removed from anything.
            continue
        # lxml's remove() drops the element's tail as well; keep the tail if
        # it holds more than whitespace.
        tail = paragraph.tail
        if tail and tail.strip():
            before = paragraph.getprevious()
            if before is not None:
                before.tail = (before.tail or "") + tail
            else:
                container.text = (container.text or "") + tail
        container.remove(paragraph)
def inline_images(h, base_dir=None):
    """Replace every file-based <img src> with an inline base64 data: URI.

    h        -- lxml element; the "//img" query covers the whole document
                that `h` belongs to.
    base_dir -- directory that relative `src` paths are resolved against.
                When omitted, the directory of the module-level `fn` (the
                file currently being processed by the main loop) is used,
                which is what this function has always done.

    Images without a `src`, or whose `src` already is a data: URI, are left
    alone. A `src` that cannot be opened as a file raises the usual
    IOError/OSError from open().

    NOTE: the media subtype is the raw file extension (e.g. "jpg"), not a
    looked-up MIME type.
    """
    for img in h.xpath("//img"):
        src = img.get("src")
        if not src or src.startswith("data:"):
            continue
        # Resolved lazily so `fn` is only needed when there is work to do.
        directory = os.path.dirname(fn) if base_dir is None else base_dir
        ext = os.path.basename(src).split(os.path.extsep)[-1]
        # Images are binary: read bytes, and close the file promptly.
        with open(os.path.join(directory, src), "rb") as image_file:
            payload = base64.b64encode(image_file.read())
        # b64encode returns bytes on Python 3; "%s" would render "b'...'".
        if not isinstance(payload, str):
            payload = payload.decode("ascii")
        img.set("src", "data:image/%s;base64,%s" % (ext, payload))
def anchor_subsections(h,tag="h6"):
    """Put a numbered <a name="subsectionN"> anchor in front of each heading.

    Every element found by the "//<tag>" query is numbered in document
    order, starting at 1. If the heading is already directly preceded by an
    <a> whose name starts with "subsection", that anchor is reused and
    renumbered; otherwise a new <a> element is created.
    """
    for number, heading in enumerate(h.xpath("//" + tag), 1):
        sibling = heading.getprevious()
        reusable = (
            sibling is not None
            and sibling.tag == "a"
            and sibling.get("name", "").startswith('subsection')
        )
        anchor = sibling if reusable else heading.makeelement("a")
        anchor.set("name", "subsection%s" % number)
        heading.addprevious(anchor)
# Process every file named on the command line.
# NOTE: `fn` intentionally stays a module-level name; inline_images() reads
# it to resolve relative image paths.
for fn in sys.argv[1:]:
    # Read raw bytes (and close the file) so that lxml can work out the
    # document encoding itself instead of relying on the locale default.
    with open(fn, "rb") as source_file:
        markup = source_file.read()
    document = lxml.etree.HTML(markup)
    h = document.find("body") if document is not None else None
    if h is None:
        print("Skipping %s: no <body> element found" % fn)
        continue

    # Modify and clean up HTML
    remove_page_numbers(h)
    inline_images(h)
    # Must run before clean(): anchors are attached to <h6>, which clean()
    # renames according to tags_to_tags.
    anchor_subsections(h)
    #convert_tags(h)
    clean(h)

    # Add slick-carousel tags: <div class="page"><div class="page-content">
    outer = h.makeelement("div")
    outer.set("class", "page")
    h.tag = "div"
    h.set("class", "page-content")
    outer.append(h)

    # Write the whole thing out to a new file
    if not os.path.exists(dstdir):
        os.mkdir(dstdir)
    outpath = os.path.join(dstdir, os.path.basename(fn))
    print("Writing %s" % outpath)
    lxml.etree.ElementTree(outer).write(outpath, method="html", pretty_print=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment