Skip to content

Instantly share code, notes, and snippets.

@davidfraser
Created March 19, 2013 08:24
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save davidfraser/5194475 to your computer and use it in GitHub Desktop.
Save davidfraser/5194475 to your computer and use it in GitHub Desktop.
Clean up XHTML by removing extraneous things - in particular those generated by copying and pasting out of Microsoft Office products
import cssutils
from xml.sax import saxutils
from lxml.html import tostring, fromstring, clean
from lxml import etree
import logging
try:
    _TEXT_TYPE = unicode  # Python 2 text type  # noqa: F821
except NameError:
    _TEXT_TYPE = str  # Python 3: str is the unicode text type


class Cleaner(clean.Cleaner):
    """lxml ``Cleaner`` that additionally tidies classes, styles and spans.

    On top of whatever the base cleaner is configured to remove, this
    subclass strips every ``class`` attribute, re-serializes each ``style``
    attribute through cssutils (dropping it when it cannot be parsed or
    serializes to nothing), and unwraps ``<span>`` tags that carry no
    attributes.
    """

    def clean_html(self, html):
        """Return a cleaned, XML-serialized copy of the fragment *html*.

        Args:
            html: The HTML fragment to clean, as a unicode string.

        Returns:
            The cleaned fragment: the escaped leading text (if any) followed
            by each top-level element serialized with ``method='xml'``.

        Raises:
            ValueError: If *html* is not a unicode string.
        """
        if not isinstance(html, _TEXT_TYPE):
            raise ValueError('We only support cleaning unicode HTML fragments')

        # We wrap the content up in an extra div tag (otherwise lxml does
        # weird things to it - like adding in <p> tags and stuff).
        divnode = fromstring(u'<div>' + html + u'</div>')
        # Run the base cleaner (its __call__) in place on the parsed tree.
        self(divnode)

        # Strip all class attributes.
        etree.strip_attributes(divnode, 'class')

        style_attrs = divnode.xpath("//@style")
        if style_attrs:
            # These serializer preferences are global to cssutils and do not
            # vary per attribute, so set them once rather than per iteration.
            # Empty line separator so the style is serialized without line
            # breaks (it has to fit inside an attribute value).
            cssutils.ser.prefs.lineSeparator = ''
            # Only allow valid style properties.
            cssutils.ser.prefs.validOnly = True
        for style in style_attrs:
            # The attribute result exposes the element that owns it.
            parent = style.getparent()
            try:
                css_style = cssutils.parseStyle(style)
            except Exception as e:
                # Best effort: an unparseable style is dropped, not fatal.
                logging.info("Style %s failed to parse with error %s.", style, e)
                parent.attrib.pop('style', None)
                continue
            new_style = css_style.cssText
            if new_style.strip():
                parent.attrib['style'] = new_style
            else:
                parent.attrib.pop('style', None)

        # Drop all span tags that have no attributes left (drop_tag removes
        # only the tag itself).
        for span_tag in divnode.xpath("//span"):
            if not span_tag.keys():
                span_tag.drop_tag()

        # Now unwrap the divnode (i.e. just serialize the children of our
        # extra div node). Leading text is not an element, so escape it by
        # hand; each child is serialized by lxml.
        parts = [saxutils.escape(divnode.text)] if divnode.text else []
        parts.extend(
            tostring(child, encoding=_TEXT_TYPE, method='xml')
            for child in divnode
        )
        return ''.join(parts)
# Shared module-level Cleaner instance used by the clean_html alias below.
# We need safe_attrs_only set to False, otherwise it strips out style attributes completely
cleaner = Cleaner(safe_attrs_only=False)
# Convenience alias: expose the shared instance's bound method as a plain
# module-level callable, so callers can write clean_html(fragment).
clean_html = cleaner.clean_html
@scm
Copy link

scm commented Jun 12, 2013

Really useful.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment