
Clean up XHTML by removing extraneous markup - in particular the cruft generated by copying and pasting out of Microsoft Office products

OfficeCleaner.py
import logging

import cssutils
from xml.sax import saxutils

from lxml import etree
# NOTE: on lxml >= 5.2 the clean module ships separately as the
# lxml_html_clean package; the import below then needs that installed.
from lxml.html import tostring, fromstring, clean


class Cleaner(clean.Cleaner):
    def clean_html(self, html):
        if not isinstance(html, str):
            raise ValueError('We only support cleaning unicode HTML fragments')

        # Wrap the content in an extra div tag, otherwise lxml does weird
        # things to it - like adding in <p> tags and such.
        divnode = fromstring('<div>' + html + '</div>')
        self(divnode)

        # Strip all class attributes
        etree.strip_attributes(divnode, 'class')

        # Set the line separator so that each style serializes on one line,
        # and only allow valid style properties.
        cssutils.ser.prefs.lineSeparator = ''
        cssutils.ser.prefs.validOnly = True

        for style in divnode.xpath("//@style"):
            parent = style.getparent()
            try:
                cssStyle = cssutils.parseStyle(style)
            except Exception as e:
                logging.info("Style %s failed to parse with error %s.", style, e)
                parent.attrib.pop('style', None)
                continue

            new_style = cssStyle.cssText
            if not new_style.strip():
                parent.attrib.pop('style', None)
            else:
                parent.attrib['style'] = new_style

        # Drop all empty span tags (the tag goes, its children stay)
        for span_tag in divnode.xpath("//span"):
            if not span_tag.keys():
                span_tag.drop_tag()

        # Now unwrap the divnode (i.e. just serialize the children of our
        # extra div node)
        cleaned = saxutils.escape(divnode.text) if divnode.text else ''
        for n in divnode:
            cleaned += tostring(n, encoding=str, method='xml')
        return cleaned


# We need safe_attrs_only set to False, otherwise it strips out style
# attributes completely
cleaner = Cleaner(safe_attrs_only=False)
clean_html = cleaner.clean_html
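
A minimal usage sketch - the pasted fragment below is a hypothetical example of typical Word markup, and the exact serialization may vary slightly by cssutils version:

pasted = ('<p class="MsoNormal" style="mso-bidi-font-weight: bold; color: red">'
          'Hello <span>world</span>, from Word!</p>')
print(clean_html(pasted))
# Expected output along these lines - class gone, invalid mso-* style
# property dropped, attribute-less span unwrapped:
# <p style="color: red">Hello world, from Word!</p>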

Really useful.
