Skip to content

Instantly share code, notes, and snippets.

@jmoiron
Created March 9, 2012 21:50
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jmoiron/2008908 to your computer and use it in GitHub Desktop.
Save jmoiron/2008908 to your computer and use it in GitHub Desktop.
lxml Cleaner with attribute whitelist
#!/usr/bin/env python
from lxml.html.clean import Cleaner
from lxml.html import defs
class AttrWhitelistCleaner(Cleaner):
    """HTML ``Cleaner`` that filters element attributes against a whitelist.

    The whitelist is taken from the ``attr_whitelist`` keyword argument.
    When that argument is omitted, a set built from ``defs.safe_attrs`` is
    used instead.  All remaining keyword arguments are forwarded to
    ``Cleaner.__init__`` untouched.
    """

    def __init__(self, **kw):
        # Take ``attr_whitelist`` out of ``kw`` before delegating, so the
        # parent constructor never sees this extra option.
        if 'attr_whitelist' in kw:
            self.attr_whitelist = kw.pop('attr_whitelist')
        else:
            self.attr_whitelist = set(defs.safe_attrs)
        super(AttrWhitelistCleaner, self).__init__(**kw)

    def __call__(self, doc):
        """Clean ``doc`` in place, then drop non-whitelisted attributes.

        ``doc`` may be an element, or any object exposing ``getroot()``
        (in which case the root it returns is the one traversed).
        Nothing is returned.
        """
        # Turn the parent's ``safe_attrs_only`` option off on every call
        # before delegating; attribute filtering is done by the whitelist
        # pass below instead.
        self.safe_attrs_only = False
        super(AttrWhitelistCleaner, self).__call__(doc)

        root = doc.getroot() if hasattr(doc, 'getroot') else doc
        allowed = self.attr_whitelist
        for element in root.iter():
            attributes = element.attrib
            for name in attributes.keys():
                if name in allowed:
                    continue
                del attributes[name]
# Example usage: start from a copy of the default safe attributes, allow
# ``flashvars`` in addition, and build a cleaner around that whitelist.
example_whitelist = set(defs.safe_attrs)
example_whitelist.add('flashvars')
example_cleaner = AttrWhitelistCleaner(attr_whitelist=example_whitelist)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment