Skip to content

Instantly share code, notes, and snippets.

@FZambia
Created May 27, 2013 05:35
Show Gist options
  • Save FZambia/5655338 to your computer and use it in GitHub Desktop.
lxml html sanitize and autolink
from lxml.html import clean
class Bleacher(clean.Cleaner):
    """``clean.Cleaner`` subclass with an explicit attribute whitelist.

    The class only overrides configuration attributes; all cleaning logic
    is inherited from ``clean.Cleaner``.
    """

    # Turn on attribute filtering; ``safe_attrs`` below is the whitelist.
    # NOTE(review): exact filtering semantics live in lxml's Cleaner --
    # confirm against the installed lxml version.
    safe_attrs_only = True

    # Attribute names allowed to remain on cleaned elements, grouped by
    # first letter for readability (set membership is order-independent).
    safe_attrs = frozenset((
        'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
        'alt', 'axis',
        'border',
        'cellpadding', 'cellspacing', 'char', 'charoff', 'charset',
        'checked', 'cite', 'class', 'clear', 'cols', 'colspan', 'color',
        'compact', 'coords',
        'datetime', 'dir', 'disabled',
        'enctype',
        'for', 'frame',
        'headers', 'height', 'href', 'hreflang', 'hspace',
        'ismap',
        'label', 'lang', 'longdesc',
        'maxlength', 'media', 'method', 'multiple',
        'name', 'nohref', 'noshade', 'nowrap',
        'prompt',
        'readonly', 'rel', 'rev', 'rows', 'rowspan', 'rules',
        'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
        'summary',
        'tabindex', 'target', 'title', 'type',
        'usemap',
        'valign', 'value', 'vspace',
        'width',
    ))

    # Tag name -> attribute that carries that tag's URL.
    # NOTE(review): presumably replaces the private mapping of the same
    # name on lxml's Cleaner (used for host-whitelist checks) -- verify
    # against the lxml source before relying on it.
    _tag_link_attrs = {
        'iframe': 'src',
        'embed': 'src',
        'a': 'href',
    }
def clean_html(html, host_whitelist=()):
    """Sanitize *html* with ``Bleacher`` and then autolink the result.

    Args:
        html: Markup to clean. It is concatenated between the string
            literals ``'<body>'`` and ``'</body>'`` before cleaning.
        host_whitelist: Forwarded unchanged to the ``Bleacher``
            constructor as its ``host_whitelist`` keyword argument.

    Returns:
        The value returned by ``clean.autolink_html`` for the cleaned
        markup.
    """
    sanitizer = Bleacher(host_whitelist=host_whitelist)
    # Wrap the input in a <body> element before handing it to the cleaner.
    wrapped = '<body>' + html + '</body>'
    return clean.autolink_html(sanitizer.clean_html(wrapped))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment