Skip to content

Instantly share code, notes, and snippets.

@svetlyak40wt
Created October 11, 2010 10:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save svetlyak40wt/620319 to your computer and use it in GitHub Desktop.
Save svetlyak40wt/620319 to your computer and use it in GitHub Desktop.
XHTML cleaner, based on Cleaner from lxml.html.
"""
XHTML cleaner, based on Cleaner from lxml.html.
"""
import copy
from lxml.html import clean
from lxml.html import tostring, fromstring, bytes
def _transform_result(typ, result):
"""Convert the result back into the input type.
"""
if issubclass(typ, bytes):
return tostring(result, encoding = 'utf-8', method = 'xml')
elif issubclass(typ, unicode):
return tostring(result, encoding = unicode, method = 'xml')
else:
return result
class Cleaner(clean.Cleaner):
    """Cleaner whose ``clean_html`` serializes string results as XML/XHTML."""

    def clean_html(self, html):
        """Clean ``html`` and return the result in the type it was given as.

        :param html: markup as a byte string or text string, or an
            already-parsed document object.
        :returns: whatever ``_transform_result`` produces for the input's
            type: a string of the same kind for string input, otherwise
            the cleaned copy of the document.
        """
        result_type = type(html)
        # ``basestring`` only exists on Python 2.  Checking against
        # ``bytes`` plus the text type (``type(u'')``) is equivalent there
        # and also works on Python 3.
        if isinstance(html, (bytes, type(u''))):
            doc = fromstring(html)
        else:
            # Work on a deep copy so the caller's document is not modified.
            doc = copy.deepcopy(html)
        # The return value is ignored: the cleaner is applied to ``doc``
        # itself, and ``doc`` is what gets converted and returned below.
        self(doc)
        return _transform_result(result_type, doc)
# Shared module-level cleaner instance, constructed with no arguments.
cleaner = Cleaner()
# Convenience alias: lets callers use ``clean_html(markup)`` directly; it is
# the bound method of the shared ``cleaner`` instance.
clean_html = cleaner.clean_html
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment