Created
October 11, 2010 10:23
-
-
Save svetlyak40wt/620319 to your computer and use it in GitHub Desktop.
XHTML cleaner, based on Cleaner from lxml.html.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
XHTML cleaner, based on Cleaner from lxml.html. | |
""" | |
import copy | |
from lxml.html import clean | |
from lxml.html import tostring, fromstring, bytes | |
def _transform_result(typ, result):
    """Convert the cleaned result back into the caller's input type.

    Args:
        typ: The type of the object that was originally passed in for
            cleaning (e.g. a byte-string type, a text-string type, or
            something else such as an element class).
        result: The cleaned document that should be handed back.

    Returns:
        * UTF-8 encoded output of ``tostring(..., method='xml')`` when
          ``typ`` is a subclass of ``bytes``;
        * text output of ``tostring(..., method='xml')`` when ``typ`` is a
          subclass of the text type (``unicode`` on Python 2, ``str`` on
          Python 3);
        * ``result`` itself, untouched, for every other type.
    """
    # ``unicode`` exists only on Python 2.  Resolve the text type lazily so
    # the function no longer raises NameError on Python 3, while behaving
    # exactly as before on Python 2.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    if issubclass(typ, bytes):
        return tostring(result, encoding='utf-8', method='xml')
    if issubclass(typ, text_type):
        # Passing the text type itself as ``encoding`` asks ``tostring`` for
        # a text (not byte) result, mirroring the original ``encoding=unicode``.
        return tostring(result, encoding=text_type, method='xml')
    return result
class Cleaner(clean.Cleaner):
    """``lxml.html.clean.Cleaner`` variant that returns XHTML.

    Only ``clean_html`` is overridden; its result is converted with this
    module's ``_transform_result``, which serializes with ``method='xml'``.
    """

    def clean_html(self, html):
        """Clean ``html`` and return the result in the same type as the input.

        Args:
            html: Either a string (parsed with ``fromstring``) or any other
                object, which is deep-copied before cleaning -- presumably
                an lxml element/tree; confirm against callers.

        Returns:
            Whatever ``_transform_result`` produces for the input's type:
            a serialized string for string input, otherwise the cleaned
            copy itself.
        """
        # ``basestring`` exists only on Python 2.  Fall back to the Python 3
        # string types so the isinstance check does not raise NameError.
        try:
            string_types = basestring
        except NameError:
            string_types = (str, bytes)

        result_type = type(html)
        if isinstance(html, string_types):
            doc = fromstring(html)
        else:
            # Work on a copy so the object passed in by the caller is not
            # the one handed to the cleaner.
            doc = copy.deepcopy(html)

        # Calling the instance applies the inherited cleaner to ``doc``.
        self(doc)
        return _transform_result(result_type, doc)
# Module-level cleaner instance, created with the default Cleaner options.
cleaner = Cleaner()

# Convenience alias: the bound ``clean_html`` method of the instance above,
# so callers can use ``clean_html(...)`` directly from this module.
clean_html = cleaner.clean_html
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment