Created
May 20, 2013 16:20
-
-
Save xsleonard/5613351 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import copy | |
import BeautifulSoup | |
class HTMLSanitizer(object):
    """Reduce an HTML document to its visible text.

    Written against the ``BeautifulSoup`` module imported at the top of
    this file (the ``convertEntities`` / ``markupMassage`` keyword API).
    All names are reached through the module object, i.e.
    ``BeautifulSoup.BeautifulSoup`` for the parser class and
    ``BeautifulSoup.Comment`` for comment nodes.
    """

    # Extra "markup massage" rule applied to the raw markup before parsing:
    # rewrite hexadecimal character references (&#x41;) as decimal ones
    # (&#65;). Only well-formed hex digits are matched, so a malformed
    # reference such as "&#xZZ;" is left untouched instead of making
    # int(..., 16) raise ValueError.
    _hexentity_massage = (
        copy.copy(BeautifulSoup.BeautifulSoup.MARKUP_MASSAGE) +
        [(re.compile(r'&#[xX]([0-9a-fA-F]+);'),
          lambda m: '&#{0};'.format(int(m.group(1), 16)))]
    )

    @classmethod
    def sanitize(cls, text):
        """Return the text content of the HTML string ``text``.

        Comments and the contents of <script>/<style> elements are
        dropped, the remaining text nodes are joined with single spaces,
        and non-breaking spaces are replaced by ordinary spaces.
        """
        # parse as html, with html entity decoding support
        soup = BeautifulSoup.BeautifulSoup(
            text,
            convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES,
            markupMassage=cls._hexentity_massage)
        # remove all <!-- -->
        cls.strip_comments(soup)
        # remove everything inside these tags
        ignore_tags = ('script', 'style',)
        cls.remove_elements(soup, tags=ignore_tags)
        # remove the <tags></tags>, keeping only their text
        text = cls.remove_tags(soup)
        # '&nbsp;' -> u'\xa0' -> ' '
        text = text.replace(u'\xa0', ' ')
        return text

    @classmethod
    def strip_comments(cls, soup):
        """Detach every comment node from ``soup`` (modifies it in place)."""
        comments = soup.findAll(
            text=lambda t: isinstance(t, BeautifulSoup.Comment))
        for comment in comments:
            comment.extract()

    @classmethod
    def remove_elements(cls, soup, tags):
        """Detach every element named in ``tags``, including its contents.

        ``tags`` is an iterable of tag names; ``soup`` is modified in place.
        """
        for tag in tags:
            for el in soup.findAll(tag):
                el.extract()

    @classmethod
    def remove_tags(cls, soup):
        """Return all text nodes left in ``soup`` joined by single spaces."""
        return ' '.join(soup.findAll(text=True))
# example:
# output = HTMLSanitizer.sanitize(html_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment