Skip to content

Instantly share code, notes, and snippets.

@xsleonard
Created May 20, 2013 16:20
Show Gist options
  • Save xsleonard/5613351 to your computer and use it in GitHub Desktop.
Save xsleonard/5613351 to your computer and use it in GitHub Desktop.
import re
import copy
import BeautifulSoup
class HTMLSanitizer(object):
    """Reduce an HTML document to its text content.

    The markup is parsed with BeautifulSoup 3, comments and the contents of
    ``<script>``/``<style>`` elements are dropped, and the remaining text
    nodes are joined with single spaces.

    All BeautifulSoup names are reached through the module object
    (``BeautifulSoup.BeautifulSoup``, ``BeautifulSoup.Comment``) because the
    file does ``import BeautifulSoup``, which binds the module rather than
    the parser class.
    """

    # The parser's stock markup fix-ups plus one extra rule that rewrites
    # hexadecimal character references (``&#x41;``) as decimal ones
    # (``&#65;``).  Only real hex digits are matched, so a malformed
    # reference such as ``&#xZZ;`` is left alone instead of making int()
    # raise ValueError while sanitizing.
    _hexentity_massage = (
        copy.copy(BeautifulSoup.BeautifulSoup.MARKUP_MASSAGE) +
        [(re.compile(r'&#[xX]([0-9a-fA-F]+);'),
          lambda m: '&#{0};'.format(int(m.group(1), 16)))])

    @classmethod
    def sanitize(cls, text):
        """Return the text content of the HTML string ``text``.

        Entities are decoded, comments and script/style bodies are removed,
        tags are stripped, and non-breaking spaces become plain spaces.
        """
        # Parse as HTML, with HTML entity decoding support.
        soup = BeautifulSoup.BeautifulSoup(
            text,
            convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES,
            markupMassage=cls._hexentity_massage)
        # Remove all <!-- ... --> nodes.
        cls.strip_comments(soup)
        # Remove these elements together with everything inside them.
        ignore_tags = ('script', 'style',)
        cls.remove_elements(soup, tags=ignore_tags)
        # Drop the <tags></tags>, keeping only their text.
        text = cls.remove_tags(soup)
        # '&nbsp;' -> u'\xa0' -> ' '
        text = text.replace(u'\xa0', ' ')
        return text

    @classmethod
    def strip_comments(cls, soup):
        """Detach every comment node from ``soup`` in place."""
        comments = soup.findAll(
            text=lambda t: isinstance(t, BeautifulSoup.Comment))
        for comment in comments:
            comment.extract()

    @classmethod
    def remove_elements(cls, soup, tags):
        """Detach every element named in ``tags`` from ``soup`` in place."""
        for tag in tags:
            # findAll returns a list built up front, so extracting while
            # looping over it is safe.
            for el in soup.findAll(tag):
                el.extract()

    @classmethod
    def remove_tags(cls, soup):
        """Return all text nodes of ``soup`` joined by single spaces."""
        return ' '.join(soup.findAll(text=True))
# example:
# output = HTMLSanitizer.sanitize(input)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment