xsleonard/HTML sanitization

## HTML sanitization
import re
import copy
import BeautifulSoup


class HTMLSanitizer(object):
    """Reduce an HTML document to its text content.

    The file imports the BeautifulSoup 3 *module* under the name
    ``BeautifulSoup``, so the parser class, its constants and ``Comment``
    are all reached through that module here.
    """

    # BeautifulSoup's default markup massage plus one extra rule that
    # replaces hexadecimal character references by decimal ones.
    # Only well-formed hex digits are matched, so malformed input such as
    # '&#xfoo;' is left untouched instead of making int(..., 16) raise.
    _hexentity_massage = (
        copy.copy(BeautifulSoup.BeautifulSoup.MARKUP_MASSAGE) +
        [(re.compile(r'&#[xX]([0-9a-fA-F]+);'),
          lambda m: '&#{0};'.format(int(m.group(1), 16)))])

    @classmethod
    def sanitize(cls, text):
        """Return the text of the HTML document *text*.

        Comments and the contents of <script>/<style> elements are
        dropped, all remaining tags are stripped, HTML entities are
        decoded and non-breaking spaces become regular spaces.
        """
        # parse file as html, with html entity decoding support
        parser = BeautifulSoup.BeautifulSoup
        soup = parser(text, convertEntities=parser.HTML_ENTITIES,
                      markupMassage=cls._hexentity_massage)
        # remove all <!-- -->
        cls.strip_comments(soup)
        # remove everything inside these tags
        ignore_tags = ('script', 'style',)
        cls.remove_elements(soup, tags=ignore_tags)
        # remove the <tags></tags>
        text = cls.remove_tags(soup)
        # '&nbsp;' -> u'\xa0' -> ' '
        text = text.replace(u'\xa0', ' ')
        return text

    @classmethod
    def strip_comments(cls, soup):
        """Detach every comment node from *soup* (modified in place)."""
        comments = soup.findAll(
            text=lambda t: isinstance(t, BeautifulSoup.Comment))
        for comment in comments:
            comment.extract()

    @classmethod
    def remove_elements(cls, soup, tags):
        """Detach every element named in *tags*, with its contents, from *soup*."""
        for tag in tags:
            for el in soup.findAll(tag):
                el.extract()

    @classmethod
    def remove_tags(cls, soup):
        """Return the text nodes of *soup* joined by single spaces."""
        return ' '.join(soup.findAll(text=True))

# example:
# output = HTMLSanitizer.sanitize(input)
	import re
	import copy
	import BeautifulSoup


	class HTMLSanitizer(object):
	    """Reduce an HTML document to its text content.

	    The file imports the BeautifulSoup 3 *module* under the name
	    ``BeautifulSoup``, so the parser class, its constants and ``Comment``
	    are all reached through that module here.
	    """

	    # BeautifulSoup's default markup massage plus one extra rule that
	    # replaces hexadecimal character references by decimal ones.
	    # Only well-formed hex digits are matched, so malformed input such as
	    # '&#xfoo;' is left untouched instead of making int(..., 16) raise.
	    _hexentity_massage = (
	        copy.copy(BeautifulSoup.BeautifulSoup.MARKUP_MASSAGE) +
	        [(re.compile(r'&#[xX]([0-9a-fA-F]+);'),
	          lambda m: '&#{0};'.format(int(m.group(1), 16)))])

	    @classmethod
	    def sanitize(cls, text):
	        """Return the text of the HTML document *text*.

	        Comments and the contents of <script>/<style> elements are
	        dropped, all remaining tags are stripped, HTML entities are
	        decoded and non-breaking spaces become regular spaces.
	        """
	        # parse file as html, with html entity decoding support
	        parser = BeautifulSoup.BeautifulSoup
	        soup = parser(text, convertEntities=parser.HTML_ENTITIES,
	                      markupMassage=cls._hexentity_massage)
	        # remove all <!-- -->
	        cls.strip_comments(soup)
	        # remove everything inside these tags
	        ignore_tags = ('script', 'style',)
	        cls.remove_elements(soup, tags=ignore_tags)
	        # remove the <tags></tags>
	        text = cls.remove_tags(soup)
	        # '&nbsp;' -> u'\xa0' -> ' '
	        text = text.replace(u'\xa0', ' ')
	        return text

	    @classmethod
	    def strip_comments(cls, soup):
	        """Detach every comment node from *soup* (modified in place)."""
	        comments = soup.findAll(
	            text=lambda t: isinstance(t, BeautifulSoup.Comment))
	        for comment in comments:
	            comment.extract()

	    @classmethod
	    def remove_elements(cls, soup, tags):
	        """Detach every element named in *tags*, with its contents, from *soup*."""
	        for tag in tags:
	            for el in soup.findAll(tag):
	                el.extract()

	    @classmethod
	    def remove_tags(cls, soup):
	        """Return the text nodes of *soup* joined by single spaces."""
	        return ' '.join(soup.findAll(text=True))

	# example:
	# output = HTMLSanitizer.sanitize(input)