@braveulysses
Created May 29, 2009 20:25
HTML sanitization using Python and BeautifulSoup
import re

from BeautifulSoup import BeautifulSoup, Comment

def sanitize(untrusted_html, additional_tags=None):
    """Strips potentially harmful tags and attributes from HTML, but preserves
    all tags in a whitelist.

    Passing the list additional_tags will add the specified tags to the
    whitelist.

    The sanitizer does NOT encode reserved characters into XML entities. It is
    up to the template code, if any, to take care of that.

    Based on the work of:
    - Tom Insam <http://jerakeen.org/blog/2008/05/sanitizing-comments-with-python/>
    - akaihola <http://www.djangosnippets.org/snippets/169/>
    """
    # Allow these tags. This can be changed to whatever you please, of course,
    # either by editing the list here or by passing additional_tags.
    tag_whitelist = [
        'a', 'abbr', 'address', 'b', 'cite', 'code', 'em', 'i', 'ins',
        'kbd', 'q', 'samp', 'small', 'strike', 'strong', 'sub', 'sup', 'var'
    ]
    if additional_tags is not None:
        tag_whitelist.extend(additional_tags)
    # Allow only these attributes on these tags. No other tags are allowed
    # any attributes. (The 'img' entry only takes effect if 'img' has been
    # added to the whitelist via additional_tags.)
    attr_whitelist = {
        'a': ['href', 'title', 'hreflang'],
        'img': ['src', 'width', 'height', 'alt', 'title']
    }
    # Remove these tags, complete with their contents.
    tag_blacklist = ['script', 'style']
    attributes_with_urls = ['href', 'src']
    soup = BeautifulSoup(untrusted_html)
    # Remove HTML comments.
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    # Remove unwanted tags.
    for tag in soup.findAll():
        if tag.name.lower() in tag_blacklist:
            # Remove blacklisted tags along with their contents.
            tag.extract()
        elif tag.name.lower() not in tag_whitelist:
            # Hide non-whitelisted tags, keeping their contents.
            tag.hidden = True
        else:
            # Iterate over a copy: removing attributes while looping over
            # tag.attrs directly would skip entries.
            for attr in tag.attrs[:]:
                # Attributes in the attr_whitelist are considered, but on
                # a per-tag basis.
                if (tag.name.lower() in attr_whitelist and
                        attr[0].lower() in attr_whitelist[tag.name.lower()]):
                    # Some attributes contain URLs...
                    if attr[0].lower() in attributes_with_urls:
                        # ...so make sure they're nice URLs.
                        if not re.match(r'(https?|ftp)://', attr[1].lower()):
                            tag.attrs.remove(attr)
                else:
                    # Non-whitelisted attributes are removed entirely.
                    tag.attrs.remove(attr)
    return unicode(soup)
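The gist above targets the legacy BeautifulSoup 3 API and Python 2. As a rough illustration of the same whitelist idea using only the standard library, here is a minimal sketch built on `html.parser` instead. The `Sanitizer` class, its whitelists, and the helper `sanitize` function are invented for this example; unlike the gist's version, it unescapes character entities into plain text rather than passing them through.

```python
import re
from html.parser import HTMLParser

# Deliberately tiny whitelists for demonstration purposes only.
TAG_WHITELIST = {'a', 'b', 'code', 'em', 'i', 'strong'}
TAG_BLACKLIST = {'script', 'style'}
ATTR_WHITELIST = {'a': ['href', 'title']}
URL_ATTRS = {'href', 'src'}

class Sanitizer(HTMLParser):
    """Re-emits only whitelisted tags/attributes; drops blacklisted
    tags with their contents; comments are dropped by default."""

    def __init__(self):
        super().__init__()
        self.out = []
        self.skip_depth = 0  # > 0 while inside a blacklisted tag

    def handle_starttag(self, tag, attrs):
        if tag in TAG_BLACKLIST:
            self.skip_depth += 1
            return
        if self.skip_depth or tag not in TAG_WHITELIST:
            return  # drop the tag itself but keep its contents
        kept = []
        for name, value in attrs:
            if name not in ATTR_WHITELIST.get(tag, []):
                continue  # non-whitelisted attributes are removed
            if name in URL_ATTRS and not re.match(
                    r'(https?|ftp)://', (value or '').lower()):
                continue  # reject javascript: and other unsafe URLs
            kept.append((name, value))
        attr_text = ''.join(' %s="%s"' % (n, v) for n, v in kept)
        self.out.append('<%s%s>' % (tag, attr_text))

    def handle_endtag(self, tag):
        if tag in TAG_BLACKLIST:
            if self.skip_depth:
                self.skip_depth -= 1
            return
        if not self.skip_depth and tag in TAG_WHITELIST:
            self.out.append('</%s>' % tag)

    def handle_data(self, data):
        if not self.skip_depth:
            self.out.append(data)

def sanitize(html):
    parser = Sanitizer()
    parser.feed(html)
    parser.close()
    return ''.join(parser.out)
```

For example, `sanitize('<p>Hi <script>evil()</script><em>there</em></p>')` returns `'Hi <em>there</em>'`, and a `javascript:` href on an `<a>` tag is stripped while its `title` attribute survives.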
@tudormunteanu
Thanks for sharing!
