Skip to content

Instantly share code, notes, and snippets.

@un1t
Created January 30, 2015 11:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save un1t/6fabf38868e5ff4c8ee5 to your computer and use it in GitHub Desktop.
Save un1t/6fabf38868e5ff4c8ee5 to your computer and use it in GitHub Desktop.
html sanitizer
# -*- coding: utf-8 -*-
import re
from html5lib import HTMLParser
from html5lib.tokenizer import HTMLTokenizer
from html5lib.sanitizer import HTMLSanitizerMixin
class WysiwygSanitizerMixin(HTMLSanitizerMixin):
    """Narrow whitelist configuration for WYSIWYG-editor markup.

    Only the class attributes below are defined here; all sanitizing logic
    comes from ``HTMLSanitizerMixin``.
    NOTE(review): presumably these names replace the base mixin's default
    whitelists of the same name -- confirm against the html5lib version in use.
    """

    # Inline formatting tags and anchors are the only elements listed.
    allowed_elements = [
        'b',
        'i',
        'strong',
        'em',
        'strike',
        'a',
    ]

    # The link target is the only attribute listed.
    allowed_attributes = [
        'href',
    ]

    # Nothing is listed for CSS or SVG; each name gets its own empty list.
    allowed_css_properties = list()
    allowed_css_keywords = list()
    allowed_svg_properties = list()
class WysiwygSanitizer(HTMLTokenizer, WysiwygSanitizerMixin):
    """Tokenizer that runs every token through ``sanitize_token``.

    Intended to be passed to ``HTMLParser`` as its ``tokenizer`` class (see
    ``wysiwyg_sanitize``), so that the parser only ever sees tokens that
    survived sanitizing.
    """

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=False, lowercaseAttrName=False,
                 **kwargs):
        """Initialise the underlying ``HTMLTokenizer``.

        The named parameters are forwarded positionally, in the order given,
        to ``HTMLTokenizer.__init__``.  Any additional keyword arguments are
        forwarded untouched instead of being rejected by this wrapper.

        NOTE(review): some html5lib releases have ``HTMLParser`` pass
        ``parser=self`` when it instantiates the tokenizer class; without the
        ``**kwargs`` pass-through that call fails with ``TypeError`` before
        ``HTMLTokenizer`` is ever reached.  Confirm against the pinned
        html5lib version.
        """
        # Change case matching defaults as we only output lowercase html anyway
        # This solution doesn't seem ideal...
        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
                               lowercaseElementName, lowercaseAttrName,
                               **kwargs)

    def __iter__(self):
        """Yield the sanitized form of each token from ``HTMLTokenizer``.

        Tokens for which ``sanitize_token`` returns a falsy value are
        dropped from the stream.
        """
        for token in HTMLTokenizer.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token
def wysiwyg_sanitize(html):
    """Parse *html* as a fragment through ``WysiwygSanitizer`` and return
    the re-serialized markup decoded from UTF-8.
    """
    sanitizing_parser = HTMLParser(tokenizer=WysiwygSanitizer)
    fragment = sanitizing_parser.parseFragment(html)
    # NOTE(review): the decode assumes ``toxml()`` returns UTF-8 encoded
    # bytes for the tree builder in use -- confirm for the pinned html5lib.
    serialized = fragment.toxml()
    return serialized.decode('utf-8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment