Last active
January 3, 2016 13:49
-
-
Save vishnu-prasad-r/8472141 to your computer and use it in GitHub Desktop.
Richtext HTML Sanitizer for Django.
Many developers find it necessary to allow some HTML on form fields, especially on a RichTextField like TinyMCE or CKEditor. Django's strip_tags function does a good job removing dangerous tags when you can white-list the allowed tags. But it doesn't do anything about preventing XSS attacks using event attribu…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage:
#     import html_sanitizer
#     safe_html = html_sanitizer.secure_html(raw_html_string)
import tidy | |
import re | |
from django.template.defaultfilters import removetags | |
# Regex that locates an event-handler attribute (e.g. onMouseOver) inside a tag.
# It starts at a "<" that is not followed by "/" (so closing tags are skipped):
#   group 1 - everything from "<" up to the attribute,
#   group 2 - the whitespace character plus "on",
#   group 3 - the remainder of the tag, through the closing ">".
# Written as a raw string so "\s" reaches the regex engine as-is instead of
# being treated as an (invalid) string escape.
event_cleaner = re.compile(r"(<[^/].*?)(\son)([^>]*>)", re.I | re.U)

# Regex that locates "javascript:" inside a tag (typically in an attribute
# value such as href).  Same three-group layout as event_cleaner, with group 2
# being the literal "javascript:".
js_cleaner = re.compile(r"(<[^/].*?)(javascript:)([^>]*>)", re.I | re.U)

# Space-separated list of possibly dangerous HTML tags.
black_list = 'script object embed iframe frame style body title head frameset html link meta form input textarea select option optgroup button label fieldset legend noscript'
def secure_html(raw_html):
    """Return *raw_html* with blacklisted tags and script hooks neutralised.

    The markup is normalised with tidy, stripped of every tag named in
    ``black_list`` via Django's ``removetags``, normalised again, then passed
    through ``event_cleaner`` and ``js_cleaner`` so that event-handler
    attributes and ``javascript:`` values are rewritten to the inert text
    "clean".  A final tidy + ``removetags`` round runs on the rewritten
    markup, and all newline characters are removed from the result.

    raw_html -- the untrusted HTML string to sanitise.

    Returns the sanitised markup as a single-line string.

    NOTE(review): regex-based filtering is a best-effort defence; confirm it
    is adequate for the deployment, or pair it with an allow-list sanitizer.
    NOTE(review): ``removetags`` is believed to be gone from recent Django
    releases - verify against the Django version in use.
    """
    options = dict(
        output_xhtml=True,
        add_xml_decl=True,
        indent=True,
        tidy_mark=False,
        show_body_only=True,
        char_encoding="utf8",
    )

    def _tidy(markup):
        # Re-serialise through tidy so malformed markup is repaired.
        return str(tidy.parseString(markup, **options))

    def _neutralise(pattern, replacement, markup):
        # Every match swallows the rest of its tag (group 3 runs to ">"), so a
        # single sub() pass rewrites only the first occurrence in each tag.
        # Repeat until no match is left; each pass removes at least one
        # occurrence and the replacement text cannot create a new one, so the
        # loop terminates.
        while True:
            markup, hits = pattern.subn(replacement, markup)
            if not hits:
                return markup

    # Fix malformed HTML, then drop the blacklisted tags.
    cleaned = removetags(_tidy(raw_html), black_list)
    # Repair anything the removal broke; guards against markup that was
    # deliberately malformed to smuggle a payload past the first pass.
    cleaned = _tidy(cleaned)
    cleaned = _neutralise(event_cleaner, '\\1clean\\3', cleaned)
    cleaned = _neutralise(js_cleaner, '\\1 clean\\3', cleaned)
    # Final normalisation and blacklist pass over the rewritten markup.
    cleaned = removetags(_tidy(cleaned), black_list)
    # tidy sometimes adds newline characters inside inline styles.
    return cleaned.replace('\n', '')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment