Skip to content

Instantly share code, notes, and snippets.

@virusdefender
Last active August 29, 2015 14:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save virusdefender/5ec5e1be9738d87122e0 to your computer and use it in GitHub Desktop.
Save virusdefender/5ec5e1be9738d87122e0 to your computer and use it in GitHub Desktop.
#coding=utf-8
import re
from BeautifulSoup import BeautifulSoup
# Compiled-pattern cache: maps a regex source string to its compiled object.
regex_cache = {}


def search(text, regex):
    """Search *text* for *regex*, compiling each distinct pattern only once.

    Args:
        text: the string to scan.
        regex: the pattern source; it doubles as the key in ``regex_cache``.

    Returns:
        The match object from ``pattern.search(text)``, or ``None`` when the
        pattern does not match anywhere in *text*.
    """
    try:
        pattern = regex_cache[regex]
    except KeyError:
        # First time this pattern is seen: compile it and remember it.
        pattern = regex_cache[regex] = re.compile(regex)
    return pattern.search(text)
# XSS whitelist.
# Maps each permitted tag name to its attribute rules; an attribute rule maps
# an attribute name to the regex its value is checked against by parsehtml().
# An empty rule dict means the tag is allowed but none of its attributes are.
# Note that the URL rules only accept values starting with "http://".
VALID_TAGS = {
    'h1': {},
    'h2': {},
    'h3': {},
    'h4': {},
    'strong': {},
    'em': {},
    'p': {},
    'ul': {},
    'li': {},
    'br': {},
    'a': {
        'href': '^http://',
        'title': '.*',
    },
    'img': {
        'src': '^http://',
        'alt': '.*',
    },
}
def parsehtml(html):
    """Sanitize an HTML fragment against the ``VALID_TAGS`` whitelist.

    Every tag found in the parsed document is checked:

    * a tag whose name is not a key of ``VALID_TAGS`` is marked
      ``hidden``; its child tags are still visited by this loop;
    * for a whitelisted tag, each attribute is deleted unless its name is
      listed in the tag's rules AND its value matches the rule's regex.

    Args:
        html: the markup to sanitize.

    Returns:
        The sanitized markup as produced by ``soup.renderContents()``.
    """
    soup = BeautifulSoup(html)
    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.hidden = True
            continue

        attr_rules = VALID_TAGS[tag.name]
        # Iterate over a snapshot of the attributes.  ``del tag[name]``
        # removes entries from ``tag.attrs`` itself, and deleting from the
        # list being iterated makes the loop skip the attribute that follows
        # each deleted one -- which let e.g. a second event handler survive.
        for attr_name, attr_value in list(tag.attrs):
            # Check the attribute name.
            if attr_name not in attr_rules:
                del tag[attr_name]
                continue
            # Check the attribute value format.  A None value (presumably
            # possible for valueless attributes with some parsers -- TODO
            # confirm) cannot match any rule, so it is dropped instead of
            # raising inside the regex search.
            if attr_value is None or not search(attr_value,
                                                attr_rules[attr_name]):
                del tag[attr_name]
    return soup.renderContents()
# Demo input with three XSS probes: an anchor carrying an onmouseover handler,
# a malformed IMG tag wrapping a SCRIPT element, and an IMG with an onerror
# handler.
text = '''
<a onmouseover=alert(document.cookie)>xxs link</a>dsd'/>
<IMG """><SCRIPT>alert("XSS")</SCRIPT>">
<IMG SRC=/ onerror="alert(String.fromCharCode(88,83,83))"></img>
'''

# Show the sanitized result.  The single-argument parenthesized form prints
# exactly what the Python 2 print statement would.
print(parsehtml(text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment