Last active
June 25, 2019 13:43
-
-
Save GuoJing/e2b37e109771fd8d5689 to your computer and use it in GitHub Desktop.
A very simple HTML tag and attribute filter built with BeautifulSoup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from BeautifulSoup import BeautifulSoup | |
# Whitelist of tag names that are allowed to appear in sanitized output.
# Any tag whose name is not listed here is treated as invalid.
VALID_TAGS = [
    'div',
    'span',
    'a',
    'p',
    'br',
    'img',
    'center',
    'b',
    'strong',
    'em',
    'i',
    'ol',
    'ul',
    'li',
    'dl',
    'dt',
    'dd',
    'table',
    'thead',
    'td',
    'tr',
    'th',
    'tbody',
    # The HTML table footer element is <tfoot>; "tfooter" is not a real tag.
    'tfoot',
]

# Per-tag whitelist of attribute names. Tags without an entry here keep
# no attributes at all.
VALID_ATTRS = dict(
    a=['href', 'target', 'title'],
    img=['src', 'title'],
)

# Per-tag attribute values that are always enforced: whatever value the
# input supplied for these attributes is replaced by the one given here.
VALID_ATTR_VALUE = dict(
    a=dict(target='_blank'),
)
# HTML void elements. They can never have child nodes, so an empty
# ``contents`` list does not mean the element is useless.
_VOID_TAGS = frozenset([
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'link', 'meta', 'param', 'source', 'track', 'wbr',
])


def sanitize_html(value):
    """Filter an HTML fragment down to whitelisted tags and attributes.

    For every tag found in ``value``:

    * a tag whose name is not in ``VALID_TAGS`` is marked hidden;
    * a whitelisted tag with no contents is marked hidden, unless it is a
      void element such as ``<br>`` or ``<img>``;
    * only attributes listed for the tag in ``VALID_ATTRS`` are kept;
    * attributes listed for the tag in ``VALID_ATTR_VALUE`` are always set
      to the configured value, replacing whatever the input supplied.

    :param value: the HTML markup to sanitize.
    :returns: the result of ``soup.renderContents()`` after filtering.

    NOTE(review): attribute values are passed through unchanged, so a
    value such as ``href="javascript:..."`` is not rejected. Validate URL
    schemes separately before relying on this for untrusted input.
    """
    soup = BeautifulSoup(value)
    for tag in soup.findAll(True):
        # Filter by tag name.
        if tag.name not in VALID_TAGS:
            tag.hidden = True
            continue
        # Filter tags with nothing between the opening and closing tag.
        # Void elements are exempt: they are always empty, and hiding them
        # would make their VALID_TAGS / VALID_ATTRS entries unreachable.
        if not tag.contents and tag.name not in _VOID_TAGS:
            tag.hidden = True
            continue
        allowed = VALID_ATTRS.get(tag.name, [])
        forced = VALID_ATTR_VALUE.get(tag.name, {})
        # Keep whitelisted attributes, except those whose value is forced
        # below (so the forced value is never duplicated).
        kept = [
            (att_name, att_value)
            for att_name, att_value in tag.attrs
            if att_name in allowed and att_name not in forced
        ]
        # Add the enforced attribute values, whether or not the tag
        # originally carried any attributes.
        kept.extend(forced.items())
        tag.attrs = kept
    return soup.renderContents()
if __name__ == '__main__':
    # Self-check run when the file is executed directly. Each entry pairs
    # an input fragment with the exact output sanitize_html must produce.
    _unchanged = 'some valid tag <b>strong</b> <i>here</i>'
    _cases = [
        # plain text passes through untouched
        ('hello world', 'hello world'),
        # whitelisted tags are preserved as-is
        (_unchanged, _unchanged),
        # non-whitelisted tags disappear
        ('some invalid tag <input name="input"/>', 'some invalid tag '),
        # whitelisted but empty tags disappear
        ('empty tag will be removed <b></b>', 'empty tag will be removed '),
        # non-whitelisted attributes are stripped
        ('<p color="red">This is a content with attrs.</p>',
         '<p>This is a content with attrs.</p>'),
        # a missing target is filled in with the enforced value
        ('<a href="someurl">This is a link with no target.</a>',
         '<a href="someurl" target="_blank">'
         'This is a link with no target.</a>'),
        # an invalid target is overwritten with the enforced value
        ('<a href="someurl" target="">'
         'This is a link with invalid target.</a>',
         '<a href="someurl" target="_blank">'
         'This is a link with invalid target.</a>'),
        # an already-correct target stays the same
        ('<a href="someurl" target="_blank">'
         'This is a link with valid target.</a>',
         '<a href="someurl" target="_blank">'
         'This is a link with valid target.</a>'),
    ]
    for _markup, _expected in _cases:
        assert sanitize_html(_markup) == _expected
    print('validated!')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello!
I was wondering about the license for this gist. Are we allowed to use this in our own ventures?