Skip to content

Instantly share code, notes, and snippets.

@GuoJing
Last active June 25, 2019 13:43
Show Gist options
  • Save GuoJing/e2b37e109771fd8d5689 to your computer and use it in GitHub Desktop.
Save GuoJing/e2b37e109771fd8d5689 to your computer and use it in GitHub Desktop.
A very simple HTML tag and attribute filter built with BeautifulSoup
from BeautifulSoup import BeautifulSoup
# Whitelist of tag names that sanitize_html() leaves in the output.
# Any tag whose name is not listed here is hidden when sanitizing.
VALID_TAGS = [
    # generic containers / inline
    'div',
    'span',
    'a',
    'p',
    'br',
    'img',
    'center',
    # text emphasis
    'b',
    'strong',
    'em',
    'i',
    # lists
    'ol',
    'ul',
    'li',
    'dl',
    'dt',
    'dd',
    # tables
    'table',
    'thead',
    'td',
    'tr',
    'th',
    'tbody',
    # the HTML table-footer element is <tfoot>; the former 'tfooter'
    # entry matched no real element, so genuine footers were stripped
    'tfoot',
]
# Per-tag whitelist of attribute names. sanitize_html() looks a tag up
# here by name; a tag with no entry keeps none of its attributes.
VALID_ATTRS = {
    'a': ['href', 'target', 'title'],
    'img': ['src', 'title'],
}

# Per-tag attribute values that sanitize_html() imposes on the output,
# in place of whatever value the input carried for that attribute.
VALID_ATTR_VALUE = {
    'a': {'target': '_blank'},
}
# Whitelisted tags that never have children. They must be exempt from the
# "drop empty tags" rule, otherwise every <br> and <img> would be removed
# even though both are listed in VALID_TAGS.
_VOID_TAGS = frozenset(['br', 'img'])


def sanitize_html(value):
    """Return *value* with non-whitelisted tags and attributes filtered out.

    Every tag found in the parsed markup is handled as follows:

    * a tag whose name is not in ``VALID_TAGS`` is marked hidden;
    * a tag with no contents is marked hidden, unless it is a void
      element (``br``, ``img``);
    * on every remaining tag only the attributes listed for that tag in
      ``VALID_ATTRS`` are kept, and every attribute listed for that tag
      in ``VALID_ATTR_VALUE`` is set to its forced value, replacing any
      value supplied by the input.

    :param value: the HTML markup to sanitize.
    :returns: the result of ``soup.renderContents()`` after filtering.
    """
    soup = BeautifulSoup(value)
    for tag in soup.findAll(True):
        # NOTE(review): `hidden` appears to suppress only the tag's own
        # markup, not its children -- confirm that the text inside e.g.
        # <script>/<style> is acceptable in the output.
        if tag.name not in VALID_TAGS:
            tag.hidden = True
            continue
        # Drop tags with nothing between them (e.g. "<b></b>"); void
        # elements are empty by definition and are kept.
        if not tag.contents and tag.name not in _VOID_TAGS:
            tag.hidden = True
            continue

        allowed = VALID_ATTRS.get(tag.name, ())
        forced = VALID_ATTR_VALUE.get(tag.name, {})
        # Keep whitelisted attributes, except those whose value is about
        # to be forced (skipping them here avoids emitting duplicates).
        # NOTE(review): attribute values are passed through unchanged, so
        # href/src URL schemes (e.g. "javascript:") are not checked here.
        kept = [
            (att_name, att_value)
            for att_name, att_value in tag.attrs
            if att_name in allowed and att_name not in forced
        ]
        # Forced attributes are appended after the kept ones.
        kept.extend(forced.items())
        tag.attrs = kept
    return soup.renderContents()
if __name__ == '__main__':
    # Self-check: each pair is (markup fed to sanitize_html, expected output).
    _plain = 'hello world'
    _valid = 'some valid tag <b>strong</b> <i>here</i>'
    _link_out = '<a href="someurl" target="_blank">'
    cases = [
        # plain text passes through untouched
        (_plain, 'hello world'),
        # whitelisted tags are preserved verbatim
        (_valid, _valid),
        # tags outside the whitelist are removed
        ('some invalid tag <input name="input"/>',
         'some invalid tag '),
        # tags with nothing inside are removed
        ('empty tag will be removed <b></b>',
         'empty tag will be removed '),
        # attributes outside the whitelist are removed
        ('<p color="red">This is a content with attrs.</p>',
         '<p>This is a content with attrs.</p>'),
        # links get the default target when none is given ...
        ('<a href="someurl">This is a link with no target.</a>',
         _link_out + 'This is a link with no target.</a>'),
        # ... when the given target is not the default ...
        ('<a href="someurl" target="">'
         'This is a link with invalid target.</a>',
         _link_out + 'This is a link with invalid target.</a>'),
        # ... and are unchanged when the target is already the default
        ('<a href="someurl" target="_blank">'
         'This is a link with valid target.</a>',
         _link_out + 'This is a link with valid target.</a>'),
    ]
    for markup, expected in cases:
        assert sanitize_html(markup) == expected
    print('validated!')
@cag
Copy link

cag commented Apr 15, 2016

Hello!

I was wondering about the license for this gist. Are we allowed to use this in our own ventures?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment