GuoJing/fui.py

## fui.py

from BeautifulSoup import BeautifulSoup


# Tag names that sanitize_html leaves in place; every other tag is marked
# hidden by it.
VALID_TAGS = [
    'div',
    'span',
    'a',
    'p',
    'br',
    'img',
    'center',
    'b',
    'strong',
    'em',
    'i',
    'ol',
    'ul',
    'li',
    'dl',
    'dt',
    'dd',
    'table',
    'thead',
    'td',
    'tr',
    'th',
    'tbody',
    # The HTML table-footer element is spelled 'tfoot'.
    'tfoot',
    # Legacy misspelling of 'tfoot'; kept so code that relied on the old
    # list contents keeps working.
    'tfooter',
]

# Per-tag whitelist of attribute names. Tags without an entry keep no
# attributes at all.
VALID_ATTRS = dict(
    a=['href', 'target', 'title'],
    img=['src', 'title']
)

# Per-tag attributes that are always written with the given value,
# regardless of what the input markup supplied.
VALID_ATTR_VALUE = dict(
    a=dict(target='_blank')
)


# Whitelisted tags that never have children. The empty-tag filter must skip
# them, otherwise <br> and <img> could never reach the output.
_VOID_TAGS = frozenset(['br', 'img'])


def sanitize_html(value):
    """Strip disallowed tags and attributes from an HTML fragment.

    Every tag found by BeautifulSoup is processed in turn:

    * a tag whose name is not in ``VALID_TAGS`` is marked hidden;
    * a tag without contents is marked hidden too, except the void
      elements ``br`` and ``img``, which are empty by definition;
    * attributes are reduced to the names whitelisted for the tag in
      ``VALID_ATTRS``;
    * attributes listed for the tag in ``VALID_ATTR_VALUE`` are always
      emitted with the configured value, replacing any input value.

    NOTE(review): attribute *values* are copied through unchanged, so a
    whitelisted ``href``/``src`` may still carry e.g. a ``javascript:``
    URL. Validate URL schemes separately before trusting the output.

    :param value: HTML markup to clean.
    :return: whatever ``soup.renderContents()`` produces for the cleaned
        tree.
    """
    soup = BeautifulSoup(value)

    for tag in soup.findAll(True):
        # Unknown tag: hide it and skip attribute handling.
        if tag.name not in VALID_TAGS:
            tag.hidden = True
            continue

        # Empty non-void tag (e.g. "<b></b>") carries nothing worth keeping.
        if not tag.contents and tag.name not in _VOID_TAGS:
            tag.hidden = True
            continue

        allowed = VALID_ATTRS.get(tag.name, ())
        forced = VALID_ATTR_VALUE.get(tag.name, {})

        # Keep whitelisted attributes, but drop any that has a forced value
        # so it is emitted exactly once, with the configured value.
        kept = [
            (name, val)
            for name, val in tag.attrs
            if name in allowed and name not in forced
        ]
        kept.extend(forced.items())
        tag.attrs = kept

    return soup.renderContents()

if __name__ == '__main__':
    # Self-test: each entry pairs input markup with the exact output
    # expected from sanitize_html.
    cases = [
        # plain text passes through untouched
        ('hello world',
         'hello world'),
        # whitelisted tags are preserved
        ('some valid tag <b>strong</b> <i>here</i>',
         'some valid tag <b>strong</b> <i>here</i>'),
        # tags outside the whitelist disappear
        ('some invalid tag <input name="input"/>',
         'some invalid tag '),
        # empty tags disappear
        ('empty tag will be removed <b></b>',
         'empty tag will be removed '),
        # attributes outside the whitelist are stripped
        ('<p color="red">This is a content with attrs.</p>',
         '<p>This is a content with attrs.</p>'),
        # missing target gets the default
        ('<a href="someurl">This is a link with no target.</a>',
         '<a href="someurl" target="_blank">'
         'This is a link with no target.</a>'),
        # invalid target is replaced by the default
        ('<a href="someurl" target="">'
         'This is a link with invalid target.</a>',
         '<a href="someurl" target="_blank">'
         'This is a link with invalid target.</a>'),
        # a target already equal to the default stays as is
        ('<a href="someurl" target="_blank">'
         'This is a link with valid target.</a>',
         '<a href="someurl" target="_blank">'
         'This is a link with valid target.</a>'),
    ]

    for markup, expected in cases:
        assert sanitize_html(markup) == expected

    print('validated!')

	from BeautifulSoup import BeautifulSoup


	# Tag names that sanitize_html leaves in place; every other tag is marked
	# hidden by it.
	VALID_TAGS = [
	    'div',
	    'span',
	    'a',
	    'p',
	    'br',
	    'img',
	    'center',
	    'b',
	    'strong',
	    'em',
	    'i',
	    'ol',
	    'ul',
	    'li',
	    'dl',
	    'dt',
	    'dd',
	    'table',
	    'thead',
	    'td',
	    'tr',
	    'th',
	    'tbody',
	    # The HTML table-footer element is spelled 'tfoot'.
	    'tfoot',
	    # Legacy misspelling of 'tfoot'; kept so code that relied on the old
	    # list contents keeps working.
	    'tfooter',
	]

	# Per-tag whitelist of attribute names. Tags without an entry keep no
	# attributes at all.
	VALID_ATTRS = dict(
	    a=['href', 'target', 'title'],
	    img=['src', 'title']
	)

	# Per-tag attributes that are always written with the given value,
	# regardless of what the input markup supplied.
	VALID_ATTR_VALUE = dict(
	    a=dict(target='_blank')
	)


	# Whitelisted tags that never have children. The empty-tag filter must skip
	# them, otherwise <br> and <img> could never reach the output.
	_VOID_TAGS = frozenset(['br', 'img'])


	def sanitize_html(value):
	    """Strip disallowed tags and attributes from an HTML fragment.

	    Every tag found by BeautifulSoup is processed in turn:

	    * a tag whose name is not in ``VALID_TAGS`` is marked hidden;
	    * a tag without contents is marked hidden too, except the void
	      elements ``br`` and ``img``, which are empty by definition;
	    * attributes are reduced to the names whitelisted for the tag in
	      ``VALID_ATTRS``;
	    * attributes listed for the tag in ``VALID_ATTR_VALUE`` are always
	      emitted with the configured value, replacing any input value.

	    NOTE(review): attribute *values* are copied through unchanged, so a
	    whitelisted ``href``/``src`` may still carry e.g. a ``javascript:``
	    URL. Validate URL schemes separately before trusting the output.

	    :param value: HTML markup to clean.
	    :return: whatever ``soup.renderContents()`` produces for the cleaned
	        tree.
	    """
	    soup = BeautifulSoup(value)

	    for tag in soup.findAll(True):
	        # Unknown tag: hide it and skip attribute handling.
	        if tag.name not in VALID_TAGS:
	            tag.hidden = True
	            continue

	        # Empty non-void tag (e.g. "<b></b>") carries nothing worth keeping.
	        if not tag.contents and tag.name not in _VOID_TAGS:
	            tag.hidden = True
	            continue

	        allowed = VALID_ATTRS.get(tag.name, ())
	        forced = VALID_ATTR_VALUE.get(tag.name, {})

	        # Keep whitelisted attributes, but drop any that has a forced value
	        # so it is emitted exactly once, with the configured value.
	        kept = [
	            (name, val)
	            for name, val in tag.attrs
	            if name in allowed and name not in forced
	        ]
	        kept.extend(forced.items())
	        tag.attrs = kept

	    return soup.renderContents()

	if __name__ == '__main__':
	    # Self-test: run sanitize_html on sample markup and compare with the
	    # exact expected output.

	    # Plain text passes through untouched.
	    text = 'hello world'
	    assert sanitize_html(text) == 'hello world'

	    # Whitelisted tags are preserved.
	    text = 'some valid tag <b>strong</b> <i>here</i>'
	    assert sanitize_html(text) == text

	    # Tags outside the whitelist disappear.
	    text = 'some invalid tag <input name="input"/>'
	    assert sanitize_html(text) == 'some invalid tag '

	    # Empty tags disappear.
	    text = 'empty tag will be removed <b></b>'
	    assert sanitize_html(text) == 'empty tag will be removed '

	    # Attributes outside the whitelist are stripped.
	    text = '<p color="red">This is a content with attrs.</p>'
	    assert sanitize_html(text) == '<p>This is a content with attrs.</p>'

	    # A link without target gets the default target.
	    text = '<a href="someurl">This is a link with no target.</a>'
	    assert sanitize_html(text) == ('<a href="someurl" target="_blank">'
	                                   'This is a link with no target.</a>')

	    # An invalid target is replaced by the default target.
	    text = ('<a href="someurl" target="">'
	            'This is a link with invalid target.</a>')
	    assert sanitize_html(text) == ('<a href="someurl" target="_blank">'
	                                   'This is a link with invalid target.</a>')

	    # A target already equal to the default is left as is.
	    text = ('<a href="someurl" target="_blank">'
	            'This is a link with valid target.</a>')
	    assert sanitize_html(text) == ('<a href="someurl" target="_blank">'
	                                   'This is a link with valid target.</a>')

	    print('validated!')