ZoeyYoung/gist:6078034

## gistfile1.py
import re

# Regex fragments naming the attributes to strip from every tag.
# 'on[a-z]+' covers event handlers (onclick, onload, ...); the previous
# 'on*' was a glob written as a regex and only matched "o", "on", "onn", ...
bad_attrs = ['width', 'height', 'style', '[-a-z]*color',
             'background[-a-z]*', 'on[a-z]+']
# Quoted values may be empty (style=""), hence '*' rather than '+'.
single_quoted = "'[^']*'"
double_quoted = '"[^"]*"'
# An unquoted value runs up to the next space, quote or '>'.
non_space = '[^ "\'>]+'
# Source of the pattern: one tag containing one undesirable attribute.
# Group 1 is the text before the attribute, group 2 the text after it, so
# substituting r'<\1\2>' rebuilds the tag without that attribute.
# NOTE: this is a plain regex, not an HTML parser; text inside another
# attribute's quoted value that looks like ` width=5` is matched as well.
cstr = (
    "<"  # tag open
    "([^>]+) "  # group 1: tag name and whatever precedes the attribute
    "(?:%s) *" % ('|'.join(bad_attrs),)  # undesirable attribute name
    + '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted)  # value
    + "([^>]*)"  # group 2: remainder of the tag
    + ">"  # tag close
)
# Compiled once at import time; attribute names are matched case-insensitively.
htmlstrip = re.compile(cstr, re.I)


def clean_attributes(html):
    """Strip the unwanted attributes (those listed in ``bad_attrs``) from HTML tags.

    Example::

        <div id="main" class="content" style="font-size:18px;">content</div>

    becomes::

        <div id="main" class="content">content</div>
    """
    # A match of ``htmlstrip`` spans a whole tag, so one substitution pass
    # drops at most one attribute per tag; repeat until a pass changes nothing.
    replaced = 1
    while replaced:
        html, replaced = htmlstrip.subn(r'<\1\2>', html)
    return html
	import re

	# Regex fragments naming the attributes to strip from every tag.
	# The '*' quantifiers and the plain '|' separators below were lost or
	# escaped when this copy was rendered; they are restored here.
	# 'on[a-z]+' covers event handlers (onclick, onload, ...).
	bad_attrs = ['width', 'height', 'style', '[-a-z]*color',
		'background[-a-z]*', 'on[a-z]+']
	# Quoted values may be empty (style=""), hence '*' rather than '+'.
	single_quoted = "'[^']*'"
	double_quoted = '"[^"]*"'
	# An unquoted value runs up to the next space, quote or '>'.
	non_space = '[^ "\'>]+'
	# Source of the pattern: one tag containing one undesirable attribute.
	# Group 1 is the text before the attribute, group 2 the text after it, so
	# substituting r'<\1\2>' rebuilds the tag without that attribute.
	cstr = (
		"<"  # tag open
		"([^>]+) "  # group 1: tag name and whatever precedes the attribute
		"(?:%s) *" % ('|'.join(bad_attrs),)  # undesirable attribute name
		+ '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted)  # value
		+ "([^>]*)"  # group 2: remainder of the tag
		+ ">"  # tag close
	)
	# Compiled once; attribute names are matched case-insensitively.
	htmlstrip = re.compile(cstr, re.I)


	def clean_attributes(html):
		"""Remove the unwanted attributes (those listed in ``bad_attrs``) from HTML tags.

		Example::

			<div id="main" class="content" style="font-size:18px;">content</div>

		becomes::

			<div id="main" class="content">content</div>
		"""
		# Each substitution pass removes at most one attribute per tag, so
		# keep substituting for as long as the pattern still matches.
		while htmlstrip.search(html):
			html = htmlstrip.sub(r'<\1\2>', html)
		return html