# yszou/clean.py

## clean.py
# -*- coding: utf-8 -*-

import re
# Case-insensitive matcher for e-mail-like tokens of the form
# local-part@domain.tld. Written as a raw string because \-, \+ and \. are
# regex escapes, not valid Python string escapes; in a plain literal they
# trigger invalid-escape warnings on Python 3. The pattern text is unchanged.
EMAIL_PATTERN = re.compile(r'[a-z0-9_\-\+\.]+@[a-z0-9_\-\.]+\.[a-z]+', re.I)
from lxml.html.clean import Cleaner
from lxml.html import defs


# Override lxml's `defs.safe_attrs` with this module's own attribute list.
# The value is a plain (mutable) set, exactly as before: the previous code
# reached `set` through a module-level `frozenset = set` alias that shadowed
# the builtin and made the call read as if it produced a frozenset.
defs.safe_attrs = set([
    'align', 'border', 'cellpadding', 'cellspacing',
    'cols', 'colspan',
    'color',
    'height',
    'rows', 'rowspan',
    'size', 'href',
    'valign', 'vspace', 'width',
    'style', 'name',
])


class EMLCleaner(Cleaner):
    """lxml ``Cleaner`` preset that additionally masks e-mail addresses.

    The class only overrides option attributes of the base ``Cleaner`` and
    post-processes the string returned by ``clean_html``.
    """

    # --- option switches turned on for the base cleaner ---
    scripts = True
    javascript = True
    comments = True
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = True

    # `style` stays off; 'style' is listed in kill_tags below instead.
    style = False

    # Tags that are removed while their content is moved up into the parent.
    remove_tags = set(['body'])
    # Tags that are removed together with their content.
    kill_tags = set(['title', 'style'])

    allow_tags = ()
    host_whitelist = ()
    whitelist_tags = set([''])

    # Attribute whitelist taken from the module-level override of
    # defs.safe_attrs.
    safe_attrs = defs.safe_attrs

    def clean_html(self, text):
        """Clean ``text`` with the base class, then mask e-mail addresses.

        Every substring matching ``EMAIL_PATTERN`` in the cleaned result is
        replaced by a single '-'.
        """
        sanitized = super(EMLCleaner, self).clean_html(text)
        return EMAIL_PATTERN.sub('-', sanitized)


if __name__ == '__main__':
    # Manual smoke test: clean the sample document and print the result.
    clean = EMLCleaner()
    # Read bytes and decode explicitly so this works on Python 2 and 3 alike;
    # the context manager guarantees the file handle is closed.
    with open('clean_test/only_body.html', 'rb') as sample:
        data = sample.read().decode('utf8')
    # Parenthesised form prints the same single value on Python 2 and 3.
    print(clean.clean_html(data))
	# -- coding: utf-8 --

	import re
	# Case-insensitive matcher for e-mail-like tokens of the form
	# local-part@domain.tld. Written as a raw string because \-, \+ and \. are
	# regex escapes, not valid Python string escapes; in a plain literal they
	# trigger invalid-escape warnings on Python 3. The pattern text is unchanged.
	EMAIL_PATTERN = re.compile(r'[a-z0-9_\-\+\.]+@[a-z0-9_\-\.]+\.[a-z]+', re.I)
	from lxml.html.clean import Cleaner
	from lxml.html import defs


	# Override lxml's `defs.safe_attrs` with this module's own attribute list.
	# The value is a plain (mutable) set, exactly as before: the previous code
	# reached `set` through a module-level `frozenset = set` alias that shadowed
	# the builtin and made the call read as if it produced a frozenset.
	defs.safe_attrs = set([
		'align', 'border', 'cellpadding', 'cellspacing',
		'cols', 'colspan',
		'color',
		'height',
		'rows', 'rowspan',
		'size', 'href',
		'valign', 'vspace', 'width',
		'style', 'name',
	])


	class EMLCleaner(Cleaner):
		"""lxml ``Cleaner`` preset that additionally masks e-mail addresses.

		Only option attributes of the base ``Cleaner`` are overridden; the
		string returned by ``clean_html`` is post-processed with
		``EMAIL_PATTERN``.
		"""

		scripts = True
		javascript = True
		comments = True
		# `style` stays off; 'style' is listed in kill_tags below instead.
		style = False
		links = True
		meta = True
		page_structure = True
		processing_instructions = True
		embedded = True
		frames = True
		forms = True
		annoying_tags = True

		# Tags that are removed while their content is moved up into the parent.
		remove_tags = set(['body',])
		# Tags that are removed together with their content.
		kill_tags = set(['title', 'style'])
		allow_tags = ()
		remove_unknown_tags = True
		safe_attrs_only = True
		# Attribute whitelist taken from the module-level override of
		# defs.safe_attrs.
		safe_attrs = defs.safe_attrs
		add_nofollow = True
		host_whitelist = ()
		whitelist_tags = set(['',])

		def clean_html(self, text):
			"""Clean ``text`` with the base class, then mask e-mail addresses.

			Every substring matching ``EMAIL_PATTERN`` in the cleaned result
			is replaced by a single '-'.
			"""
			text = super(EMLCleaner, self).clean_html(text)
			return EMAIL_PATTERN.sub('-', text)


	if __name__ == '__main__':
		# Manual smoke test: clean the sample document and print the result.
		clean = EMLCleaner()
		# Read bytes and decode explicitly so this works on Python 2 and 3 alike;
		# the context manager guarantees the file handle is closed.
		with open('clean_test/only_body.html', 'rb') as sample:
			data = sample.read().decode('utf8')
		# Parenthesised form prints the same single value on Python 2 and 3.
		print(clean.clean_html(data))