Skip to content

Instantly share code, notes, and snippets.

@yszou
Created December 9, 2013 11:53
Show Gist options
  • Save yszou/7871161 to your computer and use it in GitHub Desktop.
Save yszou/7871161 to your computer and use it in GitHub Desktop.
邮件内容过滤
# -*- coding: utf-8 -*-
import re
# Case-insensitive matcher for e-mail-like tokens of the form local@domain.tld.
# Written as a raw string so the regex escape is not parsed as an (invalid)
# string escape sequence; inside the character classes '.', '+' and a trailing
# '-' are literal and need no escaping.
EMAIL_PATTERN = re.compile(r'[a-z0-9_.+-]+@[a-z0-9_.-]+\.[a-z]+', re.I)
from lxml.html.clean import Cleaner
from lxml.html import defs
# Attribute whitelist used by the cleaner defined in this module.
# NOTE: this rebinds ``lxml.html.defs.safe_attrs`` itself, so the change is
# visible to any other code that reads that attribute after this module is
# imported.
# The builtin ``frozenset`` is used directly; the previous ``frozenset = set``
# alias shadowed the builtin and silently made this collection mutable.
defs.safe_attrs = frozenset([
    'align', 'border', 'cellpadding', 'cellspacing',
    'cols', 'colspan',
    'color',
    'height',
    'rows', 'rowspan',
    'size', 'href',
    'valign', 'vspace', 'width',
    'style', 'name',
])
class EMLCleaner(Cleaner):
    """``Cleaner`` subclass with preset options that also masks e-mail addresses.

    The class attributes below override options of the parent ``Cleaner``
    (see the lxml documentation for the exact meaning of each option).
    ``clean_html`` additionally replaces every ``EMAIL_PATTERN`` match in
    the cleaned result with ``'-'``.
    """

    # Options switched on.
    scripts = True
    javascript = True
    comments = True
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = True

    # Options switched off.
    style = False

    # Additional tags to strip; their content is moved up into the parent node.
    remove_tags = {'body'}
    # Tags to drop together with their content.
    kill_tags = {'title', 'style'}
    allow_tags = ()

    # Attribute whitelist, taken from lxml.html.defs at class-creation time.
    safe_attrs = defs.safe_attrs

    host_whitelist = ()
    whitelist_tags = {''}

    def clean_html(self, text):
        """Clean ``text`` via the parent class, then mask e-mail addresses.

        Returns the parent's cleaned result with every ``EMAIL_PATTERN``
        match replaced by ``'-'``.
        NOTE(review): the substitution assumes the parent returns a string
        for the given input -- confirm for non-string inputs.
        """
        cleaned = super(EMLCleaner, self).clean_html(text)
        masked = EMAIL_PATTERN.sub('-', cleaned)
        return masked
if __name__ == '__main__':
    # Manual smoke test: clean a sample HTML file and print the result.
    import io  # local import; only the script entry point needs it

    clean = EMLCleaner()
    # io.open decodes UTF-8 while reading (same call on Python 2 and 3), and
    # the ``with`` block closes the file handle that a bare open() left open.
    with io.open('clean_test/only_body.html', 'r', encoding='utf8') as sample:
        data = sample.read()
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(clean.clean_html(data))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment