Skip to content

Instantly share code, notes, and snippets.

@dervn
Created March 8, 2011 02:08
Show Gist options
  • Star 98 You must be signed in to star a gist
  • Fork 33 You must be signed in to fork a gist
  • Save dervn/859717 to your computer and use it in GitHub Desktop.
Save dervn/859717 to your computer and use it in GitHub Desktop.
Python中过滤HTML标签的函数
# Strip HTML tags (anything shaped like <...>) with one regular expression.
import re

# Named `text` rather than `str` so the built-in `str` type is not shadowed
# for the rest of the module.
text = "<img /><a>srcd</a>hello</br><br/>"
# The pattern matches an opening or closing tag: "<", an optional "/", a tag
# name, then everything up to the next ">".
text = re.sub(r'</?\w+[^>]*>', '', text)
# Parenthesised form is valid on both Python 2 and Python 3.
print(text)
# Uses the standard-library HTML parser; a regex-based approach is an
# alternative for simple input.
def strip_tags(html):
    """Return the text content of *html* with the markup removed.

    Surrounding whitespace is trimmed from the input, the remainder is fed
    through the standard-library ``HTMLParser``, and every text chunk the
    parser reports is concatenated in document order.

    :param html: HTML source string.
    :returns: the concatenated text chunks as a single string.

    >>> str_text = strip_tags("<font color=red>hello</font>")
    >>> print(str_text)
    hello
    """
    # The parser class lives in a different module on Python 3 and Python 2.
    try:
        from html.parser import HTMLParser
    except ImportError:
        from HTMLParser import HTMLParser

    chunks = []
    parser = HTMLParser()
    # Route every text node the parser sees straight into the list.
    parser.handle_data = chunks.append
    # strip() without arguments already removes newlines, so a single call
    # is enough.
    parser.feed(html.strip())
    # close() flushes any text the parser is still buffering.
    parser.close()
    return ''.join(chunks)
#更深层次的过滤,类似instapaper或者readitlater这种服务,很有意思的研究课题
# -*- coding: utf-8 -*-
import re
# Patterns used by filter_tags, compiled once at import time.
# re.S lets ".*?" span line breaks, so multi-line blocks and blocks whose
# content contains "<" or ">" are still matched as a whole.
_RE_CDATA = re.compile(r'//<!\[CDATA\[.*?//\]\]>', re.I | re.S)
_RE_SCRIPT = re.compile(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S)
_RE_STYLE = re.compile(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S)
_RE_COMMENT = re.compile(r'<!--.*?-->', re.S)
_RE_BR = re.compile(r'<br\s*/?>', re.I)
_RE_TAG = re.compile(r'</?\w+[^>]*>')
_RE_BLANK_LINES = re.compile(r'\n+')


def filter_tags(htmlstr):
    """Reduce an HTML document to plain text.

    Steps, in order:

    1. drop ``//<![CDATA[ ... //]]>`` sections;
    2. drop ``<script>`` and ``<style>`` elements together with their content;
    3. drop HTML comments;
    4. turn ``<br>`` / ``<br/>`` (any letter case) into a newline;
    5. drop every remaining opening or closing tag;
    6. collapse runs of newlines into a single newline;
    7. decode character entities with ``replaceCharEntity``.

    :param htmlstr: HTML source string.
    :returns: the text left after the steps above.
    """
    s = _RE_CDATA.sub('', htmlstr)
    s = _RE_SCRIPT.sub('', s)
    s = _RE_STYLE.sub('', s)
    # Comments go before the generic tag pass so that markup inside a comment
    # cannot break the comment apart and leave its text behind.
    s = _RE_COMMENT.sub('', s)
    s = _RE_BR.sub('\n', s)
    s = _RE_TAG.sub('', s)
    s = _RE_BLANK_LINES.sub('\n', s)
    return replaceCharEntity(s)
# Maps the text found between "&" (or "&#") and ";" to its replacement
# character. Add entries here to decode more entities.
CHAR_ENTITIES = {
    'nbsp': ' ', '160': ' ',
    'lt': '<', '60': '<',
    'gt': '>', '62': '>',
    'amp': '&', '38': '&',
    'quot': '"', '34': '"',
}
_RE_CHAR_ENTITY = re.compile(r'&#?(?P<name>\w+);')


def replaceCharEntity(htmlstr):
    """Replace common HTML character entities with the characters they stand for.

    Every ``&name;`` / ``&#code;`` reference whose name or code is a key of
    ``CHAR_ENTITIES`` is replaced by the mapped character; any other
    reference is replaced by the empty string.

    The string is processed in a single left-to-right pass, so text produced
    by a replacement is never decoded a second time: ``"&amp;lt;"`` becomes
    ``"&lt;"``, not ``"<"``.

    :param htmlstr: string that may contain character entities.
    :returns: the string with entities replaced.
    """
    def _decode(match):
        # Unknown entities are deliberately dropped.
        return CHAR_ENTITIES.get(match.group('name'), '')

    return _RE_CHAR_ENTITY.sub(_decode, htmlstr)
def repalce(s, re_exp, repl_string):
    """Return *s* with every match of *re_exp* replaced by *repl_string*.

    The misspelled name is kept so existing callers continue to work.

    :param s: input string.
    :param re_exp: compiled regular expression object (its ``sub`` method
        is called).
    :param repl_string: replacement passed to ``re_exp.sub``.
    :returns: the substituted string.
    """
    return re_exp.sub(repl_string, s)
if __name__ == '__main__':
    # Demo: read a saved page from the working directory, strip it down to
    # text and print the result.
    # open() replaces the Python 2-only file() built-in, and the with-block
    # closes the handle once the content has been read.
    with open('Google.htm') as source:
        s = source.read()
    news = filter_tags(s)
    print(news)
@vickyi
Copy link

vickyi commented Jan 29, 2013

学习了

@naoyeye
Copy link

naoyeye commented Apr 13, 2013

thanks!

@YvesChan
Copy link

re.py 用于简单的标签过滤还是很简洁的!

@pyshift
Copy link

pyshift commented Apr 12, 2014

不错,一直想找去除&gt这样的。学习了。

@taizilongxu
Copy link

简洁明了

@xcrossed
Copy link

xcrossed commented Sep 5, 2016

要善于利用已经有的工具。来,一行搞定。

# Alternative suggested in the discussion: build a PyQuery document from the
# markup and ask it for its text content (requires the third-party `pyquery`
# package).
# NOTE(review): `print doc.text()` below is Python 2 print-statement syntax.
from pyquery import PyQuery
doc = PyQuery('<div><span>toto</span><span>tata</span></div>')
print doc.text()

'toto tata'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment