Skip to content

Instantly share code, notes, and snippets.

@lucemia
Created October 17, 2013 05:19
Show Gist options
  • Save lucemia/7019481 to your computer and use it in GitHub Desktop.
Save lucemia/7019481 to your computer and use it in GitHub Desktop.
比 regular expression 更有效濾掉 html tag 的方式
from lxml.html import parse
from lxml import etree
import cStringIO
def remove_tags(html, strip_tags=("script",)):
    """Return *html* re-serialized with every element named in *strip_tags* removed.

    The markup is parsed with ``lxml.html.parse``; each matching element is
    removed together with its children via ``drop_tree()``, and the document
    root is then serialized with ``etree.tostring``.

    Args:
        html: Markup to clean, as a string accepted by ``cStringIO.StringIO``.
        strip_tags: Iterable of tag names to remove. Defaults to only
            ``script``. The default is a tuple so no mutable object is shared
            between calls.

    Returns:
        The serialized root element as produced by ``etree.tostring``.
    """
    buf = cStringIO.StringIO(html)
    root = parse(buf).getroot()
    for tag in strip_tags:
        # Snapshot the matches before mutating the tree: dropping elements
        # while the live iterator is still walking can make it follow a
        # detached subtree and miss later matches (e.g. nested same-name tags).
        for element in list(root.iter(tag)):
            element.drop_tree()
    return etree.tostring(root)
if __name__ == "__main__":
    # Demo fixture: the <head> of a Stack Overflow question page, containing
    # several <meta>, <link> and <script> elements to strip.
    sample_markup = """
<head>
<title>javascript - how to remove text between &lt;script&gt; and &lt;/script&gt; using python? - Stack Overflow</title>
<link rel="shortcut icon" href="//cdn.sstatic.net/stackoverflow/img/favicon.ico">
<link rel="apple-touch-icon image_src" href="//cdn.sstatic.net/stackoverflow/img/apple-touch-icon.png">
<link rel="search" type="application/opensearchdescription+xml" title="Stack Overflow" href="/opensearch.xml">
<meta name="twitter:card" content="summary">
<meta name="twitter:domain" content="stackoverflow.com"/>
<meta name="og:type" content="website" />
<meta name="og:image" content="http://cdn.sstatic.net/stackoverflow/img/apple-touch-icon@2.png?v=fde65a5a78c6"/>
<meta name="og:title" content="how to remove text between &lt;script&gt; and &lt;/script&gt; using python?" />
<meta name="og:description" content="If you&#39;re removing everything between &amp;lt;script&amp;gt; and &amp;lt;/script&amp;gt; why not just remove the entire node?
Are you expecting a resig-style src and body?
" />
<meta name="og:url" content="http://stackoverflow.com/questions/964459/how-to-remove-text-between-script-and-script-using-python/964487"/>
<script type="text/javascript" src="//ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js"></script>
<script src="//cdn.sstatic.net/Js/stub.en.js?v=3a1a88fb5ba5" type="text/javascript"></script>
<link rel="stylesheet" type="text/css" href="//cdn.sstatic.net/stackoverflow/all.css?v=eee059983a6e">
<link rel="alternate" type="application/atom+xml" title="Feed for question &#39;how to remove text between &lt;script&gt; and &lt;/script&gt; using python?&#39;" href="/feeds/question/964459">
<script type="text/javascript">
StackExchange.ready(function () {
StackExchange.using("postValidation", function () {
StackExchange.postValidation.initOnBlurAndSubmit($('#post-form'), 2, 'answer');
});
StackExchange.question.init({canOpenBounty:true,votesCast:[{"PostId":964459,"VoteTypeId":2},{"PostId":964459,"VoteTypeId":5},{"PostId":964485,"VoteTypeId":2},{"PostId":965236,"VoteTypeId":2}],canViewVoteCounts:true,totalCommentCount:0,shownCommentCount:0,highlightColor:'#F4A83D',backgroundColor:'#FFF',questionId:964459});
styleCode();
StackExchange.realtime.subscribeToQuestion('1', '964459');
});
</script>
<script type="text/javascript">
StackExchange.ready(function () {
StackExchange.realtime.init('ws://sockets.ny.stackexchange.com');
StackExchange.realtime.subscribeToInboxNotifications();
StackExchange.realtime.subscribeToReputationNotifications('1');
});
</script>
<script type="text/javascript">
StackExchange.init({"locale":"en","stackAuthUrl":"https://stackauth.com","serverTime":1381937438,"styleCode":true,"enableUserHovercards":true,"site":{"name":"Stack Overflow","description":"Q&A for professional and enthusiast programmers","isNoticesTabEnabled":true,"recaptchaPublicKey":"6LdchgIAAAAAAJwGpIzRQSOFaO0pU6s44Xt8aTwc","enableSocialMediaInSharePopup":true},"user":{"fkey":"d1835a982835720138953e5727236438","isRegistered":true,"userType":3,"userId":482169,"accountId":223139,"gravatar":"<div class=\"\"><img src=\"https://www.gravatar.com/avatar/5bc4178afdb6745063547b26ceac5742?s=32&d=identicon&r=PG\" alt=\"\" width=\"32\" height=\"32\"></div>","profileUrl":"http://stackoverflow.com/users/482169/lucemia","notificationsUnviewedCount":0,"inboxUnviewedCount":0}});
StackExchange.using.setCacheBreakers({"js/prettify-full.en.js":"e0bbd4760e83","js/moderator.en.js":"1a411fd265fe","js/full-anon.en.js":"d2d6bfa6e0f2","js/full.en.js":"70679ae5f38a","js/wmd.en.js":"3b1fa237ee0a","js/third-party/jquery.autocomplete.min.js":"e5f01e97f7c3","js/third-party/jquery.autocomplete.min.en.js":"","js/mobile.en.js":"98eaf3ec452d","js/help.en.js":"8891d63dccb3","js/tageditor.en.js":"f18986485d8f","js/tageditornew.en.js":"172e95af9b71","js/inline-tag-editing.en.js":"499a39b24fd2","js/revisions.en.js":"1dead817b481","js/review.en.js":"53b28a79980b","js/tagsuggestions.en.js":"a7d0f3ff530a","js/post-validation.en.js":"46508df60bf9","js/explore-qlist.en.js":"73825bd006fc","js/events.en.js":"130d4e07b47b"});
</script>
</head>
"""
    # First run relies on the default tag list (only <script> is stripped).
    print(remove_tags(sample_markup))
    # Second run strips <meta> elements as well as <script>.
    print(remove_tags(sample_markup, ["meta", "script"]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment