Created
October 17, 2013 05:19
-
-
Save lucemia/7019481 to your computer and use it in GitHub Desktop.
比 regular expression 更有效濾掉 html tag 的方式 (A more effective way to strip HTML tags than using regular expressions)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import cStringIO

from lxml import etree
from lxml.html import document_fromstring, parse
def remove_tags(html, strip_tags=("script",)):
    """Return *html* re-serialized with every *strip_tags* element removed.

    The markup is parsed with lxml's HTML parser, each element whose tag
    name appears in ``strip_tags`` is dropped together with its children
    and text, and what is left of the tree is serialized with
    ``etree.tostring``.

    Args:
        html: HTML markup as a string.
        strip_tags: Iterable of tag names to remove. Defaults to
            ``("script",)``; an immutable default avoids sharing one list
            object between calls.

    Returns:
        The output of ``etree.tostring`` for the parsed root element.
    """
    # Parse straight from the string: cStringIO cannot wrap unicode text
    # that contains non-ASCII characters.
    root = document_fromstring(html)
    for tag in strip_tags:
        # Snapshot the matches before dropping anything; iterating a tree
        # while its structure is being modified gives undefined results.
        for element in list(root.iter(tag)):
            element.drop_tree()
    return etree.tostring(root)
if __name__ == "__main__":
    # Demo fixture: the <head> of a Stack Overflow question page, which
    # contains several external and inline <script> elements plus <meta> tags.
    test_body = """
<head>
<title>javascript - how to remove text between <script> and </script> using python? - Stack Overflow</title>
<link rel="shortcut icon" href="//cdn.sstatic.net/stackoverflow/img/favicon.ico">
<link rel="apple-touch-icon image_src" href="//cdn.sstatic.net/stackoverflow/img/apple-touch-icon.png">
<link rel="search" type="application/opensearchdescription+xml" title="Stack Overflow" href="/opensearch.xml">
<meta name="twitter:card" content="summary">
<meta name="twitter:domain" content="stackoverflow.com"/>
<meta name="og:type" content="website" />
<meta name="og:image" content="http://cdn.sstatic.net/stackoverflow/img/apple-touch-icon@2.png?v=fde65a5a78c6"/>
<meta name="og:title" content="how to remove text between <script> and </script> using python?" />
<meta name="og:description" content="If you're removing everything between &lt;script&gt; and &lt;/script&gt; why not just remove the entire node?
Are you expecting a resig-style src and body?
" />
<meta name="og:url" content="http://stackoverflow.com/questions/964459/how-to-remove-text-between-script-and-script-using-python/964487"/>
<script type="text/javascript" src="//ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js"></script>
<script src="//cdn.sstatic.net/Js/stub.en.js?v=3a1a88fb5ba5" type="text/javascript"></script>
<link rel="stylesheet" type="text/css" href="//cdn.sstatic.net/stackoverflow/all.css?v=eee059983a6e">
<link rel="alternate" type="application/atom+xml" title="Feed for question 'how to remove text between <script> and </script> using python?'" href="/feeds/question/964459">
<script type="text/javascript">
StackExchange.ready(function () {
StackExchange.using("postValidation", function () {
StackExchange.postValidation.initOnBlurAndSubmit($('#post-form'), 2, 'answer');
});
StackExchange.question.init({canOpenBounty:true,votesCast:[{"PostId":964459,"VoteTypeId":2},{"PostId":964459,"VoteTypeId":5},{"PostId":964485,"VoteTypeId":2},{"PostId":965236,"VoteTypeId":2}],canViewVoteCounts:true,totalCommentCount:0,shownCommentCount:0,highlightColor:'#F4A83D',backgroundColor:'#FFF',questionId:964459});
styleCode();
StackExchange.realtime.subscribeToQuestion('1', '964459');
});
</script>
<script type="text/javascript">
StackExchange.ready(function () {
StackExchange.realtime.init('ws://sockets.ny.stackexchange.com');
StackExchange.realtime.subscribeToInboxNotifications();
StackExchange.realtime.subscribeToReputationNotifications('1');
});
</script>
<script type="text/javascript">
StackExchange.init({"locale":"en","stackAuthUrl":"https://stackauth.com","serverTime":1381937438,"styleCode":true,"enableUserHovercards":true,"site":{"name":"Stack Overflow","description":"Q&A for professional and enthusiast programmers","isNoticesTabEnabled":true,"recaptchaPublicKey":"6LdchgIAAAAAAJwGpIzRQSOFaO0pU6s44Xt8aTwc","enableSocialMediaInSharePopup":true},"user":{"fkey":"d1835a982835720138953e5727236438","isRegistered":true,"userType":3,"userId":482169,"accountId":223139,"gravatar":"<div class=\"\"><img src=\"https://www.gravatar.com/avatar/5bc4178afdb6745063547b26ceac5742?s=32&d=identicon&r=PG\" alt=\"\" width=\"32\" height=\"32\"></div>","profileUrl":"http://stackoverflow.com/users/482169/lucemia","notificationsUnviewedCount":0,"inboxUnviewedCount":0}});
StackExchange.using.setCacheBreakers({"js/prettify-full.en.js":"e0bbd4760e83","js/moderator.en.js":"1a411fd265fe","js/full-anon.en.js":"d2d6bfa6e0f2","js/full.en.js":"70679ae5f38a","js/wmd.en.js":"3b1fa237ee0a","js/third-party/jquery.autocomplete.min.js":"e5f01e97f7c3","js/third-party/jquery.autocomplete.min.en.js":"","js/mobile.en.js":"98eaf3ec452d","js/help.en.js":"8891d63dccb3","js/tageditor.en.js":"f18986485d8f","js/tageditornew.en.js":"172e95af9b71","js/inline-tag-editing.en.js":"499a39b24fd2","js/revisions.en.js":"1dead817b481","js/review.en.js":"53b28a79980b","js/tagsuggestions.en.js":"a7d0f3ff530a","js/post-validation.en.js":"46508df60bf9","js/explore-qlist.en.js":"73825bd006fc","js/events.en.js":"130d4e07b47b"});
</script>
</head>
"""
    # First call uses the default and strips only <script> elements; the
    # second call strips <meta> elements as well.
    print(remove_tags(test_body))
    print(remove_tags(test_body, ["meta", "script"]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment