Skip to content

Instantly share code, notes, and snippets.

@tantalor
Created December 15, 2009 04:06
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save tantalor/256684 to your computer and use it in GitHub Desktop.
Save tantalor/256684 to your computer and use it in GitHub Desktop.
#! /usr/bin/python
# -*- coding: utf-8 -*-
# (The MIT License)
#
# Copyright © 2009 John Tantalo
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the ‘Software’), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED ‘AS IS’, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import html5lib
from html5lib import treebuilders, treewalkers
from html5lib.tokenizer import HTMLTokenizer
from html5lib.serializer.htmlserializer import HTMLSerializer
class StripTags(HTMLTokenizer):
    """Tokenizer that passes through every token except tag tokens.

    Iterating an instance yields the tokens produced by the parent
    ``HTMLTokenizer`` whose ``"type"`` is not ``"StartTag"``, ``"EndTag"``
    or ``"EmptyTag"``.

    NOTE(review): token types are compared against their string names;
    confirm the installed html5lib reports ``token["type"]`` as strings.
    """

    def __iter__(self):
        """Yield the parent tokenizer's tokens, skipping tag tokens."""
        tag_kinds = ("StartTag", "EndTag", "EmptyTag")
        for tok in super(StripTags, self).__iter__():
            if tok["type"] in tag_kinds:
                # Tag tokens are dropped; everything else is kept.
                continue
            yield tok
def strip_tags(html):
    """Return ``html`` re-serialized after parsing it without tag tokens.

    The input is parsed as a fragment by an ``html5lib.HTMLParser`` that
    uses the ``StripTags`` tokenizer and a DOM tree builder; the resulting
    tree is walked with the DOM tree walker and rendered by
    ``HTMLSerializer``.

    Args:
        html: Markup to process.

    Returns:
        The value produced by ``HTMLSerializer.render`` for the parsed
        fragment, or ``None`` when ``html`` is falsy (e.g. ``""`` or
        ``None``).
    """
    if not html:
        # Nothing to parse; falsy input yields None.
        return None

    dom_builder = treebuilders.getTreeBuilder("dom")
    fragment = html5lib.HTMLParser(
        tree=dom_builder, tokenizer=StripTags
    ).parseFragment(html)

    dom_walker = treewalkers.getTreeWalker("dom")
    return HTMLSerializer().render(dom_walker(fragment))
#! /usr/bin/python
# -*- coding: utf-8 -*-
# (The MIT License)
#
# Copyright © 2009 John Tantalo
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the ‘Software’), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED ‘AS IS’, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from strip_tags import strip_tags
import unittest
class StripTagsTest(unittest.TestCase):
    """Unit tests for ``strip_tags``."""

    def test_strip_tags(self):
        """Strip all tags."""
        # Deliberately malformed markup: stray end tags, an unknown element
        # and an unclosed <script>.
        encoded = '<p>This</table> <script>is <foo>bad</html> html.<body>'
        decoded = u'This is bad html.'
        # assertEqual is the supported spelling; the assertEquals alias is
        # deprecated and was removed in Python 3.12.
        self.assertEqual(strip_tags(encoded), decoded)
# Run the test suite only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment