tantalor/strip_tags.py

## strip_tags.py
#! /usr/bin/python
# -*- coding: utf-8 -*-

# (The MIT License)
#
# Copyright © 2009 John Tantalo
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the ‘Software’), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED ‘AS IS’, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import html5lib
from html5lib import treebuilders, treewalkers
from html5lib.tokenizer import HTMLTokenizer
from html5lib.serializer.htmlserializer import HTMLSerializer

class StripTags(HTMLTokenizer):
  """Tokenizer that passes through every token except tag tokens.

  Iterating an instance yields the tokens produced by HTMLTokenizer, minus
  any token whose "type" entry is "StartTag", "EndTag" or "EmptyTag".
  """

  def __iter__(self):
    """Yield each parent-tokenizer token that is not a tag token."""
    # NOTE(review): the comparison is against string type names; confirm the
    # html5lib version in use reports token types as these strings.
    tag_types = ("StartTag", "EndTag", "EmptyTag")
    for tok in super(StripTags, self).__iter__():
      if tok["type"] in tag_types:
        continue
      yield tok

def strip_tags(html):
  """Return `html` re-serialized after its tag tokens have been dropped.

  The input is parsed as a fragment by an html5lib HTMLParser that uses the
  StripTags tokenizer and a "dom" tree builder; the resulting tree is then
  walked with the "dom" tree walker and rendered by an HTMLSerializer.

  Falsy input (for example None or an empty string) is not parsed and the
  function returns None.
  """
  if not html:
    return None
  dom_builder = treebuilders.getTreeBuilder("dom")
  fragment = html5lib.HTMLParser(
      tree=dom_builder, tokenizer=StripTags).parseFragment(html)
  token_stream = treewalkers.getTreeWalker("dom")(fragment)
  return HTMLSerializer().render(token_stream)

## strip_tags.test.py
#! /usr/bin/python
# -*- coding: utf-8 -*-

# (The MIT License)
#
# Copyright © 2009 John Tantalo
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the ‘Software’), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED ‘AS IS’, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from strip_tags import strip_tags
import unittest


class StripTagsTest(unittest.TestCase):
  """Unit tests for strip_tags()."""

  def test_strip_tags(self):
    """Strip all tags."""
    # Deliberately malformed markup: stray end tags, an unknown element and
    # unclosed elements.
    encoded = '<p>This</table> <script>is <foo>bad</html> html.<body>'
    decoded = u'This is bad html.'
    # assertEqual is the supported spelling; the assertEquals alias is
    # deprecated and no longer exists in Python 3.12+.
    self.assertEqual(strip_tags(encoded), decoded)


# Run the unittest test runner only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
  unittest.main()
	#! /usr/bin/python
	# -*- coding: utf-8 -*-

	# (The MIT License)
	#
	# Copyright © 2009 John Tantalo
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy of
	# this software and associated documentation files (the ‘Software’), to deal in
	# the Software without restriction, including without limitation the rights to
	# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
	# the Software, and to permit persons to whom the Software is furnished to do so,
	# subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED ‘AS IS’, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
	# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
	# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
	# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

	import html5lib
	from html5lib import treebuilders, treewalkers
	from html5lib.tokenizer import HTMLTokenizer
	from html5lib.serializer.htmlserializer import HTMLSerializer

	class StripTags(HTMLTokenizer):
	  """Tokenizer that passes through every token except tag tokens.

	  Iterating an instance yields the tokens produced by HTMLTokenizer, minus
	  any token whose "type" entry is "StartTag", "EndTag" or "EmptyTag".
	  """

	  def __iter__(self):
	    """Yield each parent-tokenizer token that is not a tag token."""
	    for token in super(StripTags, self).__iter__():
	      # Tag tokens are dropped; every other token is forwarded unchanged.
	      if token["type"] not in ["StartTag", "EndTag", "EmptyTag"]:
	        yield token

	def strip_tags(html):
	  """Return `html` re-serialized after its tag tokens have been dropped.

	  The input is parsed as a fragment by an html5lib HTMLParser that uses the
	  StripTags tokenizer and a "dom" tree builder; the resulting tree is then
	  walked with the "dom" tree walker and rendered by an HTMLSerializer.

	  Falsy input (for example None or an empty string) is not parsed and the
	  function returns None.
	  """
	  if html:
	    builder = treebuilders.getTreeBuilder("dom")
	    # The custom tokenizer removes tag tokens before the tree is built.
	    parser = html5lib.HTMLParser(tree=builder, tokenizer=StripTags)
	    tree = parser.parseFragment(html)
	    walker = treewalkers.getTreeWalker("dom")
	    stream = walker(tree)
	    serializer = HTMLSerializer()
	    return serializer.render(stream)