#!/usr/bin/python
# -*- coding: utf-8 -*-
|
|
|
# (The MIT License)
#
# Copyright © 2009 John Tantalo
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the ‘Software’), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED ‘AS IS’, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
|
import html5lib |
|
from html5lib import treebuilders, treewalkers |
|
from html5lib.tokenizer import HTMLTokenizer |
|
from html5lib.serializer.htmlserializer import HTMLSerializer |
|
|
|
class StripTags(HTMLTokenizer):
    """Tokenizer that drops tag tokens and passes every other token through."""

    # Token "type" values that denote element markup and are filtered out.
    # NOTE(review): these are string type names; some html5lib releases may
    # label tokens with integer codes (html5lib.constants.tokenTypes) instead
    # -- confirm against the installed version.
    _TAG_TYPES = ("StartTag", "EndTag", "EmptyTag")

    def __iter__(self):
        """Yield each token from the base tokenizer whose type is not a tag."""
        for tok in super(StripTags, self).__iter__():
            if tok["type"] in self._TAG_TYPES:
                continue
            yield tok
|
|
|
def strip_tags(html):
    """Parse *html* as a fragment with tag tokens removed and re-serialize it.

    The input is tokenized with ``StripTags``, built into a DOM tree, walked,
    and rendered with ``HTMLSerializer``.

    Returns the rendered output, or ``None`` when *html* is falsy
    (e.g. ``None`` or an empty string).
    """
    if not html:
        return None

    dom_builder = treebuilders.getTreeBuilder("dom")
    fragment = html5lib.HTMLParser(
        tree=dom_builder, tokenizer=StripTags
    ).parseFragment(html)

    make_walker = treewalkers.getTreeWalker("dom")
    token_stream = make_walker(fragment)

    return HTMLSerializer().render(token_stream)