Skip to content

Instantly share code, notes, and snippets.

@archatas
Created May 7, 2023 15:37
Show Gist options
  • Save archatas/b8320a9230ec5d28633488964b5f8cfa to your computer and use it in GitHub Desktop.
Save archatas/b8320a9230ec5d28633488964b5f8cfa to your computer and use it in GitHub Desktop.
Converting HTML to plain text
import re
from html.parser import HTMLParser
class HTMLStripper(HTMLParser):
    """Collect the text content of an HTML document as plain text.

    Tags themselves are discarded. A ``<br>`` start tag becomes a single
    newline, and the end tag of a paragraph, unordered list or heading
    (``h1``-``h6``) becomes a blank line. Runs of whitespace inside text
    are collapsed to one space.

    Typical use: ``feed()`` the markup, call ``close()`` to flush any
    buffered trailing text, then read the result with ``get_data()``.
    Calling ``reset()`` discards the collected text so the instance can be
    reused for another document.
    """

    # stackoverflow.com/questions/753052/strip-html-from-strings-in-python

    # End tags that are rendered as a blank line.
    _BLOCK_END_TAGS = frozenset(
        {"p", "ul", "h1", "h2", "h3", "h4", "h5", "h6"}
    )
    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    _WHITESPACE_RE = re.compile(r"\s+")
    # Spaces that end up directly before/after a line break, e.g. from the
    # indentation or newline between two block-level tags.
    _SPACES_AROUND_NEWLINE_RE = re.compile(r" *\n *")

    def __init__(self):
        # HTMLParser.__init__ calls reset(), which creates self.text.
        super().__init__(convert_charrefs=True)

    def reset(self):
        """Reset the parser state and drop all text collected so far."""
        super().reset()
        # Fragments of output text, joined in get_data().
        self.text = []

    def handle_starttag(self, tag: str, attrs):
        """Emit a newline for ``<br>``; every other start tag is ignored."""
        if tag.lower() == "br":
            self.text.append("\n")

    def handle_endtag(self, tag):
        """Emit a blank line after a paragraph, unordered list or heading."""
        if tag.lower() in self._BLOCK_END_TAGS:
            self.text.append("\n\n")

    def handle_data(self, data):
        """Store a text fragment with whitespace runs collapsed to a space."""
        self.text.append(self._WHITESPACE_RE.sub(" ", data))

    def get_data(self):
        """Return the collected text.

        Spaces adjacent to line breaks are removed and the result is
        stripped of leading and trailing whitespace.
        """
        joined = "".join(self.text)
        return self._SPACES_AROUND_NEWLINE_RE.sub("\n", joined).strip()
def strip_html_tags(html: str) -> str:
    """
    Strip tags from a text that contains HTML tags and return a text.

    The markup is run through a fresh ``HTMLStripper`` and whatever text
    that parser collected is returned.
    """
    stripper = HTMLStripper()
    stripper.feed(html)
    # feed() may hold back trailing text that could still be an unfinished
    # character reference (e.g. the "&T" in "AT&T"); close() forces the
    # parser to process everything that is still buffered.
    stripper.close()
    return stripper.get_data()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment