archatas/html_to_text.py

## html_to_text.py
import re
from html.parser import HTMLParser


class HTMLStripper(HTMLParser):
    """Collect the text content of HTML markup, discarding the tags.

    Feed markup with ``feed()`` and read the result with ``get_data()``.
    A ``<br>`` becomes a newline; the end of a paragraph, a ``<ul>`` list or
    a heading becomes a blank line. Every other tag is dropped silently.
    Runs of whitespace inside text are collapsed to a single space.

    Adapted from
    stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    """

    # Closing any of these tags ends a block of text.
    _BLOCK_TAGS = frozenset({"p", "ul", "h1", "h2", "h3", "h4", "h5", "h6"})

    # Raw string on purpose: "\s" in a plain literal is an invalid escape
    # sequence. Compiled once instead of on every handle_data() call.
    _WHITESPACE_RUN = re.compile(r"\s+")

    def __init__(self) -> None:
        # HTMLParser.__init__ calls reset() itself, so no explicit reset()
        # is needed. convert_charrefs=True makes the parser hand entities
        # such as "&amp;" to handle_data() already decoded.
        super().__init__(convert_charrefs=True)
        # Text fragments in document order; joined by get_data().
        self.text = []

    def handle_starttag(self, tag: str, attrs) -> None:
        """Emit a line break for ``<br>``; ignore every other opening tag."""
        if tag.lower() == "br":
            self.text.append("\n")

    def handle_endtag(self, tag: str) -> None:
        """Emit a blank line after a closing paragraph, list or heading tag."""
        if tag.lower() in self._BLOCK_TAGS:
            self.text.append("\n\n")

    def handle_data(self, data: str) -> None:
        """Store a run of text with consecutive whitespace collapsed."""
        self.text.append(self._WHITESPACE_RUN.sub(" ", data))

    def get_data(self) -> str:
        """Return all text collected so far, stripped at both ends."""
        return "".join(self.text).strip()


def strip_html_tags(html: str) -> str:
    """
    Strip tags from a text that contains HTML tags and return a text.

    The markup is run through ``HTMLStripper``; the text it collected is
    returned with leading and trailing whitespace removed.
    """
    stripper = HTMLStripper()
    stripper.feed(html)
    # feed() may hold back trailing text while it waits for more input
    # (for instance after an "&" that could still turn into an entity).
    # close() tells the parser the input is complete so nothing is lost.
    stripper.close()
    return stripper.get_data()
	import re
	from html.parser import HTMLParser


	class HTMLStripper(HTMLParser):
	    """Collect the text content of HTML markup, discarding the tags.

	    Feed markup with ``feed()`` and read the result with ``get_data()``.
	    A ``<br>`` becomes a newline; the end of a paragraph, a ``<ul>`` list or
	    a heading becomes a blank line. Every other tag is dropped silently.
	    Runs of whitespace inside text are collapsed to a single space.

	    Adapted from
	    stackoverflow.com/questions/753052/strip-html-from-strings-in-python
	    """

	    # Closing any of these tags ends a block of text.
	    _BLOCK_TAGS = frozenset({"p", "ul", "h1", "h2", "h3", "h4", "h5", "h6"})

	    # Raw string on purpose: "\s" in a plain literal is an invalid escape
	    # sequence. Compiled once instead of on every handle_data() call.
	    _WHITESPACE_RUN = re.compile(r"\s+")

	    def __init__(self) -> None:
	        # HTMLParser.__init__ calls reset() itself, so no explicit reset()
	        # is needed. convert_charrefs=True makes the parser hand entities
	        # such as "&amp;" to handle_data() already decoded.
	        super().__init__(convert_charrefs=True)
	        # Text fragments in document order; joined by get_data().
	        self.text = []

	    def handle_starttag(self, tag: str, attrs) -> None:
	        """Emit a line break for ``<br>``; ignore every other opening tag."""
	        if tag.lower() == "br":
	            self.text.append("\n")

	    def handle_endtag(self, tag: str) -> None:
	        """Emit a blank line after a closing paragraph, list or heading tag."""
	        if tag.lower() in self._BLOCK_TAGS:
	            self.text.append("\n\n")

	    def handle_data(self, data: str) -> None:
	        """Store a run of text with consecutive whitespace collapsed."""
	        self.text.append(self._WHITESPACE_RUN.sub(" ", data))

	    def get_data(self) -> str:
	        """Return all text collected so far, stripped at both ends."""
	        return "".join(self.text).strip()


	def strip_html_tags(html: str) -> str:
	    """
	    Strip tags from a text that contains HTML tags and return a text.

	    The markup is run through ``HTMLStripper``; the text it collected is
	    returned with leading and trailing whitespace removed.
	    """
	    stripper = HTMLStripper()
	    stripper.feed(html)
	    # feed() may hold back trailing text while it waits for more input
	    # (for instance after an "&" that could still turn into an entity).
	    # close() tells the parser the input is complete so nothing is lost.
	    stripper.close()
	    return stripper.get_data()