Skip to content

Instantly share code, notes, and snippets.

@jspri
Last active February 25, 2020 07:37
Show Gist options
  • Save jspri/af441bc7dc7353d41390a59f20f07b51 to your computer and use it in GitHub Desktop.
Save jspri/af441bc7dc7353d41390a59f20f07b51 to your computer and use it in GitHub Desktop.
Converts html to plain text in python3. Only standard libraries used.
"""
HTML <-> text conversions.
http://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
"""
from html.parser import HTMLParser
from html.entities import name2codepoint
import re
class _HTMLToText(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self._buf = []
self.hide_output = False
def handle_starttag(self, tag, attrs):
if tag in ('p', 'br') and not self.hide_output:
self._buf.append('\n')
elif tag in ('script', 'style'):
self.hide_output = True
def handle_startendtag(self, tag, attrs):
if tag == 'br':
self._buf.append('\n')
def handle_endtag(self, tag):
if tag == 'p':
self._buf.append('\n')
elif tag in ('script', 'style'):
self.hide_output = False
def handle_data(self, text):
if text and not self.hide_output:
self._buf.append(re.sub(r'\s+', ' ', text))
def handle_entityref(self, name):
if name in name2codepoint and not self.hide_output:
c = chr(name2codepoint[name])
self._buf.append(c)
def handle_charref(self, name):
if not self.hide_output:
n = int(name[1:], 16) if name.startswith('x') else int(name)
self._buf.append(chr(n))
def get_text(self):
return re.sub(r' +', ' ', ''.join(self._buf))
def html_to_text(html):
"""
Given a piece of HTML, return the plain text it contains.
This handles entities and char refs, but not javascript and stylesheets.
"""
parser = _HTMLToText()
try:
parser.feed(html)
parser.close()
except: #HTMLParseError: No good replacement?
pass
return parser.get_text()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment