Last active
February 25, 2020 07:37
-
-
Save jspri/af441bc7dc7353d41390a59f20f07b51 to your computer and use it in GitHub Desktop.
Converts HTML to plain text in Python 3. Only the standard library is used.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
HTML -> text conversion.
See: http://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
"""
from html.parser import HTMLParser | |
from html.entities import name2codepoint | |
import re | |
class _HTMLToText(HTMLParser): | |
def __init__(self): | |
HTMLParser.__init__(self) | |
self._buf = [] | |
self.hide_output = False | |
def handle_starttag(self, tag, attrs): | |
if tag in ('p', 'br') and not self.hide_output: | |
self._buf.append('\n') | |
elif tag in ('script', 'style'): | |
self.hide_output = True | |
def handle_startendtag(self, tag, attrs): | |
if tag == 'br': | |
self._buf.append('\n') | |
def handle_endtag(self, tag): | |
if tag == 'p': | |
self._buf.append('\n') | |
elif tag in ('script', 'style'): | |
self.hide_output = False | |
def handle_data(self, text): | |
if text and not self.hide_output: | |
self._buf.append(re.sub(r'\s+', ' ', text)) | |
def handle_entityref(self, name): | |
if name in name2codepoint and not self.hide_output: | |
c = chr(name2codepoint[name]) | |
self._buf.append(c) | |
def handle_charref(self, name): | |
if not self.hide_output: | |
n = int(name[1:], 16) if name.startswith('x') else int(name) | |
self._buf.append(chr(n)) | |
def get_text(self): | |
return re.sub(r' +', ' ', ''.join(self._buf)) | |
def html_to_text(html):
    """
    Given a piece of HTML, return the plain text it contains.

    Entities and character references are decoded; the contents of
    <script> and <style> elements are left out of the result.

    Parsing is best-effort: if the parser raises part-way through, the
    text collected up to that point is returned instead of propagating
    the error.
    """
    parser = _HTMLToText()
    try:
        parser.feed(html)
        parser.close()
    except Exception:
        # Deliberate best-effort: keep whatever was extracted before the
        # failure.  Only Exception is caught so that KeyboardInterrupt and
        # SystemExit still propagate to the caller.
        pass
    return parser.get_text()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment