-
-
Save havocesp/d183db6ba6863fd6c2f73a2046a96106 to your computer and use it in GitHub Desktop.
Converts HTML to plain text in Python 3. Only the standard library is used.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
HTML <-> text conversions. | |
http://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python | |
""" | |
from html.parser import HTMLParser | |
from html.entities import name2codepoint | |
import re | |
class _HTMLToText(HTMLParser): | |
def __init__(self): | |
HTMLParser.__init__(self) | |
self._buf = [] | |
self.hide_output = False | |
def handle_starttag(self, tag, attrs): | |
if tag in ('p', 'br') and not self.hide_output: | |
self._buf.append('\n') | |
elif tag in ('script', 'style'): | |
self.hide_output = True | |
def handle_startendtag(self, tag, attrs): | |
if tag == 'br': | |
self._buf.append('\n') | |
def handle_endtag(self, tag): | |
if tag == 'p': | |
self._buf.append('\n') | |
elif tag in ('script', 'style'): | |
self.hide_output = False | |
def handle_data(self, text): | |
if text and not self.hide_output: | |
self._buf.append(re.sub(r'\s+', ' ', text)) | |
def handle_entityref(self, name): | |
if name in name2codepoint and not self.hide_output: | |
c = chr(name2codepoint[name]) | |
self._buf.append(c) | |
def handle_charref(self, name): | |
if not self.hide_output: | |
n = int(name[1:], 16) if name.startswith('x') else int(name) | |
self._buf.append(chr(n)) | |
def get_text(self): | |
return re.sub(r' +', ' ', ''.join(self._buf)) | |
def html_to_text(html):
    """Return the plain text contained in a piece of HTML.

    Entity and character references are decoded. The contents of
    ``<script>`` and ``<style>`` elements are dropped, not rendered.

    Conversion is best-effort: if parsing raises part-way through, the
    text collected up to that point is returned instead of the error
    being propagated.

    :param html: HTML markup to convert.
    :return: the extracted text.
    """
    parser = _HTMLToText()
    try:
        parser.feed(html)
        parser.close()
    except Exception:
        # Deliberately swallowed so callers always get whatever text was
        # extracted. Catching Exception (rather than a bare ``except``)
        # lets KeyboardInterrupt and SystemExit propagate.
        pass
    return parser.get_text()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
2020-02-25 07:31:05 DEBUG Tautulli WebSocket :: Leaving thread.
2020-02-25 07:31:05 INFO Re-scheduled background task: Check for server response
2020-02-25 07:31:05 ERROR Tautulli WebSocket :: [Errno 111] Connection refused.
2020-02-25 07:31:05 INFO Tautulli WebSocket :: Opening websocket.
2020-02-25 07:31:05 DEBUG Tautulli Monitor :: Unable to read session list.
2020-02-25 07:31:05 DEBUG Tautulli Monitor :: Checking for active streams.
2020-02-25 07:31:05 WARNING Tautulli Pmsconnect :: Unable to parse XML for get_current_activity: 'NoneType' object has no attribute 'getElementsByTagName'.
2020-02-25 07:31:05 WARNING Failed to access uri endpoint /status/sessions. Is your server maybe accepting SSL connections only? HTTPConnectionPool(host='172.18.0.5', port=32400): Max retries exceeded with url: /status/sessions (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x14a808ad2910>: Failed to establish a new connection: [Errno 111] Connection refused',))
2020-02-25 07:30:05 DEBUG Tautulli WebSocket :: Leaving thread.
2020-02-25 07:30:05 INFO Re-scheduled background task: Check for server response
2020-02-25 07:30:05 ERROR Tautulli WebSocket :: [Errno 111] Connection refused.
2020-02-25 07:30:05 INFO Tautulli WebSocket :: Opening websocket.
2020-02-25 07:30:05 DEBUG Tautulli Monitor :: Unable to read session list.
2020-02-25 07:30:05 DEBUG Tautulli Monitor :: Checking for active streams.
2020-02-25 07:30:05 WARNING Tautulli Pmsconnect :: Unable to parse XML for get_current_activity: 'NoneType' object has no attribute 'getElementsByTagName'.