Skip to content

Instantly share code, notes, and snippets.

@havocesp
Forked from jspri/html_to_text.py
Created February 25, 2020 07:37
Show Gist options
  • Save havocesp/d183db6ba6863fd6c2f73a2046a96106 to your computer and use it in GitHub Desktop.
Save havocesp/d183db6ba6863fd6c2f73a2046a96106 to your computer and use it in GitHub Desktop.
Converts HTML to plain text in Python 3. Only the standard library is used.
"""
HTML -> plain-text conversion.
http://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
"""
from html.parser import HTMLParser
from html.entities import name2codepoint
import re
class _HTMLToText(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self._buf = []
self.hide_output = False
def handle_starttag(self, tag, attrs):
if tag in ('p', 'br') and not self.hide_output:
self._buf.append('\n')
elif tag in ('script', 'style'):
self.hide_output = True
def handle_startendtag(self, tag, attrs):
if tag == 'br':
self._buf.append('\n')
def handle_endtag(self, tag):
if tag == 'p':
self._buf.append('\n')
elif tag in ('script', 'style'):
self.hide_output = False
def handle_data(self, text):
if text and not self.hide_output:
self._buf.append(re.sub(r'\s+', ' ', text))
def handle_entityref(self, name):
if name in name2codepoint and not self.hide_output:
c = chr(name2codepoint[name])
self._buf.append(c)
def handle_charref(self, name):
if not self.hide_output:
n = int(name[1:], 16) if name.startswith('x') else int(name)
self._buf.append(chr(n))
def get_text(self):
return re.sub(r' +', ' ', ''.join(self._buf))
def html_to_text(html):
    """
    Given a piece of HTML, return the plain text it contains.

    Entities and character references are decoded.  The contents of
    <script> and <style> elements are omitted from the result.

    Conversion is best-effort: if parsing raises part-way through, the
    text collected up to that point is returned instead of an error.
    """
    parser = _HTMLToText()
    try:
        parser.feed(html)
        parser.close()
    except Exception:
        # Deliberately best-effort: keep whatever text was gathered so far.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        pass
    return parser.get_text()
@thorrrr
Copy link

thorrrr commented Feb 25, 2020

2020-02-25 07:31:05 DEBUG Tautulli WebSocket :: Leaving thread.
2020-02-25 07:31:05 INFO Re-scheduled background task: Check for server response
2020-02-25 07:31:05 ERROR Tautulli WebSocket :: [Errno 111] Connection refused.
2020-02-25 07:31:05 INFO Tautulli WebSocket :: Opening websocket.
2020-02-25 07:31:05 DEBUG Tautulli Monitor :: Unable to read session list.
2020-02-25 07:31:05 DEBUG Tautulli Monitor :: Checking for active streams.
2020-02-25 07:31:05 WARNING Tautulli Pmsconnect :: Unable to parse XML for get_current_activity: 'NoneType' object has no attribute 'getElementsByTagName'.
2020-02-25 07:31:05 WARNING Failed to access uri endpoint /status/sessions. Is your server maybe accepting SSL connections only? HTTPConnectionPool(host='172.18.0.5', port=32400): Max retries exceeded with url: /status/sessions (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x14a808ad2910>: Failed to establish a new connection: [Errno 111] Connection refused',))
2020-02-25 07:30:05 DEBUG Tautulli WebSocket :: Leaving thread.
2020-02-25 07:30:05 INFO Re-scheduled background task: Check for server response
2020-02-25 07:30:05 ERROR Tautulli WebSocket :: [Errno 111] Connection refused.
2020-02-25 07:30:05 INFO Tautulli WebSocket :: Opening websocket.
2020-02-25 07:30:05 DEBUG Tautulli Monitor :: Unable to read session list.
2020-02-25 07:30:05 DEBUG Tautulli Monitor :: Checking for active streams.
2020-02-25 07:30:05 WARNING Tautulli Pmsconnect :: Unable to parse XML for get_current_activity: 'NoneType' object has no attribute 'getElementsByTagName'.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment