Skip to content

Instantly share code, notes, and snippets.

@kageurufu
Created August 31, 2016 14:55
Show Gist options
  • Save kageurufu/47ed6c458aaee709e6de49ac5bb96c72 to your computer and use it in GitHub Desktop.
Save kageurufu/47ed6c458aaee709e6de49ac5bb96c72 to your computer and use it in GitHub Desktop.
"""
html_to_text
Parses HTML and converts it to plain text, formatted to match the original intent.
For example, the following HTML would be converted as follows:
<html>
<head>
<style>
/* This should NOT be in the output */
</style>
<body>
<p>
This is not a quote.
But it is a long paragraph with no line breaks.
</p>
<blockquote>
<p>This is a quote</p>
<p>As well as this</p>
<blockquote>
<p>Quoteception</p>
<blockquote>
<p>Quoteception</p>
<blockquote>
<p>Quoteception</p>
</blockquote>
</blockquote>
</blockquote>
</blockquote>
<ol>
<li>Test</li>
<li>Test</li>
<li>Test</li>
</ol>
<ul>
<li>Test</li>
<li>Test</li>
<li>Test</li>
</ul>
<p>
<a href="mailto:some@domain.com">Mailto links should be included</a><br>
<a href="mailto:some@domain.com">except where it would cause duplication such as some@domain.com</a><br/>
<a href="http://comanage.com">But normal links should</a>
</p>
</body>
</html>
>>> print(html_to_text(__doc__[__doc__.find('<html>'):__doc__.find('</html>') + 7])) # doctest: +NORMALIZE_WHITESPACE
This is not a quote. But it is a long paragraph with no line breaks.
<BLANKLINE>
> This is a quote
>
> As well as this
>
> > Quoteception
> >
> > > Quoteception
> > >
> > > > Quoteception
<BLANKLINE>
0. Test
1. Test
2. Test
<BLANKLINE>
- Test
- Test
- Test
<BLANKLINE>
Mailto links should be included ( some@domain.com )
except where it would cause duplication such as some@domain.com
But normal links should ( http://comanage.com )
"""
from __future__ import unicode_literals
import re
import sys

# The parser and entity table live in different modules on Python 2 and 3.
# Pick the right ones so the module imports on both.
if sys.version_info < (3, 0, 0):
    # Python 2: make ``str`` produce unicode text throughout this module.
    str = unicode  # noqa: F821 -- only evaluated on Python 2
    from HTMLParser import HTMLParser
    from htmlentitydefs import name2codepoint
else:
    from html.parser import HTMLParser
    from html.entities import name2codepoint
    # Python 3 has no ``unichr``; ``chr`` already returns text.
    unichr = chr
class HTMLElement(list):
    """A document node: a list of children carrying a tag and attributes.

    Children are nested ``HTMLElement`` instances or plain strings.
    ``str()`` renders the element as plain text through the formatter
    registered for its tag; ``repr()`` gives an HTML-like debugging dump.
    """

    # Maps a tag name to the callable that renders elements with that tag.
    # Defined once on the class, so a registration is visible to every
    # element (and to subclasses that do not override the attribute).
    formatters = {}

    @classmethod
    def formats(cls, tag):
        """Return a decorator that registers a function as formatter for *tag*.

        The formatter is called with the element and returns its text.  The
        decorated function is returned unchanged, so several registrations
        can be stacked on one function.
        """
        def decorator(func):
            cls.formatters[tag] = func
            return func
        return decorator

    def __init__(self, tag, attrs=None, contents=()):
        """Create an element.

        tag      -- tag name; a falsy value marks a node without a tag
        attrs    -- list of (name, value) pairs; a falsy value becomes []
        contents -- iterable of initial children
        """
        self.tag = tag
        self.attrs = attrs or []
        super(HTMLElement, self).__init__(contents)

    @staticmethod
    def format_plain(element):
        """Default formatter: the children's text concatenated as-is."""
        return ''.join(str(s) for s in element)

    def __repr__(self):
        """Return an HTML-like dump of the element for debugging."""
        if not self.tag:
            # Tag-less nodes normally hold only strings, but a nested
            # element must not break the join.
            return ' '.join(repr(s) if isinstance(s, HTMLElement) else s
                            for s in self)
        # Value-less attributes (value None) are shown by name only rather
        # than as name="None".
        attrs = ''.join(' {}'.format(k) if v is None
                        else ' {}="{}"'.format(k, v)
                        for k, v in self.attrs)
        if self:
            body = '\n'.join(repr(s) for s in self)
            return "<{tag}{attrs}>\n{body}\n</{tag}>".format(
                tag=self.tag, attrs=attrs, body=body)
        return '<{tag}{attrs}/>'.format(tag=self.tag, attrs=attrs)

    def __str__(self):
        """Render the element as text with the formatter for its tag.

        Tags without a registered formatter fall back to ``format_plain``.
        """
        formatter = self.formatters.get(self.tag, HTMLElement.format_plain)
        return formatter(self)
@HTMLElement.formats('p')
def p(e):
    """Render a paragraph: the children's text, trimmed, on its own line."""
    pieces = [str(child) for child in e]
    paragraph = ''.join(pieces).strip()
    return '\n' + paragraph + '\n'
@HTMLElement.formats('blockquote')
def blockquote(e):
    """Render a quotation by prefixing every line of its text with '> '.

    Nested quotes have already been prefixed by their own rendering, so
    each nesting level adds one more marker.
    """
    inner = ''.join([str(child) for child in e]).strip()
    quoted_lines = []
    for line in inner.split("\n"):
        quoted_lines.append('> ' + line)
    return '\n{}\n'.format('\n'.join(quoted_lines))
@HTMLElement.formats('a')
def a(e):
    """Render a link as its text followed by the target in parentheses.

    ``mailto:`` targets are shown without the scheme.  When the target
    already occurs in the link text, or the link has no target, only the
    trimmed text is returned so the address is not duplicated.
    """
    # html.parser reports a value-less attribute (``<a href>``) as None;
    # treat that like a missing href instead of failing on None.startswith.
    href = next((v for k, v in e.attrs if k == 'href'), None) or ''
    text = HTMLElement.format_plain(e)
    body = text.strip()
    # Require the colon: an href that merely begins with "mailto"
    # (e.g. "mailtools.html") is an ordinary link and has no scheme to cut.
    if href.startswith('mailto:'):
        href = href[len('mailto:'):]
    if href in body:
        return body
    return text + ' ( {} )'.format(href.strip())
@HTMLElement.formats('ol')
def ol(e):
    """Render an ordered list, one numbered line per child element.

    Only ``HTMLElement`` children are listed; bare strings are skipped.
    Numbering follows ``enumerate`` and therefore starts at 0.
    """
    items = [child for child in e if isinstance(child, HTMLElement)]
    lines = []
    for index, item in enumerate(items):
        lines.append('\n{}. {}'.format(index, item))
    lines.append('\n')
    return ''.join(lines)
@HTMLElement.formats('ul')
def ul(e):
    """Render an unordered list, one '- ' bullet line per child element.

    Only ``HTMLElement`` children are listed; bare strings are skipped.
    """
    bullets = []
    for child in e:
        if not isinstance(child, HTMLElement):
            continue
        bullets.append('\n- {}'.format(child))
    bullets.append('\n')
    return ''.join(bullets)
@HTMLElement.formats('br')
def br(e):
    """Render a line break as a single newline; the element is not read."""
    line_break = "\n"
    return line_break
@HTMLElement.formats('script')
@HTMLElement.formats('style')
def null(e):
    """Render nothing, keeping script and style contents out of the text."""
    nothing = ""
    return nothing
@HTMLElement.formats('span')
@HTMLElement.formats(None)
def text(e):
    """Render inline text: children joined by spaces, whitespace collapsed.

    Every run of whitespace, newlines included, becomes a single space.
    Registered for ``span`` elements and for tag-less nodes (tag None).
    """
    joined = ' '.join(str(s) for s in e)
    # Raw string: '\s' in a normal literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.sub(r'\s+', ' ', joined)
class HTMLToTextParser(HTMLParser):
    """HTMLParser that builds a tree of elements renderable as plain text.

    Start tags open a new element under the innermost open one, end tags
    close elements again, and text and character references become
    children of the innermost open element.  ``get_text`` renders the tree.
    """

    # Void elements never have content or an end tag.  They are attached
    # as leaves; pushing them on the stack would swallow every following
    # sibling, since nothing would ever close them.
    VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self, element_class=HTMLElement):
        """Create a parser whose nodes are instances of *element_class*.

        *element_class* is called as ``element_class(tag, attrs)`` or
        ``element_class(tag, attrs, contents)``.
        """
        HTMLParser.__init__(self)
        self.element_class = element_class
        self.root = self.element_class("doc", [])
        # Open elements, outermost first; the root always stays at index 0.
        self.stack = [self.root]

    @property
    def top(self):
        """The innermost element that is currently open."""
        return self.stack[-1]

    @staticmethod
    def _char(codepoint):
        """Return the character for *codepoint* on Python 2 and Python 3."""
        try:
            return unichr(codepoint)  # Python 2
        except NameError:
            return chr(codepoint)  # Python 3 has no unichr

    def handle_starttag(self, tag, attrs):
        """Open a new element, or attach a leaf for a void element."""
        # Covers the unclosed "<br>" form as well as "<img>", "<hr>", ...
        if tag in self.VOID_TAGS:
            return self.handle_startendtag(tag, attrs)
        new_elem = self.element_class(tag, attrs)
        self.top.append(new_elem)
        self.stack.append(new_elem)

    def handle_startendtag(self, tag, attrs):
        """Attach a self-closing element as a leaf of the open element."""
        self.top.append(self.element_class(tag, attrs))

    def handle_endtag(self, tag):
        """Close the innermost open element named *tag*.

        Elements left unclosed inside it are closed with it.  An end tag
        that matches no open element is ignored, so a stray end tag can
        never pop the root off the stack.
        """
        # Search from the innermost element outwards, excluding the root.
        for depth in range(len(self.stack) - 1, 0, -1):
            if getattr(self.stack[depth], 'tag', None) == tag:
                del self.stack[depth:]
                return

    def handle_data(self, data):
        """Attach non-blank text as a tag-less node; drop blank text."""
        if data.strip():
            self.top.append(self.element_class(None, [], [data]))

    def handle_entityref(self, name):
        """Attach the character for a named entity such as ``&amp;``.

        Unknown entity names are skipped.
        """
        if name in name2codepoint:
            self.top.append(self._char(name2codepoint[name]))

    def handle_charref(self, name):
        """Attach the character for a numeric reference (``&#65;``, ``&#x41;``).

        Malformed or out-of-range references are skipped instead of
        aborting the whole parse.
        """
        try:
            # The hexadecimal marker may be written in either case.
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            char = self._char(codepoint)
        except (ValueError, OverflowError):
            return
        self.top.append(char)

    def get_text(self):
        """Return the plain-text rendering of everything parsed so far."""
        return str(self.root).strip()
def html_to_text(html):
    """Convert an HTML document or fragment to formatted plain text.

    html -- the markup as a text string

    Returns the rendered text with leading and trailing whitespace removed.
    """
    parser = HTMLToTextParser(HTMLElement)
    parser.feed(html)
    # feed() may hold back input it cannot yet classify, such as text after
    # the last tag; close() forces that buffered data through the handlers.
    parser.close()
    return parser.get_text()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment