kageurufu/html_to_text.py

## html_to_text.py
"""
html_to_text

Parses and converts HTML to plain text with formatting to match the original intention

For example, the following HTML would be converted as follows

<html>
<head>
  <style>
    /* This should NOT be in the output */
  </style>
<body>
<p>
  This is not a quote.

  But it is a long paragraph with no line breaks.
</p>
<blockquote>
  <p>This is a quote</p>

  <p>As well as this</p>

  <blockquote>
    <p>Quoteception</p>
    <blockquote>
      <p>Quoteception</p>
      <blockquote>
        <p>Quoteception</p>
      </blockquote>
    </blockquote>
  </blockquote>
</blockquote>
<ol>
  <li>Test</li>
  <li>Test</li>
  <li>Test</li>
</ol>
<ul>
  <li>Test</li>
  <li>Test</li>
  <li>Test</li>
</ul>
<p>
  <a href="mailto:some@domain.com">Mailto links should be included</a><br>
  <a href="mailto:some@domain.com">except where it would cause duplication such as some@domain.com</a><br/>
  <a href="http://comanage.com">But normal links should</a>
</p>
</body>
</html>


>>> print(html_to_text(__doc__[__doc__.find('<html>'):__doc__.find('</html>') + 7]))  # doctest: +NORMALIZE_WHITESPACE
This is not a quote. But it is a long paragraph with no line breaks.
<BLANKLINE>
> This is a quote
>
> As well as this
>
> > Quoteception
> >
> > > Quoteception
> > >
> > > > Quoteception
<BLANKLINE>
0. Test
1. Test
2. Test
<BLANKLINE>
- Test
- Test
- Test
<BLANKLINE>
Mailto links should be included ( some@domain.com )
except where it would cause duplication such as some@domain.com
But normal links should ( http://comanage.com )
"""

from __future__ import unicode_literals

import re
import sys

# Python 2/3 compatibility.  The parser and entity table moved to the ``html``
# package in Python 3, and ``unicode``/``unichr`` no longer exist there, so
# every version-specific name is selected in one place.
if sys.version_info < (3, 0, 0):
    str = unicode
    from HTMLParser import HTMLParser
    from htmlentitydefs import name2codepoint
else:
    from html.parser import HTMLParser
    from html.entities import name2codepoint

    # Python 3's chr() already covers the whole Unicode range
    unichr = chr


class HTMLElement(list):
    """A parsed HTML node: a list of children carrying a tag name and attributes.

    Children are nested ``HTMLElement`` instances or plain strings.  A text
    run is stored as an element whose ``tag`` is ``None``.  ``str(element)``
    renders the node as plain text using the formatter registered for its
    tag, falling back to ``format_plain`` when none is registered.
    """

    # Class-level registry: tag name -> callable(element) returning text.
    formatters = {}

    @classmethod
    def formats(cls, tag):
        """Return a decorator registering a function as the formatter for ``tag``.

        The function is returned unchanged, so the decorator can be stacked
        to register one function for several tags.
        """
        def decorator(func):
            cls.formatters[tag] = func
            return func

        return decorator

    def __init__(self, tag, attrs=None, contents=()):
        """Create an element.

        :param tag: tag name, or ``None`` for a text node
        :param attrs: list of ``(name, value)`` pairs; defaults to an empty list
        :param contents: iterable of initial children
        """
        self.tag = tag
        self.attrs = attrs or []
        super(HTMLElement, self).__init__(contents)

    @staticmethod
    def format_plain(element):
        """Default formatter: concatenate the text of every child."""
        return ''.join(str(s) for s in element)

    def __repr__(self):
        """Return an HTML-like debugging representation of the subtree."""
        if not self.tag:
            # Text node: show the raw strings it holds
            return ' '.join(self)

        def render_attr(name, value):
            # Valueless attributes (``<input disabled>``) carry None as their
            # value; show them bare instead of as ``disabled="None"``.
            if value is None:
                return name
            return '{}="{}"'.format(name, value)

        args = dict(tag=self.tag,
                    attrs=''.join(' ' + render_attr(k, v)
                                  for k, v in self.attrs),
                    body='\n'.join(repr(s) for s in self))
        if self:
            return "<{tag}{attrs}>\n{body}\n</{tag}>".format(**args)
        return '<{tag}{attrs}/>'.format(**args)

    def __str__(self):
        # Dispatch on the tag; unknown tags are rendered as their plain content
        return self.formatters.get(self.tag, HTMLElement.format_plain)(self)


@HTMLElement.formats('p')
def p(e):
    """Render a paragraph: its children's text, trimmed, between two newlines."""
    inner = ''.join([str(child) for child in e])
    return '\n' + inner.strip() + '\n'


@HTMLElement.formats('blockquote')
def blockquote(e):
    """Render a block quote by prefixing every line of its content with ``> ``.

    Nested quotes are rendered first, so each nesting level adds one prefix.
    """
    content = ''.join(str(child) for child in e).strip()
    quoted_lines = ['> ' + line for line in content.split("\n")]
    return '\n' + '\n'.join(quoted_lines) + '\n'


@HTMLElement.formats('a')
def a(e):
    """Render a link as its text followed by the target in parentheses.

    ``mailto:`` targets are shown as the bare address.  When the target is
    empty or already appears in the link text, only the trimmed text is
    returned so the target is not repeated.
    """
    # A valueless ``href`` attribute carries None; treat it like a missing one
    href = next((v for k, v in e.attrs if k == 'href'), '') or ''
    plain = HTMLElement.format_plain(e)
    body = plain.strip()
    # Match the scheme including its colon: an ordinary URL that merely begins
    # with "mailto" (e.g. "mailto-form.html") has no colon to split on and
    # previously raised IndexError.
    if href.startswith('mailto:'):
        href = href.split(":", 1)[1]

    if href in body:
        return body
    return plain + ' ( {} )'.format(href.strip())


@HTMLElement.formats('ol')
def ol(e):
    """Render an ordered list, one numbered line per child element.

    Numbering starts at 0.  Children that are not elements (bare strings
    directly inside the list) are skipped and do not consume a number.
    """
    items = [child for child in e if isinstance(child, HTMLElement)]
    lines = ['\n{}. {}'.format(index, item) for index, item in enumerate(items)]
    return ''.join(lines) + '\n'


@HTMLElement.formats('ul')
def ul(e):
    """Render an unordered list with one ``- `` bullet line per child element."""
    bullets = []
    for child in e:
        # Only element children are items; bare strings inside the list are skipped
        if isinstance(child, HTMLElement):
            bullets.append('\n- {}'.format(child))
    return ''.join(bullets) + '\n'


@HTMLElement.formats('br')
def br(e):
    """Render a line break as a single newline; the element's content is ignored."""
    line_break = '\n'
    return line_break


@HTMLElement.formats('script')
@HTMLElement.formats('style')
def null(e):
    """Suppress the element: ``script`` and ``style`` content never reaches the output."""
    nothing = ''
    return nothing


@HTMLElement.formats('span')
@HTMLElement.formats(None)
def text(e):
    """Render inline text: join children with spaces and collapse whitespace runs.

    Registered for ``span`` and for text nodes (tag ``None``).
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence, which
    # modern Python warns about and intends to reject.
    return re.sub(r'\s+', ' ', ' '.join(str(s) for s in e))


class HTMLToTextParser(HTMLParser):
    """Build a tree of ``element_class`` nodes from HTML and render it as text.

    Feed markup with ``feed()`` and read the result with ``get_text()``.
    ``stack`` holds the chain of currently open elements, ``root`` first.
    """

    # HTML void elements: they never have content or a closing tag, so they
    # are appended as leaves and never opened on the stack.
    VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self, element_class=HTMLElement):
        """
        :param element_class: class used to create every node of the tree
        """
        HTMLParser.__init__(self)
        self.element_class = element_class
        self.root = self.element_class("doc", [])
        self.stack = [self.root]

    @property
    def top(self):
        """The innermost element that is still open."""
        return self.stack[-1]

    @staticmethod
    def _to_char(codepoint):
        """Return the character for ``codepoint`` on both Python 2 and 3."""
        try:
            return unichr(codepoint)
        except NameError:  # Python 3 has no unichr; chr covers all of Unicode
            return chr(codepoint)

    def handle_starttag(self, tag, attrs):
        """Open a new element, or add a leaf when the tag is a void element."""
        # Void elements written without a trailing slash (``<br>``, ``<img>``)
        if tag in self.VOID_TAGS:
            return self.handle_startendtag(tag, attrs)

        new_elem = self.element_class(tag, attrs)
        self.top.append(new_elem)
        self.stack.append(new_elem)

    def handle_startendtag(self, tag, attrs):
        """Add a self-closing element as a leaf of the current element."""
        self.top.append(self.element_class(tag, attrs))

    def handle_endtag(self, tag):
        """Close the innermost open element named ``tag``.

        Elements left open inside it are closed along with it.  An end tag
        with no matching open element is ignored, so the root can never be
        popped off the stack by stray or unbalanced markup.
        """
        for depth in range(len(self.stack) - 1, 0, -1):
            if self.stack[depth].tag == tag:
                del self.stack[depth:]
                return

    def handle_data(self, data):
        """Add a text node for ``data``; whitespace-only runs are dropped."""
        if data.strip():
            self.top.append(self.element_class(None, [], [data]))

    def handle_entityref(self, name):
        """Add the character for a named entity; unknown names are dropped."""
        if name in name2codepoint:
            self.top.append(self._to_char(name2codepoint[name]))

    def handle_charref(self, name):
        """Add the character for a numeric reference (``&#233;`` / ``&#xE9;``)."""
        try:
            # The hexadecimal marker may be written as either 'x' or 'X'
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            char = self._to_char(codepoint)
        except (ValueError, OverflowError):
            # Malformed or out-of-range reference: skip it rather than abort
            return
        self.top.append(char)

    def get_text(self):
        """Return the text rendering of everything fed so far, stripped."""
        return str(self.root).strip()


def html_to_text(html):
    """Convert an HTML document or fragment to formatted plain text.

    :param html: markup to convert, as a string
    :return: the rendered text with leading and trailing whitespace removed
    """
    parser = HTMLToTextParser(HTMLElement)
    parser.feed(html)
    # Flush what the parser is still buffering (for example trailing text
    # ending in an unterminated "&" or "<"); feed() alone holds it back in
    # case more input follows.
    parser.close()
    return parser.get_text()
	"""
	html_to_text

	Parses and converts HTML to plain text with formatting to match the original intention

	For example, the following HTML would be converted as follows

	<html>
	<head>
	<style>
	/* This should NOT be in the output */
	</style>
	<body>
	<p>
	This is not a quote.

	But it is a long paragraph with no line breaks.
	</p>
	<blockquote>
	<p>This is a quote</p>

	<p>As well as this</p>

	<blockquote>
	<p>Quoteception</p>
	<blockquote>
	<p>Quoteception</p>
	<blockquote>
	<p>Quoteception</p>
	</blockquote>
	</blockquote>
	</blockquote>
	</blockquote>
	<ol>
	<li>Test</li>
	<li>Test</li>
	<li>Test</li>
	</ol>
	<ul>
	<li>Test</li>
	<li>Test</li>
	<li>Test</li>
	</ul>
	<p>
	<a href="mailto:some@domain.com">Mailto links should be included</a><br>
	<a href="mailto:some@domain.com">except where it would cause duplication such as some@domain.com</a><br/>
	<a href="http://comanage.com">But normal links should</a>
	</p>
	</body>
	</html>


	>>> print(html_to_text(__doc__[__doc__.find('<html>'):__doc__.find('</html>') + 7])) # doctest: +NORMALIZE_WHITESPACE
	This is not a quote. But it is a long paragraph with no line breaks.
	<BLANKLINE>
	> This is a quote
	>
	> As well as this
	>
	> > Quoteception
	> >
	> > > Quoteception
	> > >
	> > > > Quoteception
	<BLANKLINE>
	0. Test
	1. Test
	2. Test
	<BLANKLINE>
	- Test
	- Test
	- Test
	<BLANKLINE>
	Mailto links should be included ( some@domain.com )
	except where it would cause duplication such as some@domain.com
	But normal links should ( http://comanage.com )
	"""

	from __future__ import unicode_literals

	import re
	import sys

	# Python 2/3 compatibility.  The parser and entity table moved to the ``html``
	# package in Python 3, and ``unicode``/``unichr`` no longer exist there, so
	# every version-specific name is selected in one place.
	if sys.version_info < (3, 0, 0):
	    str = unicode
	    from HTMLParser import HTMLParser
	    from htmlentitydefs import name2codepoint
	else:
	    from html.parser import HTMLParser
	    from html.entities import name2codepoint

	    # Python 3's chr() already covers the whole Unicode range
	    unichr = chr


	class HTMLElement(list):
	    """A parsed HTML node: a list of children carrying a tag name and attributes.

	    Children are nested ``HTMLElement`` instances or plain strings.  A text
	    run is stored as an element whose ``tag`` is ``None``.  ``str(element)``
	    renders the node as plain text using the formatter registered for its
	    tag, falling back to ``format_plain`` when none is registered.
	    """

	    # Class-level registry: tag name -> callable(element) returning text.
	    formatters = {}

	    @classmethod
	    def formats(cls, tag):
	        """Return a decorator registering a function as the formatter for ``tag``.

	        The function is returned unchanged, so the decorator can be stacked
	        to register one function for several tags.
	        """
	        def decorator(func):
	            cls.formatters[tag] = func
	            return func

	        return decorator

	    def __init__(self, tag, attrs=None, contents=()):
	        """Create an element.

	        :param tag: tag name, or ``None`` for a text node
	        :param attrs: list of ``(name, value)`` pairs; defaults to an empty list
	        :param contents: iterable of initial children
	        """
	        self.tag = tag
	        self.attrs = attrs or []
	        super(HTMLElement, self).__init__(contents)

	    @staticmethod
	    def format_plain(element):
	        """Default formatter: concatenate the text of every child."""
	        return ''.join(str(s) for s in element)

	    def __repr__(self):
	        """Return an HTML-like debugging representation of the subtree."""
	        if not self.tag:
	            # Text node: show the raw strings it holds
	            return ' '.join(self)

	        def render_attr(name, value):
	            # Valueless attributes (``<input disabled>``) carry None as their
	            # value; show them bare instead of as ``disabled="None"``.
	            if value is None:
	                return name
	            return '{}="{}"'.format(name, value)

	        args = dict(tag=self.tag,
	                    attrs=''.join(' ' + render_attr(k, v)
	                                  for k, v in self.attrs),
	                    body='\n'.join(repr(s) for s in self))
	        if self:
	            return "<{tag}{attrs}>\n{body}\n</{tag}>".format(**args)
	        return '<{tag}{attrs}/>'.format(**args)

	    def __str__(self):
	        # Dispatch on the tag; unknown tags are rendered as their plain content
	        return self.formatters.get(self.tag, HTMLElement.format_plain)(self)


	@HTMLElement.formats('p')
	def p(e):
	    """Render a paragraph: its children's text, trimmed, between two newlines."""
	    inner = ''.join([str(child) for child in e])
	    return '\n' + inner.strip() + '\n'


	@HTMLElement.formats('blockquote')
	def blockquote(e):
	    """Render a block quote by prefixing every line of its content with ``> ``.

	    Nested quotes are rendered first, so each nesting level adds one prefix.
	    """
	    content = ''.join(str(child) for child in e).strip()
	    quoted_lines = ['> ' + line for line in content.split("\n")]
	    return '\n' + '\n'.join(quoted_lines) + '\n'


	@HTMLElement.formats('a')
	def a(e):
	    """Render a link as its text followed by the target in parentheses.

	    ``mailto:`` targets are shown as the bare address.  When the target is
	    empty or already appears in the link text, only the trimmed text is
	    returned so the target is not repeated.
	    """
	    # A valueless ``href`` attribute carries None; treat it like a missing one
	    href = next((v for k, v in e.attrs if k == 'href'), '') or ''
	    plain = HTMLElement.format_plain(e)
	    body = plain.strip()
	    # Match the scheme including its colon: an ordinary URL that merely begins
	    # with "mailto" (e.g. "mailto-form.html") has no colon to split on and
	    # would otherwise raise IndexError.
	    if href.startswith('mailto:'):
	        href = href.split(":", 1)[1]

	    if href in body:
	        return body
	    return plain + ' ( {} )'.format(href.strip())


	@HTMLElement.formats('ol')
	def ol(e):
	    """Render an ordered list, one numbered line per child element.

	    Numbering starts at 0.  Children that are not elements (bare strings
	    directly inside the list) are skipped and do not consume a number.
	    """
	    items = [child for child in e if isinstance(child, HTMLElement)]
	    lines = ['\n{}. {}'.format(index, item) for index, item in enumerate(items)]
	    return ''.join(lines) + '\n'


	@HTMLElement.formats('ul')
	def ul(e):
	    """Render an unordered list with one ``- `` bullet line per child element."""
	    bullets = []
	    for child in e:
	        # Only element children are items; bare strings inside the list are skipped
	        if isinstance(child, HTMLElement):
	            bullets.append('\n- {}'.format(child))
	    return ''.join(bullets) + '\n'


	@HTMLElement.formats('br')
	def br(e):
	    """Render a line break as a single newline; the element's content is ignored."""
	    return '\n'


	@HTMLElement.formats('script')
	@HTMLElement.formats('style')
	def null(e):
	    """Suppress the element: ``script`` and ``style`` content never reaches the output."""
	    return ''


	@HTMLElement.formats('span')
	@HTMLElement.formats(None)
	def text(e):
	    """Render inline text: join children with spaces and collapse whitespace runs.

	    Registered for ``span`` and for text nodes (tag ``None``).
	    """
	    # Raw string: '\s' in a plain literal is an invalid escape sequence, which
	    # modern Python warns about and intends to reject.
	    return re.sub(r'\s+', ' ', ' '.join(str(s) for s in e))


	class HTMLToTextParser(HTMLParser):
	    """Build a tree of ``element_class`` nodes from HTML and render it as text.

	    Feed markup with ``feed()`` and read the result with ``get_text()``.
	    ``stack`` holds the chain of currently open elements, ``root`` first.
	    """

	    # HTML void elements: they never have content or a closing tag, so they
	    # are appended as leaves and never opened on the stack.
	    VOID_TAGS = frozenset((
	        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
	        'link', 'meta', 'param', 'source', 'track', 'wbr',
	    ))

	    def __init__(self, element_class=HTMLElement):
	        """
	        :param element_class: class used to create every node of the tree
	        """
	        HTMLParser.__init__(self)
	        self.element_class = element_class
	        self.root = self.element_class("doc", [])
	        self.stack = [self.root]

	    @property
	    def top(self):
	        """The innermost element that is still open."""
	        return self.stack[-1]

	    @staticmethod
	    def _to_char(codepoint):
	        """Return the character for ``codepoint`` on both Python 2 and 3."""
	        try:
	            return unichr(codepoint)
	        except NameError:  # Python 3 has no unichr; chr covers all of Unicode
	            return chr(codepoint)

	    def handle_starttag(self, tag, attrs):
	        """Open a new element, or add a leaf when the tag is a void element."""
	        # Void elements written without a trailing slash (``<br>``, ``<img>``)
	        if tag in self.VOID_TAGS:
	            return self.handle_startendtag(tag, attrs)

	        new_elem = self.element_class(tag, attrs)
	        self.top.append(new_elem)
	        self.stack.append(new_elem)

	    def handle_startendtag(self, tag, attrs):
	        """Add a self-closing element as a leaf of the current element."""
	        self.top.append(self.element_class(tag, attrs))

	    def handle_endtag(self, tag):
	        """Close the innermost open element named ``tag``.

	        Elements left open inside it are closed along with it.  An end tag
	        with no matching open element is ignored, so the root can never be
	        popped off the stack by stray or unbalanced markup.
	        """
	        for depth in range(len(self.stack) - 1, 0, -1):
	            if self.stack[depth].tag == tag:
	                del self.stack[depth:]
	                return

	    def handle_data(self, data):
	        """Add a text node for ``data``; whitespace-only runs are dropped."""
	        if data.strip():
	            self.top.append(self.element_class(None, [], [data]))

	    def handle_entityref(self, name):
	        """Add the character for a named entity; unknown names are dropped."""
	        if name in name2codepoint:
	            self.top.append(self._to_char(name2codepoint[name]))

	    def handle_charref(self, name):
	        """Add the character for a numeric reference (``&#233;`` / ``&#xE9;``)."""
	        try:
	            # The hexadecimal marker may be written as either 'x' or 'X'
	            if name[:1] in ('x', 'X'):
	                codepoint = int(name[1:], 16)
	            else:
	                codepoint = int(name)
	            char = self._to_char(codepoint)
	        except (ValueError, OverflowError):
	            # Malformed or out-of-range reference: skip it rather than abort
	            return
	        self.top.append(char)

	    def get_text(self):
	        """Return the text rendering of everything fed so far, stripped."""
	        return str(self.root).strip()


	def html_to_text(html):
	    """Convert an HTML document or fragment to formatted plain text.

	    :param html: markup to convert, as a string
	    :return: the rendered text with leading and trailing whitespace removed
	    """
	    parser = HTMLToTextParser(HTMLElement)
	    parser.feed(html)
	    # Flush what the parser is still buffering (for example trailing text
	    # ending in an unterminated "&" or "<"); feed() alone holds it back in
	    # case more input follows.
	    parser.close()
	    return parser.get_text()