# abs51295/html_to_text.py

## html_to_text.py
from bs4 import BeautifulSoup, NavigableString, Tag
import urllib.request

# Tags whose text content is not displayed (list taken from the W3C specs).
# 'noscript' is included because we behave as if scripting is enabled.
_NON_DISPLAYED_TAGS = frozenset((
    'area', 'base', 'basefont', 'datalist', 'head', 'link',
    'meta', 'noembed', 'noframes', 'param', 'rp', 'script',
    'source', 'style', 'template', 'track', 'title', 'noscript',
))


def _is_hidden(element):
    """Return True if any ancestor tag of *element* is a non-displayed one."""
    for parent_tag in element.parents:
        # Exact type check on purpose: the BeautifulSoup document object is a
        # Tag subclass and must not be treated as an ordinary ancestor tag.
        if type(parent_tag) is not Tag:
            continue
        if (parent_tag.name in _NON_DISPLAYED_TAGS or
                parent_tag.has_attr('hidden') or
                (parent_tag.name == 'input' and parent_tag.get('type') == 'hidden')):
            return True
    return False


def html_to_text(html):
    """Create a formatted text email message from a rendered html template (page).

    Walks every plain text node below ``<body>`` (or below the document root
    when the markup has no ``<body>``), skips text inside non-displayed tags,
    collapses runs of whitespace, and replaces link text with the link's
    ``href``.

    Args:
        html: Markup accepted by ``BeautifulSoup`` (a string, bytes, or an
            open file-like object).

    Returns:
        The collected text pieces joined with newlines. Text inside a ``<p>``
        is preceded by an extra newline; a link and the text directly around
        it are kept on a single line.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Ignore anything in head. Fragments without a <body> fall back to the
    # whole document; head-only tags are still filtered out by _is_hidden().
    root = soup.body if soup.body is not None else soup
    text = []
    for element in root.descendants:
        # We use type and not isinstance since comments, cdata, etc are
        # subclasses that we don't want
        if type(element) is not NavigableString:
            continue
        # Ignore any text inside a non-displayed tag
        if _is_hidden(element):
            continue

        # remove any multiple and leading/trailing whitespace
        string = ' '.join(element.split())
        if not string:
            continue

        parent = element.parent
        previous = element.previous_sibling
        if parent.name == 'a':
            # replace link text with the link; anchors without a usable href
            # (e.g. <a name="...">) keep their visible text instead of raising
            string = parent.get('href') or string
            before_link = parent.previous_sibling
            # concatenate with any non-empty immediately previous string
            if (text and type(before_link) is NavigableString and
                    before_link.strip()):
                text[-1] = text[-1] + ' ' + string
                continue
        elif getattr(previous, 'name', None) == 'a':
            # Text right after a link continues the link's line. If nothing
            # has been collected yet there is no line to extend, so the text
            # simply starts a new one.
            if text:
                text[-1] = text[-1] + ' ' + string
                continue
        elif parent.name == 'p':
            # Add extra paragraph formatting newline
            string = '\n' + string
        text.append(string)
    return '\n'.join(text)

if __name__ == '__main__':
    # Demo: download a sample article and print its plain-text rendering.
    # The response is read inside a ``with`` block so the HTTP connection is
    # closed as soon as the markup has been fetched.
    with urllib.request.urlopen('http://www.nytimes.com/2009/12/21/us/21storm.html') as response:
        html = response.read()
    print(html_to_text(html))
	from bs4 import BeautifulSoup, NavigableString, Tag
	import urllib.request

	# Tags whose text content is not displayed (list taken from the W3C specs).
	# 'noscript' is included because we behave as if scripting is enabled.
	_NON_DISPLAYED_TAGS = frozenset((
	    'area', 'base', 'basefont', 'datalist', 'head', 'link',
	    'meta', 'noembed', 'noframes', 'param', 'rp', 'script',
	    'source', 'style', 'template', 'track', 'title', 'noscript',
	))


	def _is_hidden(element):
	    """Return True if any ancestor tag of *element* is a non-displayed one."""
	    for parent_tag in element.parents:
	        # Exact type check on purpose: the BeautifulSoup document object is a
	        # Tag subclass and must not be treated as an ordinary ancestor tag.
	        if type(parent_tag) is not Tag:
	            continue
	        if (parent_tag.name in _NON_DISPLAYED_TAGS or
	                parent_tag.has_attr('hidden') or
	                (parent_tag.name == 'input' and parent_tag.get('type') == 'hidden')):
	            return True
	    return False


	def html_to_text(html):
	    """Create a formatted text email message from a rendered html template (page).

	    Walks every plain text node below ``<body>`` (or below the document root
	    when the markup has no ``<body>``), skips text inside non-displayed tags,
	    collapses runs of whitespace, and replaces link text with the link's
	    ``href``.

	    Args:
	        html: Markup accepted by ``BeautifulSoup`` (a string, bytes, or an
	            open file-like object).

	    Returns:
	        The collected text pieces joined with newlines. Text inside a ``<p>``
	        is preceded by an extra newline; a link and the text directly around
	        it are kept on a single line.
	    """
	    soup = BeautifulSoup(html, 'html.parser')
	    # Ignore anything in head. Fragments without a <body> fall back to the
	    # whole document; head-only tags are still filtered out by _is_hidden().
	    root = soup.body if soup.body is not None else soup
	    text = []
	    for element in root.descendants:
	        # We use type and not isinstance since comments, cdata, etc are
	        # subclasses that we don't want
	        if type(element) is not NavigableString:
	            continue
	        # Ignore any text inside a non-displayed tag
	        if _is_hidden(element):
	            continue

	        # remove any multiple and leading/trailing whitespace
	        string = ' '.join(element.split())
	        if not string:
	            continue

	        parent = element.parent
	        previous = element.previous_sibling
	        if parent.name == 'a':
	            # replace link text with the link; anchors without a usable href
	            # (e.g. <a name="...">) keep their visible text instead of raising
	            string = parent.get('href') or string
	            before_link = parent.previous_sibling
	            # concatenate with any non-empty immediately previous string
	            if (text and type(before_link) is NavigableString and
	                    before_link.strip()):
	                text[-1] = text[-1] + ' ' + string
	                continue
	        elif getattr(previous, 'name', None) == 'a':
	            # Text right after a link continues the link's line. If nothing
	            # has been collected yet there is no line to extend, so the text
	            # simply starts a new one.
	            if text:
	                text[-1] = text[-1] + ' ' + string
	                continue
	        elif parent.name == 'p':
	            # Add extra paragraph formatting newline
	            string = '\n' + string
	        text.append(string)
	    return '\n'.join(text)

	if __name__ == '__main__':
	    # Demo: download a sample article and print its plain-text rendering.
	    # The response is read inside a ``with`` block so the HTTP connection is
	    # closed as soon as the markup has been fetched.
	    with urllib.request.urlopen('http://www.nytimes.com/2009/12/21/us/21storm.html') as response:
	        html = response.read()
	    print(html_to_text(html))