Skip to content

Instantly share code, notes, and snippets.

@abs51295
Forked from racitup/html_to_text.py
Last active February 6, 2018 06:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abs51295/54b06814371df32c8a66f9ba80e41f21 to your computer and use it in GitHub Desktop.
Save abs51295/54b06814371df32c8a66f9ba80e41f21 to your computer and use it in GitHub Desktop.
Extract text from HTML in Python using BeautifulSoup4
from bs4 import BeautifulSoup, NavigableString, Tag
import urllib.request
# Tags whose content is never rendered (list taken from the W3C specs).
# 'noscript' is included because we behave as if scripting is enabled.
_NON_DISPLAYED_TAGS = frozenset((
    'area', 'base', 'basefont', 'datalist', 'head', 'link',
    'meta', 'noembed', 'noframes', 'param', 'rp', 'script',
    'source', 'style', 'template', 'track', 'title', 'noscript',
))


def _is_hidden(element):
    """Return True if any ancestor tag of *element* is not displayed."""
    for parent_tag in element.parents:
        # Exact type check: skip the BeautifulSoup document object, which is
        # a Tag subclass, and look at real tags only.
        if type(parent_tag) is not Tag:
            continue
        if (parent_tag.name in _NON_DISPLAYED_TAGS or
                parent_tag.has_attr('hidden') or
                (parent_tag.name == 'input' and
                 parent_tag.get('type') == 'hidden')):
            return True
    return False


def html_to_text(html):
    """Create a formatted plain-text message from a rendered HTML page.

    Args:
        html: Markup to convert; it is handed unchanged to ``BeautifulSoup``
            with the ``'html.parser'`` backend.

    Returns:
        A string with one line per visible text node. Link text is replaced
        by the link's ``href`` and joined onto the surrounding text, and text
        directly inside a ``<p>`` is preceded by an extra blank line.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Ignore anything in head. html.parser does not synthesise a <body>, so
    # for fragments fall back to the whole document (head-only tags are still
    # filtered out by _is_hidden) instead of crashing on ``None``.
    root = soup.body if soup.body is not None else soup
    text = []
    for element in root.descendants:
        # We use type and not isinstance since comments, cdata, etc are
        # subclasses that we don't want
        if type(element) is not NavigableString:
            continue
        # Ignore any text inside a non-displayed tag
        if _is_hidden(element):
            continue
        # remove any multiple and leading/trailing whitespace
        string = ' '.join(element.string.split())
        if not string:
            continue
        parent = element.parent
        if parent.name == 'a':
            # replace link text with the link; anchors without an href
            # (e.g. <a name="top">) keep their visible text
            href = parent.get('href')
            if href is not None:
                string = href
            # concatenate with any non-empty immediately previous string
            previous = parent.previous_sibling
            if (type(previous) is NavigableString and
                    previous.string.strip() and text):
                text[-1] = text[-1] + ' ' + string
                continue
        elif getattr(element.previous_sibling, 'name', None) == 'a' and text:
            # text that directly follows a link continues the link's line;
            # the ``text`` guard covers links that produced no output
            text[-1] = text[-1] + ' ' + string
            continue
        elif parent.name == 'p':
            # Add extra paragraph formatting newline
            string = '\n' + string
        text.append(string)
    return '\n'.join(text)
if __name__ == '__main__':
    # Demo: fetch a page and print its text rendering. The response is used
    # as a context manager so the connection is closed even if parsing fails.
    with urllib.request.urlopen('http://www.nytimes.com/2009/12/21/us/21storm.html') as response:
        print(html_to_text(response.read()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment