-
-
Save abs51295/54b06814371df32c8a66f9ba80e41f21 to your computer and use it in GitHub Desktop.
Extract text from HTML in Python using BeautifulSoup4
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup, NavigableString, Tag | |
import urllib.request | |
def html_to_text(html):
    """Create a formatted plain-text message from a rendered HTML page.

    Walks every text node under ``<body>`` (or under the whole document when
    the markup has no ``<body>``), skips text that a browser would not
    display, collapses runs of whitespace, and replaces link text with the
    link target.

    Args:
        html: The markup to convert; passed straight to ``BeautifulSoup``,
            so either a string or an open file-like object works.

    Returns:
        The visible text as a single string, one fragment per line.
        Fragments directly inside a ``<p>`` are preceded by an extra blank
        line, and a link is kept on the same line as the text around it.
    """
    # Tags whose text content is never displayed (W3C specs). We behave as
    # if scripting is enabled, so <noscript> is ignored as well.
    hidden_tags = frozenset((
        'area', 'base', 'basefont', 'datalist', 'head', 'link',
        'meta', 'noembed', 'noframes', 'param', 'rp', 'script',
        'source', 'style', 'template', 'track', 'title', 'noscript',
    ))

    soup = BeautifulSoup(html, 'html.parser')
    # Ignore anything in head. html.parser does not invent a <body> for
    # fragments, so fall back to the whole document instead of crashing on
    # None; <head>/<title> text is still dropped by the hidden-tag check.
    root = soup.body if soup.body is not None else soup

    text = []
    for element in root.descendants:
        # Exact type check on purpose: Comment, CData, etc. are subclasses
        # of NavigableString that we do not want in the output.
        if type(element) is not NavigableString:
            continue

        # Ignore any text inside a non-displayed ancestor. The exact Tag
        # check skips the BeautifulSoup document object itself.
        hidden = any(
            tag.name in hidden_tags
            or tag.has_attr('hidden')
            or (tag.name == 'input' and tag.get('type') == 'hidden')
            for tag in element.parents
            if type(tag) is Tag
        )
        if hidden:
            continue

        # Remove any multiple and leading/trailing whitespace.
        string = ' '.join(element.split())
        if not string:
            continue

        parent = element.parent
        previous = element.previous_sibling
        if parent.name == 'a':
            # Replace link text with the link; anchors without an href
            # (e.g. <a name="...">) keep their visible text.
            string = parent.get('href', string)
            # Concatenate with any non-empty immediately previous string.
            before = parent.previous_sibling
            if text and type(before) is NavigableString and before.strip():
                text[-1] += ' ' + string
                continue
        elif isinstance(previous, Tag) and previous.name == 'a':
            # Text right after a link stays on the link's line. If the link
            # produced no output there is nothing to join onto, so the text
            # simply starts a new fragment.
            if text:
                text[-1] += ' ' + string
                continue
        elif parent.name == 'p':
            # Add extra paragraph formatting newline.
            string = '\n' + string
        text.append(string)

    return '\n'.join(text)
if __name__ == '__main__':
    # Demo: fetch a page and print its text. The response is file-like, so it
    # is handed to html_to_text directly; the with-block guarantees the
    # connection is closed even if parsing raises.
    with urllib.request.urlopen('http://www.nytimes.com/2009/12/21/us/21storm.html') as response:
        print(html_to_text(response))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment