Skip to content

Instantly share code, notes, and snippets.

@billthornton
Created April 17, 2012 07:24
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save billthornton/2404165 to your computer and use it in GitHub Desktop.
Save billthornton/2404165 to your computer and use it in GitHub Desktop.
Extracting article contents
# Requires: requests and readability-lxml (https://pypi.org/project/readability-lxml/)
# Install with: pip install requests readability-lxml
import requests
def _summary_to_text(summary_html):
    """Convert the summary HTML of an article into plain text.

    Paragraph (``<p>``) contents are joined with blank lines. When that
    yields no text, ``<br>`` tags are treated as paragraph breaks and all
    remaining markup is replaced with spaces. Runs of blank lines and long
    runs of horizontal whitespace are then collapsed.
    """
    # Imported here because this file has no module-level ``import re``.
    import re

    # Fairly naive: pull out the contents of paragraph tags. The pattern
    # accepts attributes (<p class="...">) and, via DOTALL, paragraphs that
    # span several lines.
    paragraphs_html = re.findall(
        r'<p(?:\s[^>]*)?>(.*?)</p>', summary_html, re.DOTALL)
    # ``[^>]*`` (unlike ``.*?``) also removes a tag broken across lines.
    paragraphs = [re.sub(r'<[^>]*>', '', para) for para in paragraphs_html]
    summary_text = '\n\n'.join(paragraphs)

    # Hack for sites without p tags (or with only empty ones)
    if not summary_text.strip():
        # Replace line breaks with double new lines (paragraphs)
        summary_text = re.sub(r'<br[^>]*>', '\n\n', summary_html)
        # Remove all html
        summary_text = re.sub(r'<[^>]*>', ' ', summary_text)

    # Remove whitespace: 3+ newlines become one blank line, and runs of 3+
    # horizontal whitespace characters become a single space (runs of two
    # are deliberately left as they are, as in the original rule).
    summary_text = re.sub(r'\n{3,}', '\n\n', summary_text)
    summary_text = re.sub(r'[ \t\r\f\v]{3,}', ' ', summary_text)
    return summary_text.strip()


def extract_article_contents(url, timeout=30):
    """
    Fetch the main body of content

    Downloads ``url`` with requests and runs it through readability's
    ``Document`` to isolate the article.

    url     -- address of the page to fetch
    timeout -- seconds passed to ``requests.get`` so a stalled server
               cannot block forever (None waits indefinitely)

    Returns the html, a plaintext version, the title and subtitle
    (in that order, as a tuple). The subtitle is whatever is left of the
    full title once the short title and separator characters are removed.

    Errors raised by requests or readability are not handled here and
    propagate to the caller.
    """
    from readability.readability import Document

    # TODO: Needs lots of exception handling
    html = requests.get(url, timeout=timeout).text
    doc = Document(html)
    summary_html = doc.summary()
    summary_text = _summary_to_text(summary_html)

    title = doc.short_title()
    # Strip the short title out of the full title; the strip() drops the
    # separator characters (" | - : /") left dangling at either end.
    subtitle = doc.title().replace(title, '').strip(' |-:/')
    # TODO: Return a named tuple
    return summary_html, summary_text, title, subtitle
# Extract the article text using readability.
# Example usage, run only when the file is executed as a script so that
# importing the module does not trigger a network request. The article URL
# is taken from the first command-line argument.
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        sys.exit("usage: python SCRIPT URL")
    url = sys.argv[1]
    html, text, title, subtitle = extract_article_contents(url)
    print(title)
    print(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment