public
Created

Extracting article contents

  • Download Gist
gistfile1.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
# Requires: requests and readability-lxml (http://pypi.python.org/pypi/readability-lxml) (pip install requests readability-lxml)
 
import re

import requests
 
def _summary_html_to_text(summary_html):
    """Convert a readability summary HTML fragment into plain text.

    Paragraph (``<p>``) contents are extracted, stripped of nested tags and
    joined with blank lines. If that yields nothing, the whole fragment is
    flattened instead, treating ``<br>`` tags as paragraph breaks.
    """
    # Fairly naive: pull out the contents of paragraph tags.
    # `\b[^>]*` accepts attributes (<p class="...">) without matching <pre>;
    # DOTALL lets a paragraph span several lines.
    paragraphs_html = re.findall(
        r'<p\b[^>]*>(.*?)</p>', summary_html, re.DOTALL | re.IGNORECASE)
    # `[^>]*` (unlike `.*?`) also strips tags that wrap across lines.
    paragraphs = [re.sub(r'<[^>]*>', '', para) for para in paragraphs_html]
    summary_text = '\n\n'.join(paragraphs)
    if summary_text:
        return summary_text

    # Hack for sites without p tags.
    # Replace line breaks with double new lines (paragraphs).
    summary_text = re.sub(
        r'<br\b[^>]*>', '\n\n', summary_html, flags=re.IGNORECASE)
    # Remove all remaining html; a space keeps adjacent words apart.
    summary_text = re.sub(r'<[^>]*>', ' ', summary_text)
    # Collapse runs of three or more newlines into one paragraph break, and
    # runs of three or more horizontal whitespace characters into one space.
    summary_text = re.sub(r'\n{3,}', '\n\n', summary_text)
    summary_text = re.sub(r'[ \t\r\f\v]{3,}', ' ', summary_text)
    return summary_text.strip()


def extract_article_contents(url, timeout=None):
    """
    Fetch the main body of content.

    Downloads ``url`` with ``requests`` and runs the response text through
    readability's ``Document`` to isolate the article.

    Args:
        url: Address of the page to fetch.
        timeout: Passed straight to ``requests.get``. The default ``None``
            keeps the previous behaviour (wait indefinitely); pass a number
            of seconds to bound the request.

    Returns:
        A 4-tuple ``(summary_html, summary_text, title, subtitle)``:
        the readability summary HTML, a plaintext version of it, the
        document's short title, and whatever remains of the full title once
        the short title is removed and surrounding `` |-:/`` characters are
        stripped.

    Raises:
        Nothing is caught here; errors from ``requests`` or readability
        propagate to the caller.
    """
    from readability.readability import Document

    # TODO: Needs lots of exception handling
    html = requests.get(url, timeout=timeout).text
    doc = Document(html)
    summary_html = doc.summary()
    summary_text = _summary_html_to_text(summary_html)

    title = doc.short_title()
    subtitle = doc.title().replace(title, '').strip(' |-:/')

    # TODO: Return a named tuple
    return summary_html, summary_text, title, subtitle
 
# Example usage: extract the article text using readability and unpack the
# four returned values.
# NOTE(review): `url` is not defined anywhere in the visible file; it must be
# assigned before this line runs, otherwise this raises NameError.
html, text, title, subtitle = extract_article_contents(url)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.