# Captured from GitHub gist screeley/138642
# (author: @screeley, created July 1, 2009 06:52).
# At capture time the gist had 0 stars and 1 fork.
"""
A script for automatically extracting excerpts from articles. It requires lxml.
Usage::

    from lxml_extractor import get_summary
    url = "http://someurl.com/goes/here"
    (title, description) = get_summary(url)

Example and discussion:
http://www.screeley.com/entries/2009/jul/01/faster-python-script-extracting-excerpts-articles/
author: Sean Creeley (http://www.screeley.com)
Original License:
==========================================
Some examples, discussion, and comparison with the Facebook article extractor
are at http://blog.davidziegler.net/post/122176962/a-python-script-to-automatically-extract-excerpts-from
copyright: Copyright 2009 by David Ziegler
license: MIT License
website: http://github.com/dziegler/excerpt_extractor/tree/master
"""
from lxml.html import fromstring
from lxml.html.clean import Cleaner
import urllib2
import cookielib
def get_summary(url, max_length=255):
    """Fetch ``url`` and return a ``(title, description)`` tuple for the page.

    The title is the stripped text of ``/html/head/title``.  The description
    is the ``content`` attribute of the page's ``<meta name="description">``
    tag when that is non-empty; otherwise it falls back to the longest line
    of text left in the cleaned document, cut to ``max_length`` characters.

    :param url: address of the article to summarise.
    :param max_length: maximum length of a description taken from the page
        text.  A meta description is returned as-is.  Defaults to 255.
    :returns: ``(title, description)``.  ``title`` is ``None`` when the page
        has no ``<title>``.  ``(None, None)`` is returned when the request
        raises ``urllib2.URLError`` or the response body is blank.
    """
    # The opener keeps cookies between requests (presumably so that sites
    # which redirect through a cookie check can still be fetched).
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    try:
        response = opener.open(url)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
    except urllib2.URLError:
        return (None, None)

    # lxml's fromstring() raises a ParserError for an empty document, so a
    # blank body is reported the same way as a failed fetch.
    if not html.strip():
        return (None, None)

    doc = fromstring(html)

    # Remove irrelevant content from the document.  page_structure=False and
    # meta=False keep <html>/<head>/<title> and <meta> in place so the XPath
    # lookups below still work; remove_tags unwraps the heading elements.
    cleaner = Cleaner(page_structure=False,
                      meta=False,
                      safe_attrs_only=False,
                      remove_tags=['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    doc = cleaner.clean_html(doc)

    metas = doc.xpath('/html/head/meta[@content][@name="description"]')
    description = metas[0].get("content") if metas else None

    titles = doc.xpath('/html/head/title')
    title = titles[0].text_content().strip() if titles else None

    if not description:
        # Get rid of the head element so its text cannot win the
        # longest-line contest.  Fragments parsed without a <head> are
        # simply left untouched instead of raising IndexError.
        head = doc.find('.//head')
        if head is not None:
            head.drop_tree()
        lines = [line.strip() for line in doc.text_content().split('\n')]
        # Longest line wins; equal lengths are broken by string comparison.
        longest = max(lines, key=lambda line: (len(line), line))
        description = longest[:max_length]

    return title, description
if __name__ == "__main__":
    # Manual smoke test: fetch a few sample articles and print
    # "title : description" for each one.
    sample_urls = (
        "http://www.sfgate.com/cgi-bin/article.cgi?f=/c/a/2009/06/04/DD7V1806SV.DTL&type=performance",
        "http://www.chloeveltman.com/blog/2009/05/two-very-different-symphonies.html#links",
        "http://www.chloeveltman.com/blog/2009/06/child-prodigy-at-peabody-essex-museum.html#links",
        "http://www.sfgate.com/cgi-bin/article.cgi?f=/c/a/2009/06/04/NS9617O7JK.DTL&type=performance",
        "http://blogs.mercurynews.com/aei/2009/06/04/ramya-auroprem-joins-cast-of-spelling-bee/",
        "http://www.mercurynews.com/karendsouza/ci_12510394",
        "http://www.reason.com/news/show/134059.html",
    )
    for page_url in sample_urls:
        summary = get_summary(page_url)
        # A single parenthesised argument prints exactly as the bare
        # Python 2 print statement would.
        print('%s : %s\n' % summary)
# Web-capture footer: sign in on GitHub to comment on this gist.