Skip to content

Instantly share code, notes, and snippets.

@gnufs
Last active August 29, 2015 13:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gnufs/9420060 to your computer and use it in GitHub Desktop.
Save gnufs/9420060 to your computer and use it in GitHub Desktop.
Scrapes Patrick Cockburn's articles on Independent.co.uk and generates an RSS feed
#!/usr/bin/env python
import requests
from StringIO import StringIO
from lxml.html import parse
import datetime
import PyRSS2Gen
# Author landing page that is scraped for articles.
PAGE = 'http://www.independent.co.uk/biography/patrick-cockburn'

# XPath queries. Every query starts from the same per-article container;
# %d is filled in with the article's 1-based position in the listing.
_ARTICLE = '//*[@id="main"]/div[3]/div[2]/div/div[%d]/div'
TITLE = _ARTICLE + '/h3/a/text()'
LINK = _ARTICLE + '/h3/a/@href'
DATE = _ARTICLE + '/p/text()'
DESCRIPTION = _ARTICLE + '/div/p/text()'
def get_page():
    """Download the author page and return its body as a file-like object.

    Returns:
        A ``StringIO`` wrapping the raw response content.

    Raises:
        requests.exceptions.RequestException: if the connection fails,
            the request times out, or the server answers with a 4xx/5xx
            status.
    """
    # Without an explicit timeout requests will wait indefinitely on a
    # stalled connection.
    response = requests.get(PAGE, timeout=30)
    # Fail here on an HTTP error instead of scraping an error page.
    response.raise_for_status()
    return StringIO(response.content)
def explore(content, limit=10):
    """Extract article metadata from the downloaded author page.

    Args:
        content: File-like object holding the page HTML.
        limit: How many listing positions to probe, starting at the
            first one. Defaults to 10.

    Returns:
        A list of dicts with the keys 'title', 'link', 'date' and
        'description'. Positions that have no title or no link are
        skipped; 'date' and 'description' are None when the page has no
        matching node for them.
    """
    root = parse(content).getroot()

    def first(query, index):
        # An XPath query returns a list; an absent node gives an empty
        # list, so indexing it blindly would raise IndexError.
        matches = root.xpath(query % index)
        return matches[0] if matches else None

    articles = []
    for index in range(1, limit + 1):
        title = first(TITLE, index)
        link = first(LINK, index)
        if title is None or link is None:
            # Fewer articles than expected, or an unrecognised entry:
            # there is nothing usable to put in the feed for this slot.
            continue
        articles.append({
            'title': title,
            'link': link,
            'date': first(DATE, index),
            'description': first(DESCRIPTION, index),
        })
    return articles
def make_rss(articles):
    """Render the scraped articles as an RSS 2.0 document.

    Args:
        articles: Iterable of dicts with the keys 'title', 'link',
            'description' and 'date'.

    Returns:
        The feed XML as produced by ``PyRSS2Gen.RSS2.to_xml()``.
    """
    # NOTE(review): 'date' is handed to PyRSS2Gen exactly as scraped; if it
    # is a plain string it is presumably emitted verbatim, so confirm the
    # page's date text is acceptable to feed readers.
    rss_items = [
        PyRSS2Gen.RSSItem(
            title=article['title'],
            link=article['link'],
            description=article['description'],
            guid=PyRSS2Gen.Guid(article['link']),
            pubDate=article['date'],
        )
        for article in articles
    ]
    rss = PyRSS2Gen.RSS2(
        title='Patrick Cockburn',
        link=PAGE,
        description='Latest articles written by Patrick Cockburn',
        # PyRSS2Gen labels datetimes as GMT, so pass UTC rather than
        # local time.
        lastBuildDate=datetime.datetime.utcnow(),
        items=rss_items,
    )
    return rss.to_xml()
def main():
    """Fetch the author page, scrape it and print the RSS feed to stdout."""
    content = get_page()
    articles = explore(content)
    # Parenthesised so the line parses under both Python 2 and Python 3;
    # with a single argument the output is the same as the print statement.
    print(make_rss(articles))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment