@alexstorer
Created August 3, 2012 20:41
Scraper for ANC Press Releases
from lxml import etree
import csv
import re
import urllib


def downloadallarticles(tree, year):
    """Save every article linked from a parsed listing page as <year>_<id>.html."""
    articles = tree.xpath('.//*[@id="middle2level"]/table[2]//a')
    destpath = '/Users/astorer/Work/sgrossman/output/'
    for a in articles:
        print a.get('href')
        idnum = re.findall(r'id=(\d+)', a.get('href'))
        if idnum:
            fname = year + '_' + str(idnum[0]) + '.html'
            urllib.urlretrieve('http://www.anc.org.za/' + a.get('href'), destpath + fname)


# CSV file for article metadata (set up but not yet populated; see the sketch below)
f = open('/tmp/files.csv', 'w')
entries = ["Day", "Month", "Year", "Title", "Remote", "Local"]
c = csv.DictWriter(f, entries)

urlprefix = 'http://www.anc.org.za/'
baseurl = 'http://www.anc.org.za/list.php?t=Press%20Statements'
r = urllib.urlopen(baseurl)
parser = etree.HTMLParser()
tree = etree.parse(r, parser)

# The archive's navigation bar has one link per year
toclinks = tree.xpath('//*[@id="listnav"]/a')
for l in toclinks:
    fulllink = urlprefix + l.get('href')
    print l.text, fulllink
    thisyear = l.text
    # download and parse the year's first listing page
    r = urllib.urlopen(fulllink)
    subtree = etree.parse(r, parser)
    # how many stories total are there? => how many pages?
    contents = etree.tostring(subtree)
    totalarticles = int(re.findall(r'Record \d+ to \d+ of (\d+)', contents)[0])
    # the listing shows 25 articles per page
    remainingpages = totalarticles / 25 + 1
    for i in range(remainingpages):
        nextpageurl = ('http://www.anc.org.za/list.php?pageNum_rs=' + str(i) +
                       '&totalRows_rs=' + str(totalarticles) +
                       '&t=Press%20Statements&y=' + str(thisyear))
        print nextpageurl
        r = urllib.urlopen(nextpageurl)
        # download the stories on the page [maybe cleaning them also?]
        downloadallarticles(etree.parse(r, parser), thisyear)
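The gist sets up a csv.DictWriter over /tmp/files.csv with Day, Month, Year, Title, Remote and Local columns, but never writes a header or any rows. Below is a minimal sketch (not part of the original gist) of how one metadata row per saved article could be logged from inside downloadallarticles; logarticle is a hypothetical helper, Day and Month are left blank because the listing page's date markup isn't shown here, and Title assumes the link text carries the article title.

def logarticle(writer, link, year, remoteurl, localpath):
    # Write one metadata row; keys match the DictWriter's column names.
    writer.writerow({
        "Day": "",
        "Month": "",
        "Year": year,
        "Title": (link.text or "").strip(),
        "Remote": remoteurl,
        "Local": localpath,
    })

# This would be called right after urllib.urlretrieve(...) in downloadallarticles, e.g.:
#     logarticle(c, a, year, 'http://www.anc.org.za/' + a.get('href'), destpath + fname)
# On Python 2.7+, c.writeheader() writes the column names as the first row, and
# f.close() (or a `with` block) ensures the file is flushed when the run finishes.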
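The script targets Python 2: urllib.urlopen and urllib.urlretrieve no longer exist under Python 3, print becomes a function, and / on integers returns a float. A rough, untested sketch of the equivalent calls under Python 3 is shown below; the URL is the same listing URL used above.

from lxml import etree
from urllib.request import urlopen, urlretrieve

parser = etree.HTMLParser()
r = urlopen('http://www.anc.org.za/list.php?t=Press%20Statements')
# etree.tostring() returns bytes under Python 3, so decode before regex matching
contents = etree.tostring(etree.parse(r, parser)).decode('utf-8')
# integer division needs // to keep the page count an int:
#     remainingpages = totalarticles // 25 + 1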