alexstorer/gist:3251327

## gistfile1.py
from lxml import etree
import csv
import urllib2
import urllib
import re

def downloadallarticles(tree,year):
    """Download every article linked from one press-statement listing page.

    Selects the anchors under the second table inside the element whose
    id is ``middle2level``. Each href that carries an ``id=<digits>``
    parameter is fetched from www.anc.org.za and saved as
    ``<year>_<id>.html`` in the hard-coded output directory.

    Args:
        tree: parsed listing page; any object with an lxml-style
            ``xpath()`` method whose results support ``get('href')``.
        year: string used as the file-name prefix.

    Returns:
        None. Files are written to disk as a side effect.
    """
    destpath = '/Users/astorer/Work/sgrossman/output/'
    for a in tree.xpath('.//*[@id="middle2level"]/table[2]//a'):
        href = a.get('href')
        # Single-argument print() emits the same text on Python 2.
        print(href)
        if href is None:
            # An anchor without an href would make re.findall raise
            # TypeError and abort the whole crawl; it has nothing to
            # download, so skip it.
            continue
        idnum = re.findall(r'id=(\d+)', href)
        if idnum:
            # Only links that carry a numeric article id are downloaded.
            fname = year + '_' + idnum[0] + '.html'
            urllib.urlretrieve('http://www.anc.org.za/' + href, destpath + fname)


# Crawl driver: read the per-year navigation of the ANC press-statement
# index, then fetch every listing page of every year and hand it to
# downloadallarticles().

# NOTE(review): `f` is opened for writing and wrapped in a DictWriter, but
# nothing in this section writes a row through `c` or closes `f` --
# confirm whether later code relies on them.
f = open('/tmp/files.csv','w')
entries = ["Day","Month","Year","Title","Remote","Local"]
c = csv.DictWriter(f,entries)

urlprefix = 'http://www.anc.org.za/'

baseurl = 'http://www.anc.org.za/list.php?t=Press%20Statements'
r = urllib.urlopen(baseurl)

parser = etree.HTMLParser()
tree   = etree.parse(r, parser)

# Anchors directly under the element with id "listnav"; their text is
# used below as the year.
toclinks = tree.xpath('//*[@id="listnav"]/a')

for l in toclinks:
    href = l.get('href')
    if href is None:
        # A navigation anchor without an href cannot be followed (and
        # would raise TypeError in the concatenation below).
        continue
    fulllink = urlprefix+href
    print('%s %s' % (l.text, fulllink))
    thisyear = l.text
    # download the link
    r = urllib.urlopen(fulllink)
    # parse the link
    subtree   = etree.parse(r, parser)
    # how many stories total are there? => how many pages?
    contents = etree.tostring(subtree)
    counts = re.findall(r'Record \d+ to \d+ of (\d+)',contents)
    if not counts:
        # No "Record x to y of N" banner on this page: skip the year
        # instead of dying with IndexError.
        print('no record count found, skipping ' + fulllink)
        continue
    totalarticles = int(counts[0])
    # is there a next page?
    # Explicit floor division; 25 is the page size this script assumes.
    # NOTE(review): when the total is an exact multiple of 25 this asks
    # for one extra, presumably empty, page -- verify against the site.
    remainingpages = totalarticles//25+1
    for i in range(remainingpages):
        nextpageurl = 'http://www.anc.org.za/list.php?pageNum_rs=' + str(i) + '&totalRows_rs=' + str(totalarticles) + '&t=Press%20Statements&y=' + str(thisyear)
        print(nextpageurl)
        r = urllib.urlopen(nextpageurl)
        downloadallarticles(etree.parse(r, parser),thisyear)
    # download the stories on the page [maybe cleaning them also?]
	from lxml import etree
	import csv
	import urllib2
	import urllib
	import re

	def downloadallarticles(tree,year):
		"""Download every article linked from one press-statement listing page.

		NOTE(review): this repeats the definition of the same name earlier
		in the file; here the nesting had been collapsed to a single tab
		and is restored. Consider deleting one of the two copies.

		Selects the anchors under the second table inside the element whose
		id is ``middle2level``. Each href that carries an ``id=<digits>``
		parameter is fetched from www.anc.org.za and saved as
		``<year>_<id>.html`` in the hard-coded output directory.

		Args:
			tree: parsed listing page; any object with an lxml-style
				``xpath()`` method whose results support ``get('href')``.
			year: string used as the file-name prefix.

		Returns:
			None. Files are written to disk as a side effect.
		"""
		destpath = '/Users/astorer/Work/sgrossman/output/'
		for a in tree.xpath('.//*[@id="middle2level"]/table[2]//a'):
			href = a.get('href')
			# Single-argument print() emits the same text on Python 2.
			print(href)
			if href is None:
				# An anchor without an href would make re.findall raise
				# TypeError; it has nothing to download, so skip it.
				continue
			idnum = re.findall(r'id=(\d+)', href)
			if idnum:
				# Only links that carry a numeric article id are downloaded.
				fname = year + '_' + idnum[0] + '.html'
				urllib.urlretrieve('http://www.anc.org.za/' + href, destpath + fname)


	# Crawl driver: read the per-year navigation of the ANC press-statement
	# index, then fetch every listing page of every year and hand it to
	# downloadallarticles().
	# NOTE(review): this repeats the driver earlier in the file; the loop
	# nesting had been collapsed to a single tab here and is restored.
	# Consider deleting one of the two copies.

	# NOTE(review): `f` is opened for writing and wrapped in a DictWriter,
	# but nothing in this section writes a row through `c` or closes `f`
	# -- confirm whether later code relies on them.
	f = open('/tmp/files.csv','w')
	entries = ["Day","Month","Year","Title","Remote","Local"]
	c = csv.DictWriter(f,entries)

	urlprefix = 'http://www.anc.org.za/'

	baseurl = 'http://www.anc.org.za/list.php?t=Press%20Statements'
	r = urllib.urlopen(baseurl)

	parser = etree.HTMLParser()
	tree = etree.parse(r, parser)

	# Anchors directly under the element with id "listnav"; their text is
	# used below as the year.
	toclinks = tree.xpath('//*[@id="listnav"]/a')

	for l in toclinks:
		href = l.get('href')
		if href is None:
			# A navigation anchor without an href cannot be followed (and
			# would raise TypeError in the concatenation below).
			continue
		fulllink = urlprefix+href
		print('%s %s' % (l.text, fulllink))
		thisyear = l.text
		# download the link
		r = urllib.urlopen(fulllink)
		# parse the link
		subtree = etree.parse(r, parser)
		# how many stories total are there? => how many pages?
		contents = etree.tostring(subtree)
		counts = re.findall(r'Record \d+ to \d+ of (\d+)',contents)
		if not counts:
			# No "Record x to y of N" banner on this page: skip the year
			# instead of dying with IndexError.
			print('no record count found, skipping ' + fulllink)
			continue
		totalarticles = int(counts[0])
		# is there a next page?
		# Explicit floor division; 25 is the page size this script assumes.
		# NOTE(review): when the total is an exact multiple of 25 this asks
		# for one extra, presumably empty, page -- verify against the site.
		remainingpages = totalarticles//25+1
		for i in range(remainingpages):
			nextpageurl = 'http://www.anc.org.za/list.php?pageNum_rs=' + str(i) + '&totalRows_rs=' + str(totalarticles) + '&t=Press%20Statements&y=' + str(thisyear)
			print(nextpageurl)
			r = urllib.urlopen(nextpageurl)
			downloadallarticles(etree.parse(r, parser),thisyear)
		# download the stories on the page [maybe cleaning them also?]