gearmonkey/icwsm.py

## icwsm.py
#!/usr/bin/env python

from lxml import etree
from urllib import urlopen

if __name__ == '__main__':
    # Scrape the ICWSM-11 presentations listing, append one tab-separated
    # row per paper to papers.tsv (pdf file name, title, then one column
    # per author) and download every paper's PDF into the current directory.
    #
    # NOTE: this script targets Python 2 (`urllib.urlopen`, `unicode`).

    url = 'http://www.aaai.org/ocs/index.php/ICWSM/ICWSM11/schedConf/presentations'
    tree = etree.parse(urlopen(url), etree.HTMLParser())

    # `with` guarantees the index file is closed even if parsing or a
    # download raises part-way through the listing.
    with open('papers.tsv', 'w') as papers:

        # The loop treats every <table> directly under div#content as one paper.
        for table in tree.xpath('//div[@id="content"]/table'):

            # Cell 0 holds the title link, cell 1 the PDF link and cell 2
            # the author list.
            tds = table.xpath('tr//td')
            title = tds[0].find('a').text
            pdf = tds[1].find('a').get('href')
            # Authors are split on comma + TAB, then trimmed.
            authors = [s.strip() for s in tds[2].text.split(',\t')]

            # The second-to-last path segment of the link names the file.
            fname = pdf.split('/')[-2] + '.pdf'

            row = "\t".join([fname, title] + authors) + '\n'
            papers.write(unicode(row).encode('utf8'))
            # Flush per row so the index stays usable if a later download fails.
            papers.flush()

            # The download URL reuses the last two path segments of the link.
            pdf_url = 'http://www.aaai.org/ocs/index.php/ICWSM/ICWSM11/paper/viewFile/' + "/".join(pdf.split('/')[-2:])
            print("downloading %s to %s" % (pdf_url, fname))
            # PDFs are binary: 'wb' prevents newline translation from
            # corrupting the file on platforms that distinguish text mode.
            with open(fname, 'wb') as f:
                f.write(urlopen(pdf_url).read())
	#!/usr/bin/env python

	from lxml import etree
	from urllib import urlopen

	if __name__ == '__main__':
		# Scrape the ICWSM-11 presentations listing, append one tab-separated
		# row per paper to papers.tsv (pdf file name, title, then one column
		# per author) and download every paper's PDF into the current directory.
		#
		# NOTE: this script targets Python 2 (`urllib.urlopen`, `unicode`).

		url = 'http://www.aaai.org/ocs/index.php/ICWSM/ICWSM11/schedConf/presentations'
		tree = etree.parse(urlopen(url), etree.HTMLParser())

		# `with` guarantees the index file is closed even if parsing or a
		# download raises part-way through the listing.
		with open('papers.tsv', 'w') as papers:

			# The loop treats every <table> directly under div#content as one paper.
			for table in tree.xpath('//div[@id="content"]/table'):

				# Cell 0 holds the title link, cell 1 the PDF link and cell 2
				# the author list.
				tds = table.xpath('tr//td')
				title = tds[0].find('a').text
				pdf = tds[1].find('a').get('href')
				# Authors are split on comma + TAB, then trimmed.
				authors = [s.strip() for s in tds[2].text.split(',\t')]

				# The second-to-last path segment of the link names the file.
				fname = pdf.split('/')[-2] + '.pdf'

				row = "\t".join([fname, title] + authors) + '\n'
				papers.write(unicode(row).encode('utf8'))
				# Flush per row so the index stays usable if a later download fails.
				papers.flush()

				# The download URL reuses the last two path segments of the link.
				pdf_url = 'http://www.aaai.org/ocs/index.php/ICWSM/ICWSM11/paper/viewFile/' + "/".join(pdf.split('/')[-2:])
				print("downloading %s to %s" % (pdf_url, fname))
				# PDFs are binary: 'wb' prevents newline translation from
				# corrupting the file on platforms that distinguish text mode.
				with open(fname, 'wb') as f:
					f.write(urlopen(pdf_url).read())