Skip to content

Instantly share code, notes, and snippets.

@jhofman
Created July 14, 2011 16:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save jhofman/1082772 to your computer and use it in GitHub Desktop.
Save jhofman/1082772 to your computer and use it in GitHub Desktop.
Script to scrape PDFs and paper info for ICWSM 2011
#!/usr/bin/env python
from lxml import etree
from urllib import urlopen
if __name__ == '__main__':
    # Scrape the ICWSM 2011 presentations listing: append one tab-separated
    # row per paper (local file name, title, authors...) to papers.tsv and
    # download each paper's PDF into the current working directory.
    url = 'http://www.aaai.org/ocs/index.php/ICWSM/ICWSM11/schedConf/presentations'
    pdf_base = 'http://www.aaai.org/ocs/index.php/ICWSM/ICWSM11/paper/viewFile/'

    tree = etree.parse(urlopen(url), etree.HTMLParser())

    # `with` ensures the index file is flushed and closed even if a parse
    # step or a download raises part-way through the listing.
    with open('papers.tsv', 'w') as papers:
        # One <table> directly under the content div is processed per paper.
        for table in tree.xpath('//div[@id="content"]/table'):
            tds = table.xpath('tr//td')
            # Cells are read positionally: title link, PDF link, author text.
            title = tds[0].find('a').text
            pdf = tds[1].find('a').get('href')
            # Authors are separated by a comma followed by a tab in the page.
            authors = [s.strip() for s in tds[2].text.split(',\t')]

            # The second-to-last path segment of the PDF link names the
            # local file; the last two segments are appended to the
            # viewFile base to form the download URL.
            segments = pdf.split('/')
            fname = segments[-2] + '.pdf'

            # NOTE(review): non-ASCII titles/authors may need explicit
            # encoding before being written here -- confirm.
            papers.write("\t".join([fname, title] + authors) + '\n')

            pdf_url = pdf_base + "/".join(segments[-2:])
            print("downloading %s to %s" % (pdf_url, fname))

            # Close the HTTP response explicitly instead of relying on
            # garbage collection.
            response = urlopen(pdf_url)
            try:
                data = response.read()
            finally:
                response.close()

            # PDFs are binary: 'wb' avoids newline translation corrupting
            # the file on platforms that distinguish text and binary mode.
            with open(fname, 'wb') as f:
                f.write(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment