Skip to content

Instantly share code, notes, and snippets.

@z-a-f
Last active July 15, 2017 02:02
Show Gist options
  • Save z-a-f/7817a97393a19a0e2329c7d899fb7a72 to your computer and use it in GitHub Desktop.
Save z-a-f/7817a97393a19a0e2329c7d899fb7a72 to your computer and use it in GitHub Desktop.
RSS 2017 -- download all papers and presentations
#!/usr/bin/env python
from lxml import html, etree
import requests
import urllib
import re
from tqdm import tqdm
import sys, os
# RSS 2017 asset downloader.
#
# Scrapes the detailed-program page for each paper's header link, its
# "Full Paper" link and its "Slides" link, then mirrors the files into
# ./RSS_assets/<group id>/.  Written to run unchanged on Python 2.7 and 3.

try:  # Python 3
    from urllib.parse import urljoin, urlparse
    from urllib.request import urlretrieve
except ImportError:  # Python 2
    from urllib import urlretrieve
    from urlparse import urljoin, urlparse

PROGRAM_URL = 'http://www.roboticsconference.org/program/detailed/index.html'
# A paper header is an <a> whose href starts with this jQuery toggle call.
search = "javascript:void($('#group"
# Slice of a header href used as the paper's key: it starts at the word
# "group" (index 20 of the prefix above) and spans 15 characters.
names = [20, 35]
dirname = './RSS_assets/'


def collect_assets(events):
    """Group "Full Paper"/"Slides" links under the header preceding them.

    ``events`` is an iterable of ``(action, element)`` pairs such as
    ``etree.iterwalk(tree, events=("start", "end"))`` yields.  Only
    ``start`` events of ``<a>`` elements are examined; an element needs
    ``.tag``, ``.text`` and ``.get(name)``.

    Returns a dict mapping each group id to a dict holding ``'title'`` and,
    when the matching link was seen with an href, ``'paper'`` and/or
    ``'slides'`` (the raw href values).
    """
    results = {}
    current = None
    for action, el in events:
        if action != 'start' or el.tag != 'a':
            continue
        # Anchors without an href or without direct text are legal HTML;
        # normalise both to '' instead of crashing on None.
        href = el.get('href') or ''
        text = el.text or ''
        if href.startswith(search):
            group = href[names[0]:names[1]]
            if group != current:
                # A header for a different group always opens a new entry,
                # even if the previous paper never had a "Slides" link;
                # otherwise that paper would absorb this one's links.
                title = '_'.join(re.findall(r"[\w']+", text)) or group
                results[group] = {'title': title}
                current = group
                continue
        if current is None:
            continue
        if text == "Full Paper":
            if href:
                results[current]['paper'] = href
        elif text == "Slides":
            if href:
                results[current]['slides'] = href
            # The slides link is the last asset collected for a paper.
            current = None
    return results


def target_paths(key, asset):
    """Return the ``(url, local_path)`` pairs to fetch for one paper.

    ``key`` is the group id and ``asset`` one value of the dict returned by
    ``collect_assets``.  Missing ``'paper'``/``'slides'`` entries are simply
    skipped.  Hrefs are resolved against ``PROGRAM_URL`` so relative links
    work; absolute URLs pass through unchanged.
    """
    stem = dirname + key + '/' + asset['title']
    paper_name = stem + '.pdf'
    targets = []
    paper = asset.get('paper')
    if paper:
        targets.append((urljoin(PROGRAM_URL, paper), paper_name))
    slides = asset.get('slides')
    if slides:
        slides_url = urljoin(PROGRAM_URL, slides)
        # Take the real extension (".pptx", ".pdf", ...) from the URL path
        # rather than blindly slicing the last four characters.
        ext = os.path.splitext(urlparse(slides_url).path)[1]
        slides_name = stem + ext
        if slides_name.lower() == paper_name.lower():
            # PDF slides would share the paper's file name and then be
            # skipped as "already downloaded"; give them their own name.
            slides_name = stem + '_slides' + ext
        targets.append((slides_url, slides_name))
    return targets


def _ensure_dir(path):
    """Create ``path``; an already existing directory is not an error."""
    try:
        os.mkdir(path)
    except OSError:
        if not os.path.isdir(path):
            raise


def _download(url, path):
    """Fetch ``url`` into ``path`` unless that file is already present."""
    if os.path.isfile(path):
        return
    try:
        urlretrieve(url, path)
    except IOError:
        # Single-argument form prints identically on Python 2 and 3.
        print("could not load %s" % path)


def main():
    """Scrape the program page and download every paper and slide deck."""
    page = requests.get(PROGRAM_URL)
    # Fail loudly on an HTTP error instead of parsing an error page.
    page.raise_for_status()
    tree = html.fromstring(page.content)
    results = collect_assets(etree.iterwalk(tree, events=("start", "end")))

    _ensure_dir(dirname)
    for key, asset in tqdm(results.items()):
        try:
            _ensure_dir(dirname + key)
        except OSError:
            # Keep going with the remaining papers.
            print("could not create %s" % (dirname + key))
            continue
        for url, path in target_paths(key, asset):
            _download(url, path)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment