Instantly share code, notes, and snippets.

@dkohlsdorf /nips.py
Last active May 23, 2018

Embed
What would you like to do?
Download all NIPS papers ever
from lxml import html
import requests
import urllib.request
from urllib.parse import urljoin

BASE_URL = 'https://papers.nips.cc/'
# Seconds `requests` waits on the server before raising instead of hanging.
REQUEST_TIMEOUT = 60


def hrefs_containing(tree, needle):
    """Return the href of every <a> element in *tree* that contains *needle*.

    Anchors without an ``href`` attribute (e.g. named anchors) are skipped
    rather than raising ``KeyError``. Document order is preserved.
    """
    hrefs = (anchor.get('href') for anchor in tree.iter('a'))
    return [href for href in hrefs if href and needle in href]


def absolute_url(link):
    """Resolve *link* against ``BASE_URL``.

    Plain concatenation yields ``//`` for root-relative links and a broken
    URL for links that are already absolute; ``urljoin`` handles both.
    """
    return urljoin(BASE_URL, link)


def fetch_tree(url):
    """Download *url* and return the parsed HTML element tree."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return html.fromstring(response.content)


def download_all_papers():
    """Save every PDF reachable from the index page into the current directory.

    The crawl follows three levels of links, selected by substring match on
    the href: 'book' links on the index page, 'paper' links on each of those
    pages, and 'pdf' links on each paper page. Each file is stored under the
    last path segment of its link.
    """
    index = fetch_tree(BASE_URL)
    for book in hrefs_containing(index, 'book'):
        book_tree = fetch_tree(absolute_url(book))
        for paper in hrefs_containing(book_tree, 'paper'):
            paper_tree = fetch_tree(absolute_url(paper))
            for link in hrefs_containing(paper_tree, 'pdf'):
                local = link.split('/')[-1]
                if not local:
                    # A link ending in '/' has no file name to save under.
                    continue
                urllib.request.urlretrieve(absolute_url(link), local)


if __name__ == '__main__':
    download_all_papers()
from lxml import html
import requests
import urllib.request
from urllib.parse import urljoin

BASE_URL = 'https://papers.nips.cc/'
# Seconds `requests` waits on the server before raising instead of hanging.
REQUEST_TIMEOUT = 60


def hrefs_containing(tree, needle):
    """Return the href of every <a> element in *tree* that contains *needle*.

    Anchors without an ``href`` attribute (e.g. named anchors) are skipped
    rather than raising ``KeyError``. Document order is preserved.
    """
    hrefs = (anchor.get('href') for anchor in tree.iter('a'))
    return [href for href in hrefs if href and needle in href]


def absolute_url(link):
    """Resolve *link* against ``BASE_URL``.

    Plain concatenation yields ``//`` for root-relative links and a broken
    URL for links that are already absolute; ``urljoin`` handles both.
    """
    return urljoin(BASE_URL, link)


def fetch_tree(url):
    """Download *url* and return the parsed HTML element tree."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return html.fromstring(response.content)


def first_text(tree, query):
    """Return ``.text`` of the first node matching XPath *query*, or None.

    None is returned both when nothing matches and when the matched node has
    no leading text, so callers never hit ``IndexError`` on a sparse page.
    """
    matches = tree.xpath(query)
    return matches[0].text if matches else None


def tsv_field(text):
    """Collapse *text* to a single line without tabs; None becomes ''.

    Raw tabs or newlines inside a title or abstract would otherwise split one
    record across several columns or rows of the output file.
    """
    return ' '.join((text or '').split())


def format_row(url, title, abstract):
    """Build one ``url<TAB>title<TAB>abstract`` line, newline-terminated."""
    return "{}\t{}\t{}\n".format(url, tsv_field(title), tsv_field(abstract))


def iter_paper_urls():
    """Yield the absolute URL of every paper page reachable from the index.

    Links are selected by substring match on the href: 'book' links on the
    index page, then 'paper' links on each of those pages.
    """
    index = fetch_tree(BASE_URL)
    for book in hrefs_containing(index, 'book'):
        book_tree = fetch_tree(absolute_url(book))
        for paper in hrefs_containing(book_tree, 'paper'):
            yield absolute_url(paper)


def write_abstracts(path='abstracts.txt'):
    """Write one TSV row (url, title, abstract) per paper page to *path*.

    Each row is also echoed to stdout, prefixed by its running index.
    A page lacking a <title> or an element with class "abstract" gets an
    empty field instead of aborting the crawl.
    """
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with open(path, 'w', encoding='utf-8') as fp:
        for n, paper_url in enumerate(iter_paper_urls()):
            paper_tree = fetch_tree(paper_url)
            title = first_text(paper_tree, '//title')
            abstract = first_text(paper_tree, '//*[@class="abstract"]')
            out = format_row(paper_url, title, abstract)
            fp.write(out)
            print(n, out)


if __name__ == '__main__':
    write_abstracts()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment