Skip to content

Instantly share code, notes, and snippets.

@agarie
Last active December 12, 2017 02:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save agarie/a417613c092bf11f32fb2e48d5aa8e59 to your computer and use it in GitHub Desktop.
Save agarie/a417613c092bf11f32fb2e48d5aa8e59 to your computer and use it in GitHub Desktop.
Quick script I made to download papers from NIPS. The subjects are specified in `SUBJECTS_RE`.
require 'fileutils'
require 'nokogiri'
require 'open-uri'
require 'pp'
require 'typhoeus'
# Subject keywords used to decide whether a paper is worth downloading.
# abstract_eligible? matches this against the lower-cased abstract text.
# The alternatives carry no word boundaries, so they match as substrings
# (e.g. "network" also matches "networks", and "deep" already covers "deeply").
SUBJECTS_RE = /deep|deeply|neural|convolutional|network|recurrent|lstm|object recognition|object classification|object detection|image classification/
# Builds the URL of the proceedings index page for one NIPS issue.
#
# @param issue [Integer] issue number; the year in the URL is 1987 + issue
# @return [String] absolute URL of the paper list for that issue
def paper_list_url(issue)
  year = 1987 + issue
  book = "advances-in-neural-information-processing-systems-#{issue}-#{year}"
  "https://papers.nips.cc/book/" + book
end
# Tells whether a paper page's abstract mentions any subject in SUBJECTS_RE.
#
# The abstract is taken from the text of the page's `p.abstract` elements and
# lower-cased before matching, because SUBJECTS_RE is written in lower case.
#
# @param response_body [String] HTML of a paper page
# @return [Boolean] true when the abstract matches SUBJECTS_RE
def abstract_eligible?(response_body)
  abstract = Nokogiri::HTML(response_body).css('p.abstract').text
  # match? returns a real boolean (a predicate should) and skips building MatchData.
  abstract.downcase.match?(SUBJECTS_RE)
end
# Writes +content+ to +filename+ in binary mode, announcing the file on stdout first.
#
# @param content [String] raw bytes to store
# @param filename [String] destination path (overwritten if it exists)
# @return [Integer] number of bytes written
def download_to_file(content, filename)
  puts "Downloading: #{filename}"
  File.binwrite(filename, content)
end
# Queues a GET for +url+ on +hydra+ and stores the body in +filename+ once the
# request completes. Unsuccessful responses are reported on stderr and not
# written, so error pages never end up saved as papers.
def queue_download(hydra, url, filename)
  request = Typhoeus::Request.new(url)
  request.on_complete do |response|
    if response.success?
      download_to_file(response.body, filename)
    else
      warn "Failed to fetch #{url} (HTTP #{response.code})"
    end
  end
  hydra.queue request
end

# Downloads the PDF and BibTeX entry of every paper in a NIPS issue whose
# abstract matches the subjects of interest (see abstract_eligible?).
#
# Files are stored in a "NIPS-<issue>" directory under the current working
# directory. Requests run through a Typhoeus hydra with at most 4 in flight.
#
# @param issue [Integer] issue number, as accepted by paper_list_url
def download_articles(issue)
  base_url = paper_list_url(issue)
  nips_dir = "NIPS-#{issue}"
  # mkdir_p is a no-op when the directory already exists; the former
  # Dir.exists? check no longer works on Ruby 3.2+, where it was removed.
  FileUtils.mkdir_p(nips_dir)

  hydra = Typhoeus::Hydra.new(max_concurrency: 4)

  # Kernel#open no longer opens URLs on Ruby 3.0+; URI.open is the open-uri entry point.
  index = URI.open(base_url) { |f| Nokogiri::HTML(f) }
  papers = index.css('a[href]').select { |e| e['href'].start_with?('/paper/') }

  papers.each do |paper|
    # This part is done offline: generate URLs and filenames.
    paper_url = File.join('https://papers.nips.cc', paper['href'])
    pdf_url = "#{paper_url}.pdf"
    bibtex_url = "#{paper_url}/bibtex"
    # Build both names from the slug rather than rewriting ".pdf" inside the
    # full path, which would hit the first ".pdf" wherever it appears.
    slug = File.basename(paper_url)
    pdf_name = File.join(nips_dir, "#{slug}.pdf")
    bibtex_name = File.join(nips_dir, "#{slug}.bib")

    # Fetch the paper page first; the PDF and BibTeX requests are queued from
    # inside its callback, only when the abstract is eligible.
    request = Typhoeus::Request.new(paper_url)
    request.on_complete do |response|
      puts "Analysing #{paper_url}"
      if !response.success?
        warn "Failed to fetch #{paper_url} (HTTP #{response.code})"
      elsif abstract_eligible?(response.body)
        queue_download(hydra, pdf_url, pdf_name)
        queue_download(hydra, bibtex_url, bibtex_name)
      end
    end
    hydra.queue request
  end

  hydra.run
end
# Script entry point: the first command-line argument is the NIPS issue number.
# A missing or non-numeric argument becomes 0 via to_i, which would build a
# bogus URL, so stop with a usage message instead.
issue = ARGV.first.to_i
abort "Usage: ruby #{$PROGRAM_NAME} ISSUE_NUMBER (e.g. 29 for NIPS 2016)" unless issue > 0
download_articles(issue)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment