Skip to content

Instantly share code, notes, and snippets.

@JunhongXu
Last active May 12, 2024 15:24
Show Gist options
  • Save JunhongXu/cf27321f710ac3a8c07926b15a916201 to your computer and use it in GitHub Desktop.
Save JunhongXu/cf27321f710ac3a8c07926b15a916201 to your computer and use it in GitHub Desktop.
A Python script downloading all ICLR and NIPS papers from openreview.net
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import requests
import os
def download_all_papers(base_url, save_dir, driver_path):
    """Download every paper PDF listed on an OpenReview group page.

    Opens ``base_url`` in Chrome, waits until the element with id ``notes``
    is present, then saves the PDF linked from each ``title_pdf_row`` entry
    as ``<save_dir>/<paper title>.pdf``.

    Args:
        base_url: URL of the OpenReview group page to scrape.
        save_dir: Directory the PDFs are written to; created if missing.
        driver_path: Path to the chromedriver executable.

    A paper whose download fails is reported on stdout and skipped so that
    one bad link does not abort the rest of the batch.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_dir, exist_ok=True)

    # NOTE(review): passing the driver path positionally is the older
    # Selenium calling convention; newer releases may require a Service
    # object instead -- confirm against the installed Selenium version.
    driver = webdriver.Chrome(driver_path)
    try:
        driver.get(base_url)

        # Wait (up to 10 s) for the paper list container to be in the DOM.
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.ID, "notes")))
        print("Successful load the website!")

        # Each 'title_pdf_row' element holds one paper's title and PDF link.
        rows = driver.find_elements(By.CLASS_NAME, 'title_pdf_row')
        num_papers = len(rows)
        for index, paper in enumerate(rows, start=1):
            name = paper.find_element(By.CLASS_NAME, 'note_content_title').text
            link = paper.find_element(
                By.CLASS_NAME, 'note_content_pdf').get_attribute('href')
            print('Downloading paper {}/{}: {}'.format(index, num_papers, name))

            # A title such as "Encoder/Decoder ..." would otherwise be read
            # as a sub-directory path and make open() fail.
            file_stem = name
            for separator in (os.sep, os.altsep):
                if separator:
                    file_stem = file_stem.replace(separator, '_')

            try:
                download_pdf(link, os.path.join(save_dir, file_stem))
            except (requests.RequestException, OSError) as err:
                print('Failed to download {}: {}'.format(name, err))
    finally:
        # quit() ends the whole WebDriver session, even when scraping raised.
        driver.quit()
def download_pdf(url, name, timeout=60):
    """Stream the file at ``url`` to ``<name>.pdf``.

    Args:
        url: Address of the PDF to fetch.
        name: Destination path without the ``.pdf`` extension.
        timeout: Seconds to wait for the server to connect or send data
            before giving up, so a stalled server cannot hang the run.

    Raises:
        requests.HTTPError: if the server answers with an error status; no
            file is created in that case.
        requests.RequestException: on connection problems or timeouts.
    """
    response = requests.get(url, stream=True, timeout=timeout)
    try:
        # Fail before opening the file so an HTML error page is never
        # saved under a .pdf name.
        response.raise_for_status()
        with open('%s.pdf' % name, 'wb') as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)
    finally:
        # Release the connection even if the status check or write failed.
        response.close()
if __name__ == '__main__':
    # chromedriver binary shared by both scraping runs
    driver_path = '/Users/JunhongXu/Desktop/chromedriver'

    # OpenReview group pages to scrape
    NIPS = 'https://openreview.net/group?id=NIPS.cc/2016/Deep_Learning_Symposium'
    ICLR = 'https://openreview.net/group?id=ICLR.cc/2017/conference'

    # output directory for each venue
    save_dir_nips = '/Users/JunhongXu/Desktop/papers/nips'
    save_dir_iclr = '/Users/JunhongXu/Desktop/papers/iclr'

    # NIPS is fetched first, then ICLR
    for page_url, target_dir in ((NIPS, save_dir_nips), (ICLR, save_dir_iclr)):
        download_all_papers(page_url, target_dir, driver_path)
@mdrpanwar
Copy link

The above code, along with its updated versions and forks, is now outdated because the OpenReview web pages have changed. I have modified this script to work with the current version of OpenReview: https://gist.github.com/mdrpanwar/807de13b54aa169730d24d7026510b56

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment