Skip to content

Instantly share code, notes, and snippets.

@JunhongXu
Last active May 12, 2024 15:24
Show Gist options
  • Save JunhongXu/cf27321f710ac3a8c07926b15a916201 to your computer and use it in GitHub Desktop.
Save JunhongXu/cf27321f710ac3a8c07926b15a916201 to your computer and use it in GitHub Desktop.
A Python script downloading all ICLR and NIPS papers from openreview.net
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import requests
import os
def download_all_papers(base_url, save_dir, driver_path):
    """Download every paper PDF listed on an OpenReview page.

    Opens *base_url* in Chrome, waits until the element with id ``notes``
    is present, then, for every ``title_pdf_row`` element, reads the paper
    title and PDF link and saves the PDF as ``<save_dir>/<title>.pdf``.

    Args:
        base_url: URL of the page that lists the papers.
        save_dir: Directory the PDFs are written to; created when missing.
        driver_path: Path to the chromedriver executable.

    Raises:
        selenium.common.exceptions.TimeoutException: if the ``notes``
            element is still absent after the 10 second wait.
    """
    # Create the target directory before starting the browser so a failure
    # here cannot leave a browser process behind.
    os.makedirs(save_dir, exist_ok=True)

    driver = webdriver.Chrome(driver_path)
    try:
        driver.get(base_url)
        # wait for the paper list container to be present in the DOM
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.ID, "notes")))
        print("Successful load the website!")
        # parse the results
        divs = driver.find_elements(By.CLASS_NAME, 'title_pdf_row')
        num_papers = len(divs)
        for index, paper in enumerate(divs, start=1):
            name = paper.find_element(By.CLASS_NAME, 'note_content_title').text
            link = paper.find_element(By.CLASS_NAME, 'note_content_pdf').get_attribute('href')
            print('Downloading paper {}/{}: {}'.format(index, num_papers, name))
            # A title containing a path separator would otherwise be read
            # as a sub-directory that does not exist.
            file_stem = name.replace('/', '_').replace(os.sep, '_')
            download_pdf(link, os.path.join(save_dir, file_stem))
    finally:
        # quit() (not close()) ends the whole session, and doing it in
        # ``finally`` releases the browser even when a step above raises.
        driver.quit()
def download_pdf(url, name):
    """Stream the resource at *url* into the file ``<name>.pdf``.

    Args:
        url: Address of the PDF to fetch.
        name: Destination path without the ``.pdf`` extension.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            an error page is never stored as if it were a PDF.
    """
    r = requests.get(url, stream=True)
    try:
        # Check the status before creating the file to avoid leaving a
        # bogus .pdf behind for a failed request.
        r.raise_for_status()
        with open('%s.pdf' % name, 'wb') as f:
            for chunk in r.iter_content(1024):
                f.write(chunk)
    finally:
        # Release the connection even when the status check or a write fails.
        r.close()
if __name__ == '__main__':
    # Location of the chromedriver executable shared by both runs.
    driver_path = '/Users/JunhongXu/Desktop/chromedriver'

    # Listing pages to scrape and the directory each one is saved into.
    NIPS = 'https://openreview.net/group?id=NIPS.cc/2016/Deep_Learning_Symposium'
    save_dir_nips = '/Users/JunhongXu/Desktop/papers/nips'
    ICLR = 'https://openreview.net/group?id=ICLR.cc/2017/conference'
    save_dir_iclr = '/Users/JunhongXu/Desktop/papers/iclr'

    # NIPS first, then ICLR.
    for listing_url, target_dir in ((NIPS, save_dir_nips), (ICLR, save_dir_iclr)):
        download_all_papers(listing_url, target_dir, driver_path)
@EmmaNguyen
Copy link

Hi Junhongxu, I've been trying to reuse your code to download all of this year's ICLR submission papers. It seems the structure of the website has changed, and the script breaks from line 21 onward. Could you please help me check this? Thanks

@MSC19950601
Copy link

same problem

@xSakix
Copy link

xSakix commented Sep 3, 2019

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import requests
import os
#https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename
from slugify import slugify


def download_all_papers(base_url, save_dir, driver_path):
    """Download every paper PDF listed on an OpenReview page.

    Opens *base_url* in Chrome, waits for the element with id ``notes`` and
    for at least one element of class ``note``, then downloads one PDF per
    ``note`` element into *save_dir*. The file name is the slugified text
    of the entry's first link; the PDF URL is the ``href`` of its second
    link.

    Args:
        base_url: URL of the page that lists the papers.
        save_dir: Directory the PDFs are written to; created when missing.
        driver_path: Path to the chromedriver executable.

    Raises:
        selenium.common.exceptions.TimeoutException: if either awaited
            element is still absent after the 10 second wait.
    """
    # Create the target directory before starting the browser so a failure
    # here cannot leave a browser process behind.
    os.makedirs(save_dir, exist_ok=True)

    driver = webdriver.Chrome(driver_path)
    try:
        driver.get(base_url)

        # wait for the list container and at least one entry to be present
        print('Starting web driver wait...')
        wait = WebDriverWait(driver, 10)
        res = wait.until(EC.presence_of_element_located((By.ID, "notes")))
        print("Successful load the website!->", res)
        res = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "note")))
        print("Successful load the website notes!->", res)
        # Reported only once both waits have actually completed.
        print('Starting web driver wait... finished')

        # parse the results
        divs = driver.find_elements(By.CLASS_NAME, 'note')
        num_papers = len(divs)
        print('found number of papers:', num_papers)
        for index, paper in enumerate(divs, start=1):
            a_hrefs = paper.find_elements(By.TAG_NAME, "a")
            # The first link supplies the title and the second the PDF URL;
            # an entry with fewer links cannot be downloaded, so skip it
            # instead of failing the whole run with an IndexError.
            if len(a_hrefs) < 2:
                print('Skipping entry {}/{}: no PDF link found'.format(index, num_papers))
                continue
            name = slugify(a_hrefs[0].text)
            link = a_hrefs[1].get_attribute('href')
            print('Downloading paper {}/{}: {}'.format(index, num_papers, name))
            download_pdf(link, os.path.join(save_dir, name))
    finally:
        # quit() (not close()) ends the whole session, and doing it in
        # ``finally`` releases the browser even when a step above raises.
        driver.quit()


def download_pdf(url, name):
    """Stream the resource at *url* into the file ``<name>.pdf``.

    Args:
        url: Address of the PDF to fetch.
        name: Destination path without the ``.pdf`` extension.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            an error page is never stored as if it were a PDF.
    """
    r = requests.get(url, stream=True)
    try:
        # Check the status before creating the file to avoid leaving a
        # bogus .pdf behind for a failed request.
        r.raise_for_status()
        with open('%s.pdf' % name, 'wb') as f:
            for chunk in r.iter_content(1024):
                f.write(chunk)
    finally:
        # Release the connection even when the status check or a write fails.
        r.close()


if __name__ == '__main__':
    # Listing page whose papers are downloaded.
    ICLR = 'https://openreview.net/group?id=ICLR.cc/2018/Conference'
    # Placeholder: replace with the location of your chromedriver executable.
    driver_path = '<path to chrome driver>'
    # PDFs are written to the current working directory.
    save_dir_iclr = '.'

    download_all_papers(ICLR, save_dir_iclr, driver_path)

@rrryan2016
Copy link

I guess the code is outdated (for ICLR paper downloads)?

When trying to download ICLR 2017 (https://openreview.net/group?id=ICLR.cc/2017/conference) and 2021 (https://openreview.net/group?id=ICLR.cc/2021/Conference), num_papers turns out to be 0, and the download never starts.

@mdrpanwar
Copy link

The above code and its updated versions and forks have all been outdated due to changes in OpenReview webpages. I have modified this script to work with the current version of OpenReview: https://gist.github.com/mdrpanwar/807de13b54aa169730d24d7026510b56

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment