@mdrpanwar
Created August 16, 2022 12:09
A Python script for downloading papers from openreview.net given a URL.
import os
import time

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def download_all_papers(base_url, save_dir, driver_path):
    driver = webdriver.Chrome(driver_path)
    driver.get(base_url)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Wait for the submissions list to become visible.
    WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, "submissions-list")))

    pagination_elems = driver.find_element(By.CSS_SELECTOR, "ul[class='pagination']").find_elements(By.TAG_NAME, "li")
    max_pages = len(pagination_elems) - 4  # remove unnumbered navigation elements: <<, <, >, >>

    i = 1
    while True:
        print("Successfully loaded page " + str(i))

        # Parse the papers listed on the current page.
        divs = driver.find_elements(By.CLASS_NAME, "note")
        num_papers = len(divs)
        print(num_papers)

        for index, paper in enumerate(divs):
            # The first anchor holds the paper title; the second links to the PDF.
            anchors = paper.find_elements(By.TAG_NAME, "a")
            name = anchors[0].text
            # Strip characters that are not allowed in file names.
            name = name.replace(":", " -").replace("?", "").replace("\"", "").replace("\'", "").replace("*", "")
            link = anchors[1].get_attribute('href')
            print('Downloading paper {}/{}: {}'.format(index + 1, num_papers, name))
            download_pdf(link, os.path.join(save_dir, name))

        # Switch to the next page and increment i.
        if i == max_pages:
            break
        i += 1
        data_page_number_string = "[data-page-number='" + str(i) + "']"
        click_elem_li = driver.find_element(By.CSS_SELECTOR, "li[class=' right-arrow']" + data_page_number_string)
        click_elem_li.find_elements(By.TAG_NAME, "a")[0].click()
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, "submissions-list")))
        time.sleep(10)

    driver.close()


def download_pdf(url, name):
    # Stream the PDF to disk in 1 KiB chunks.
    r = requests.get(url, stream=True)
    with open('%s.pdf' % name, 'wb') as f:
        for chunk in r.iter_content(1024):
            f.write(chunk)
    r.close()


if __name__ == '__main__':
    NIPS = 'https://openreview.net/group?id=NeurIPS.cc/2022/Track/Datasets_and_Benchmarks'
    driver_path = 'C:\\Users\\username\\path\\to\\chromedriver.exe'
    save_dir_nips = 'C:\\Users\\username\\path\\to\\save\\downloaded\\papers'
    download_all_papers(NIPS, save_dir_nips, driver_path)
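Note: recent Selenium 4 releases no longer accept a driver path passed positionally to webdriver.Chrome, so the driver = webdriver.Chrome(driver_path) line above may fail there. A minimal sketch of the equivalent setup, assuming Selenium 4.x and the same placeholder chromedriver path:

# Sketch only: Selenium 4.x driver setup via a Service object.
# Assumes the same chromedriver path as in the script above.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

driver_path = 'C:\\Users\\username\\path\\to\\chromedriver.exe'
driver = webdriver.Chrome(service=Service(executable_path=driver_path))
# ... use the driver exactly as in download_all_papers(), then:
driver.quit()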