@fgolemo
Last active January 17, 2022 19:23
Crawl ICLR 22 review scores, confidences, titles, links for visualization.
### ICLR 2022 Scraper
import os.path
import pickle
import urllib.request
import json
from math import ceil
from multiprocessing import Queue, Process
from tqdm import trange, tqdm
URL_TITLES = "https://api.openreview.net/notes?details=replyCount%2Cinvitation%2Coriginal&offset={start}&limit=50&invitation=ICLR.cc%2F2022%2FConference%2F-%2FBlind_Submission"
URL_REVIEWS = "https://api.openreview.net/notes?forum={paper_id}&trash=true&details=replyCount%2Cwritable%2Crevisions%2Coriginal%2Coverwriting%2Cinvitation%2Ctags"
TOTAL_SUBMISSIONS = 2855 # from pulling the URL once manually and looking at the count
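# Hypothetical alternative (untested sketch, not part of the original gist): the
# same /notes endpoint appears to return a "count" field alongside "notes", so
# the total could be read from the first page instead of hard-coding it:
#   with urllib.request.urlopen(URL_TITLES.format(start=0)) as url:
#       TOTAL_SUBMISSIONS = json.loads(url.read().decode())["count"]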
PROCESSES = 5 # how many workers are pulling reviews
paper_data = {
    "paper_ids": [],
    "paper_links": [],
    "paper_titles": [],
}

# Page through all blind submissions, 50 at a time, collecting ids, titles, and links.
pages = ceil(TOTAL_SUBMISSIONS / 50)
for p_no in trange(pages):
    with urllib.request.urlopen(URL_TITLES.format(start=p_no * 50)) as url:
        data = json.loads(url.read().decode())
        for note in data["notes"]:
            paper_id = note["id"]
            paper_title = note["content"]["title"]
            paper_link = f"https://openreview.net/forum?id={paper_id}"
            paper_data["paper_ids"].append(paper_id)
            paper_data["paper_titles"].append(paper_title)
            paper_data["paper_links"].append(paper_link)

def pull_reviews(q, paper_ids, paper_titles, paper_links):
    """Worker: fetch the forum of each paper and push its scores/confidences onto the queue."""
    for paper_idx, paper_id in enumerate(paper_ids):
        paper_score = []
        paper_confs = []
        with urllib.request.urlopen(URL_REVIEWS.format(paper_id=paper_id)) as url:
            data = json.loads(url.read().decode())
            for note in data["notes"]:
                # Only official reviews carry a "recommendation"; values look like
                # "8: accept, good paper", so keep the leading number.
                if "recommendation" in note["content"]:
                    paper_rec = note["content"]["recommendation"]
                    paper_rec = paper_rec.split(":")[0]
                    paper_score.append(int(paper_rec))
                    paper_conf = note["content"]["confidence"]
                    paper_conf = paper_conf.split(":")[0]
                    paper_confs.append(int(paper_conf))
        q.put([paper_id, paper_titles[paper_idx], paper_links[paper_idx], paper_score, paper_confs])

# Split the paper lists into one chunk per worker. ceil avoids a leftover chunk:
# int() floor-division could create PROCESSES + 1 chunks, and the last one would
# never be assigned to a worker, so the consumer loop below would hang.
chunk_len = ceil(len(paper_data["paper_ids"]) / PROCESSES)

def make_chunks(lst):
    return [lst[i : i + chunk_len] for i in range(0, len(lst), chunk_len)]

paper_id_chunks = make_chunks(paper_data["paper_ids"])
paper_title_chunks = make_chunks(paper_data["paper_titles"])
paper_link_chunks = make_chunks(paper_data["paper_links"])

# Spawn the workers; each pulls the reviews for its own chunk of papers.
q = Queue()
procs = []
for proc_id in range(PROCESSES):
    p = Process(
        target=pull_reviews,
        args=(q, paper_id_chunks[proc_id], paper_title_chunks[proc_id], paper_link_chunks[proc_id]),
    )
    p.start()
    procs.append(p)

paper_data2 = {
    "paper_ids": [],
    "paper_links": [],
    "paper_titles": [],
    "paper_scores": [],
    "paper_confidences": [],
    "paper_scores_mean": [],
    "paper_confidences_mean": [],
}

# Drain the queue in the main process, checkpointing the pickle every 100 papers.
with tqdm(total=TOTAL_SUBMISSIONS) as pbar:
    counter = 0
    while True:
        paper_id, paper_title, paper_link, paper_scores, paper_confs = q.get(block=True)
        paper_data2["paper_ids"].append(paper_id)
        paper_data2["paper_titles"].append(paper_title)
        paper_data2["paper_links"].append(paper_link)
        paper_data2["paper_scores"].append(paper_scores)
        # Guard against papers with no posted reviews yet; an empty list would
        # otherwise raise ZeroDivisionError.
        paper_data2["paper_scores_mean"].append(sum(paper_scores) / len(paper_scores) if paper_scores else 0.0)
        paper_data2["paper_confidences"].append(paper_confs)
        paper_data2["paper_confidences_mean"].append(sum(paper_confs) / len(paper_confs) if paper_confs else 0.0)
        counter += 1
        if counter % 100 == 0:
            print(len(paper_data2["paper_ids"]))
            with open(os.path.expanduser("~/iclr22-papers.pickle"), "wb") as f:
                pickle.dump(paper_data2, f)
            pbar.update(100)
        if counter == TOTAL_SUBMISSIONS:
            with open(os.path.expanduser("~/iclr22-papers.pickle"), "wb") as f:
                pickle.dump(paper_data2, f)
            print("done")
            break

for p in procs:
    p.join()
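
Since the description mentions visualization, here is a minimal, hypothetical sketch of one way to plot the resulting pickle. It assumes matplotlib is installed; the scatter of mean score against mean reviewer confidence is an illustrative choice, not part of the original gist:

import os.path
import pickle
import matplotlib.pyplot as plt

# Load the checkpoint written by the scraper above.
with open(os.path.expanduser("~/iclr22-papers.pickle"), "rb") as f:
    papers = pickle.load(f)

# One dot per paper: mean review score vs. mean reviewer confidence.
plt.scatter(papers["paper_scores_mean"], papers["paper_confidences_mean"], s=5, alpha=0.3)
plt.xlabel("mean review score")
plt.ylabel("mean reviewer confidence")
plt.title("ICLR 2022 submissions")
plt.show()
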
@ankitkv commented Dec 5, 2021

Hi! I think you forgot to initialize paper_data2["paper_confidences_mean"] to [].

@fgolemo (Author) commented Jan 17, 2022

A bit late, but yeah, that's right. Thanks for pointing that out, @ankitkv! 😄
