Skip to content

Instantly share code, notes, and snippets.

@neubig
Created May 14, 2021 15:31
Show Gist options
  • Save neubig/246f9021049b23910dbe4e01ba5da86b to your computer and use it in GitHub Desktop.
Save neubig/246f9021049b23910dbe4e01ba5da86b to your computer and use it in GitHub Desktop.
Convert OpenReview IDs to Semantic Scholar Papers
import openreview
import argparse
import requests
import time
import sys
import csv
import json
from tqdm import tqdm # Progress bar
# Utility script: read OpenReview profile IDs from a file, resolve each one to
# a Semantic Scholar author, and write that author's papers to a CSV file.
parser = argparse.ArgumentParser(
    description='Export a CSV of Semantic Scholar papers for a list of OpenReview profile IDs',
)
parser.add_argument('--username', help='OpenReview username', required=True)
parser.add_argument('--password', help='OpenReview password', required=True)
parser.add_argument('--csv_input', help='A seed csv file, useful if you want to limit queries to s2', default=None)
parser.add_argument('--csv_output', help='Where to output the csv file', default='s2_expertise.csv')
parser.add_argument('--reviewer_list', help='A list of reviewers, one OpenReview ID per line', default='tilde_members.txt')
parser.add_argument('--baseurl', help='url for openreview', default='https://api.openreview.net')
args = parser.parse_args()

# Client logged in with the credentials given on the command line.
or_client = openreview.Client(baseurl=args.baseurl, username=args.username, password=args.password)
def orid_to_s2id(orid):
    """Return the Semantic Scholar author ID linked from an OpenReview profile.

    Fetches the profile for ``orid`` through the module-level ``or_client`` and
    reads the ``semanticScholar`` URL from the profile content. The ID is taken
    from the last path segment of that URL.

    Args:
        orid: OpenReview profile ID to look up.

    Returns:
        The author ID as an ``int``, or ``None`` when the profile has no
        Semantic Scholar URL or when the lookup or parsing fails (the error is
        then reported on stderr).
    """
    # Deliberately broad: one bad profile must not abort the whole export.
    try:
        profile = or_client.get_profile(orid)
        s2url = profile.content.get('semanticScholar', None)
        if not s2url:
            return None
        # Strip fragment, query string and trailing slash first, so URLs such
        # as ".../author/Name/123/" or ".../123?sort=..." still yield 123.
        path = s2url.split('#', 1)[0].split('?', 1)[0].rstrip('/')
        return int(path.rsplit('/', 1)[-1])
    except Exception as e:
        print(f'Error getting OR profile for {orid}: {e}', file=sys.stderr)
        return None
# Delay, in milliseconds, applied before every request made by query_api().
# It is doubled each time a 429 response is seen and is never reduced again.
sleep_time = 1
# Ceiling for the back-off delay, in milliseconds, so repeated 429 responses
# cannot grow the wait between retries without bound.
_MAX_SLEEP_MS = 60000


def query_api(url, session):
    """GET ``url`` and return the decoded JSON body, backing off on HTTP 429.

    Sleeps for the current global ``sleep_time`` before the request. While the
    response status is 429, the delay is doubled (up to ``_MAX_SLEEP_MS``), a
    warning is printed to stderr, and the request is retried.

    Args:
        url: URL to fetch.
        session: Object with a ``get(url)`` method returning a response that
            has ``status_code`` and ``json()`` (e.g. a ``requests.Session``).

    Returns:
        The parsed JSON body on a 200 response, or ``None`` (after a warning
        on stderr) for any other status or when the body is not valid JSON.
    """
    global sleep_time
    time.sleep(sleep_time / 1000.0)
    r = session.get(url)
    while r.status_code == 429:
        sleep_time = min(sleep_time * 2, _MAX_SLEEP_MS)
        print(
            f'WARNING: Hit rate limit. Increasing sleep to {sleep_time} ms',
            file=sys.stderr,
        )
        time.sleep(sleep_time / 1000.0)
        r = session.get(url)
    if r.status_code != 200:
        print(f'WARNING: Could not access url {url}', file=sys.stderr)
        return None
    try:
        return r.json()
    except ValueError:
        # A 200 with a non-JSON body (e.g. an HTML error page) is treated like
        # any other failed request rather than crashing the caller.
        print(f'WARNING: Invalid JSON returned by url {url}', file=sys.stderr)
        return None
# Cache of already-known papers: paper ID -> (title, abstract).
papers_map = {}
if args.csv_input:
    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields that contain newlines are read back intact.
    with open(args.csv_input, 'r', newline='') as csvfile:
        for entry in csv.reader(csvfile, delimiter=','):
            # Columns 1-3 are read as paper ID, title and abstract; blank or
            # truncated rows are skipped instead of raising IndexError.
            if len(entry) < 4:
                continue
            papers_map[entry[1]] = (entry[2], entry[3])
papers_list = []  # NOTE(review): never read or appended to in the visible script
# For each reviewer ID: resolve the Semantic Scholar author, list that author's
# papers, and write one CSV row (reviewer ID, paper ID, title, abstract) per paper.
with requests.Session() as session, \
        open(args.reviewer_list, 'r') as reviewer_file, \
        open(args.csv_output, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',')
    for my_orid in tqdm(reviewer_file):
        my_orid = my_orid.strip()
        # Blank lines would otherwise trigger a pointless profile lookup for ''.
        if not my_orid:
            continue
        # Get S2 ID from OR ID
        my_s2id = orid_to_s2id(my_orid)
        if not my_s2id:
            continue
        # Get S2 user (https, matching the paper endpoint below)
        user = query_api(f'https://api.semanticscholar.org/v1/author/{my_s2id}', session)
        if not user:
            continue
        # A missing or null 'papers' entry is treated as "no papers".
        for paper in user.get('papers') or []:
            my_pid = paper['paperId']
            if my_pid in papers_map:
                # Retrieve from already-saved papers
                my_title, my_abstract = papers_map[my_pid]
                csvwriter.writerow((my_orid, my_pid, my_title, my_abstract))
                continue
            # Retrieve from S2
            details = query_api(f'https://api.semanticscholar.org/v1/paper/{my_pid}', session)
            if not details:
                continue
            row = (my_orid, my_pid, details['title'], details['abstract'])
            csvwriter.writerow(row)
            # Remember the paper so a later reviewer sharing it needs no second request.
            papers_map[my_pid] = (details['title'], details['abstract'])
            print(row, file=sys.stderr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment