Skip to content

Instantly share code, notes, and snippets.

@csarron
Created July 22, 2020 19:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save csarron/2b37586e2d5518ab34fd2cd160eb8702 to your computer and use it in GitHub Desktop.
Save csarron/2b37586e2d5518ab34fd2cd160eb8702 to your computer and use it in GitHub Desktop.
Get openness statistics of conferences from DBLP
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Get openness statistics of top conferences, motivated by
http://s3.eurecom.fr/~balzarot/notes/inbreeding/inbreeding.html
install: pip install requests matplotlib
usage example:
python calc_openness.py MobiCom MobiSys SenSys --plot --save_dir mobile
python calc_openness.py SOSP OSDI EuroSys USENIX FAST ASPLOS NSDI --plot
python calc_openness.py ACL EMNLP NAACL --plot --save_dir nlp
python calc_openness.py CVPR ICCV ECCV --plot --save_dir cv
"""
__author__ = "Qingqing Cao, https://awk.ai/, Twitter@sysnlp"
__copyright__ = "Copyright 2020, MIT LICENSE"
import argparse
import json
import os
from collections import defaultdict
import requests
# Base URL of the DBLP publication search endpoint.
_API_BASE = 'https://dblp.org/search/publ/api?'
# Query template; the two placeholders are filled with the lower-cased
# conference name and an optional year (see gen_author_flags).
# NOTE(review): h=1000 presumably caps the number of hits returned per
# query -- confirm against the DBLP API documentation.
_API_TEMPLATE = _API_BASE + 'q=/conf/{}/{}&format=json&h=1000'
# Global cache filled by gen_author_flags:
# author pid -> {conference name -> set of years with a recorded paper}.
author_record = defaultdict(dict)
# conference names can be found at https://dblp.org/db/conf/
def gen_author_flags(conf, year_str=''):
    """Query DBLP for one conference and record which authors published when.

    Side effect: for every hit that has both authors and a venue, the hit's
    year is added to the module-level ``author_record[pid][conf]`` set. This
    happens before the venue filter below, so it also covers hits whose
    venue string does not contain ``conf``.

    Args:
        conf: Conference name. It is lower-cased for the query URL but
            matched case-sensitively (substring test) against the venue.
        year_str: Optional year appended to the query; the empty string
            queries the conference without a year restriction.

    Returns:
        A ``(years, paper_authors_map)`` tuple: the set of years seen in
        the hits, and a dict mapping paper key -> set of author pids for
        the hits whose venue contains ``conf``. If the request fails, a
        message is printed and ``(set(), {})`` is returned so that callers
        can always unpack the result.
    """
    # NOTE(review): the query template asks for h=1000 hits and no paging
    # is done here, so larger result sets are presumably truncated.
    api = _API_TEMPLATE.format(conf.lower(), year_str)
    try:
        # A timeout keeps the script from hanging on a stalled connection.
        response = requests.get(api, timeout=30)
    except requests.RequestException as err:
        print('request failed, try again or check api endpoint!')
        print(err)
        return set(), {}
    if not response.ok:
        print('request failed, try again or check api endpoint!')
        return set(), {}

    data = json.loads(response.content)
    # Tolerate a response without a hit list instead of raising KeyError.
    all_hits = data['result']['hits'].get('hit', [])

    years = set()
    paper_authors_map = {}
    for item in all_hits:
        info = item['info']
        authors = info.get('authors')
        venue = info.get('venue')
        if authors is None or venue is None:
            # skip records without authors or without a venue
            continue
        year = int(info['year'])
        years.add(year)

        # A single author arrives as a dict, several authors as a list.
        all_authors = authors['author']
        if isinstance(all_authors, dict):
            record_authors = [all_authors]
        else:
            record_authors = all_authors

        paper_authors = set()
        for author in record_authors:
            author_id = author['@pid']
            paper_authors.add(author_id)
            author_record[author_id].setdefault(conf, set()).add(year)

        if isinstance(venue, list):
            # only the first listed venue is checked
            venue = venue[0]
        if conf in venue:  # only consider conf venue
            paper_authors_map[info['key']] = paper_authors
    return years, paper_authors_map
def _count_old_author_papers(paper_authors_map, conf, conf_year):
    """Count papers having at least one author seen at conf before conf_year."""
    def _is_old(author_id):
        author_years = author_record[author_id].get(conf)
        return bool(author_years) and min(author_years) < conf_year

    return sum(
        1 for paper_authors in paper_authors_map.values()
        if any(_is_old(author_id) for author_id in paper_authors)
    )


def _new_fraction(num_new, num_old):
    """Return the share of new-author papers among all counted papers."""
    return num_new / (num_new + num_old)


def _save_fraction_plot(plt, figure_id, years, fractions, title, path):
    """Draw one year-vs-fraction line chart and save it to path."""
    plt.figure(figure_id)
    plt.plot(years, fractions, '-o')
    plt.xticks(years, rotation=45)
    plt.yticks([i / 10 for i in range(0, 11)])
    plt.ylim(0, 1)
    plt.grid(True)
    plt.title(title)
    plt.savefig(path)
    # The Agg backend is forced by the caller, so there is nothing to
    # show; close the figure to release it once it is on disk.
    plt.close()


def main(args):
    """Print (and optionally plot) the fraction of new-author papers.

    For each conference in ``args.conferences`` and each year not earlier
    than ``args.after_years``, a paper counts as "old-author" when at least
    one of its authors has a recorded paper at that conference in an
    earlier year; otherwise it is a "new-author" paper. Per-conference
    counts and the combined per-year fraction are printed. With
    ``args.plot`` set, one PNG per conference plus ``combined.png`` are
    written to ``args.save_dir``.

    Args:
        args: Parsed command-line namespace with the attributes
            ``conferences``, ``after_years``, ``plot`` and ``save_dir``.
    """
    conf_old_papers = defaultdict(dict)
    conf_new_papers = defaultdict(dict)
    all_old = defaultdict(dict)
    all_new = defaultdict(dict)

    for conf in args.conferences:
        result = gen_author_flags(conf)
        if not isinstance(result, tuple):
            # The lookup failed (the message was already printed); skip
            # this conference instead of crashing on the unpack.
            continue
        years, _ = result
        conf_years = sorted(
            (year for year in years if year >= args.after_years),
            reverse=True)

        # Pass 1: fetch every requested year before classifying anything,
        # so author_record already holds all in-window years of each
        # author. Classifying while fetching newest-first would judge a
        # year before the older years had been loaded.
        year_papers = {}
        for conf_year in conf_years:
            result = gen_author_flags(conf, conf_year)
            if isinstance(result, tuple):
                year_papers[conf_year] = result[1]

        # Pass 2: classify the papers of each year, newest first.
        for conf_year in conf_years:
            paper_authors_map = year_papers.get(conf_year)
            if not paper_authors_map:
                continue
            num_old_author_paper = _count_old_author_papers(
                paper_authors_map, conf, conf_year)
            num_new_author_papers = (
                len(paper_authors_map) - num_old_author_paper)
            conf_old_papers[conf][conf_year] = num_old_author_paper
            conf_new_papers[conf][conf_year] = num_new_author_papers
            print(conf, conf_year, num_old_author_paper,
                  num_new_author_papers)
            all_old[conf_year][conf] = num_old_author_paper
            all_new[conf_year][conf] = num_new_author_papers

    # conf stats for all years
    print('all combined:')
    combined_years = []
    combined_fracs = []
    for year in sorted(all_old, reverse=True):
        num_old = sum(all_old[year].values())
        num_new = sum(all_new[year].values())
        frac = _new_fraction(num_new, num_old)
        combined_years.append(year)
        combined_fracs.append(frac)
        print(year, frac, num_new, num_old)

    if not args.plot:
        return

    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.pyplot as plt

    save_dir = args.save_dir
    os.makedirs(save_dir, exist_ok=True)

    figure_id = 0
    for conf, new_data in conf_new_papers.items():
        old_data = conf_old_papers[conf]
        # Separate names keep the combined series above intact; reusing
        # them here would make combined.png show the last conference.
        conf_plot_years = sorted(old_data, reverse=True)
        conf_plot_fracs = [
            _new_fraction(new_data[year], old_data[year])
            for year in conf_plot_years
        ]
        _save_fraction_plot(
            plt, figure_id, conf_plot_years, conf_plot_fracs,
            "{} Fraction of Papers from New Authors".format(conf),
            os.path.join(save_dir, '{}.png'.format(conf)))
        figure_id += 1

    _save_fraction_plot(
        plt, figure_id, combined_years, combined_fracs,
        "Fraction of Papers from New Authors",
        os.path.join(save_dir, 'combined.png'))
if __name__ == '__main__':
    # Command-line entry point: parse the conference names and options,
    # then hand the parsed namespace to main().
    parser = argparse.ArgumentParser()
    parser.add_argument('conferences', type=str, nargs='+',
                        help='list of conferences separate by space')
    parser.add_argument("--plot", action="store_true",
                        help="if true, plot stats")
    # The two help fragments are concatenated, so the first one needs a
    # trailing space to keep the words apart.
    parser.add_argument("--after_years", type=int, default=2000,
                        help="get data after the year, set to 0 "
                             "to get all available dblp data")
    parser.add_argument("--save_dir", type=str, default='.',
                        help="dir to save plot image")
    main(parser.parse_args())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment