Skip to content

Instantly share code, notes, and snippets.

Created May 24, 2017 22:17
Show Gist options
  • Save anonymous/1d9eced391183c64993fb606e29d4ac1 to your computer and use it in GitHub Desktop.
Save anonymous/1d9eced391183c64993fb606e29d4ac1 to your computer and use it in GitHub Desktop.
Stats on ICML 2017 accepted papers
import requests
from bs4 import BeautifulSoup
import re
import collections
# Hand-maintained alias tables used to merge different spellings of the same
# author or institution. canonicalize() lower-cases its input before the
# lookup, so every key and value stored here is lower-case.
CANONICAL_NAMES = {}

# Each entry pairs a canonical institution name with the alternative
# spellings that should be folded into it.
_INSTITUTION_ALIASES = (
    ('deepmind', ('google deepmind', 'deep mind')),
    ('google', ('Google Research, NY', 'Google Research', 'Google Brain', 'Google Inc.')),
    ('carnegie mellon university', ('cmu',)),
    ('microsoft research', ('microsoft research, india',)),
    ('university college london', ('ucl',)),
    ('indian institute of technology kanpur', ('iit kanpur',)),
    ('duke university', ('duke',)),
    ('massachusetts institute of technology', ('mit', 'MIT CSAIL')),
    ('New York University', ('nyu',)),
    ('Georgia Institute of Technology', ('Georgia Tech',)),
    ('École Polytechnique de Montréal', ('Ecole Polytechnique de Montreal',)),
)

# Flat alias -> canonical mapping, lower-cased on both sides.
CANONICAL_INSTITUTIONS = {
    alias.lower(): canonical_name.lower()
    for canonical_name, aliases in _INSTITUTION_ALIASES
    for alias in aliases
}
def canonicalize(name, canonical):
    """Return the canonical form of ``name``.

    ``name`` is lower-cased and looked up in the ``canonical`` mapping.
    The mapped value is returned when the lower-cased name is a key;
    otherwise the lower-cased name itself is returned.
    """
    lowered = name.lower()
    if lowered in canonical:
        return canonical[lowered]
    return lowered
class LazyLoadingPage(object):
    """A web page that is downloaded and parsed only when first needed.

    Nothing is fetched at construction time. The HTTP response is requested
    on the first access to ``req`` and the parsed document is built on the
    first access to ``soup``; both are cached on the instance afterwards.

    Args:
        url: address of the page to load.
        timeout: seconds ``requests.get`` may wait on the server before
            giving up. Without a timeout a stalled server would block the
            script forever.
    """

    def __init__(self, url, timeout=30):
        self.url = url
        self.timeout = timeout
        self._req = None
        self._soup = None

    @property
    def req(self):
        """The HTTP response for ``url``, fetched on first access.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        if self._req is None:
            response = requests.get(self.url, timeout=self.timeout)
            # Fail loudly on an error status instead of parsing an error page.
            response.raise_for_status()
            # Cache only after the status check so a failed fetch is retried
            # on the next access.
            self._req = response
        return self._req

    @property
    def soup(self):
        """The response body parsed with BeautifulSoup's ``html.parser``."""
        if self._soup is None:
            self._soup = BeautifulSoup(self.req.text, "html.parser")
        return self._soup

    def __str__(self):
        return "{cls}(url: {url})".format(
            cls=self.__class__.__name__,
            url=self.url)
class PapersList(LazyLoadingPage):
    """A lazily loaded page that lists papers, one per ``<p>`` element."""

    @property
    def papers(self):
        """Generator of ``Paper`` objects, one per ``<p>`` in the listing.

        The paragraphs are taken from the ``div`` with class ``col-xs-9``
        inside the page's ``<main>`` element.
        """
        main_section = self.soup.find('main')
        listing = main_section.find('div', class_='col-xs-9')
        paragraphs = listing.find_all('p')
        return (Paper(paragraph) for paragraph in paragraphs)
class Paper(object):
    """One paper entry, backed by the parsed HTML element that describes it.

    The title is read from the element's first ``<b>`` tag and the authors
    from its first ``<i>`` tag, where entries are separated by ``·``.
    """

    def __init__(self, soup):
        self.soup = soup

    @property
    def title(self):
        """Text of the first ``<b>`` tag."""
        bold_tag = self.soup.find('b')
        return bold_tag.text

    @property
    def authors(self):
        """Fresh generator of ``Author`` objects, in listed order."""
        author_line = self.soup.find('i').text
        entries = author_line.split('·')
        return (Author(entry.strip()) for entry in entries)

    @property
    def first_author(self):
        """The first listed author."""
        author_iter = self.authors
        return next(author_iter)

    @property
    def last_author(self):
        """The last listed author."""
        everyone = list(self.authors)
        return everyone[-1]
class Author(object):
    """One author entry of a paper, parsed from ``"Name (Institution)"``.

    After construction ``name`` and ``institution`` hold the canonicalized
    (lower-cased, alias-resolved) values. ``institution`` is the empty
    string when the entry carries no parenthesised institution.

    Args:
        name_inst: the raw author entry, e.g. ``"Jane Doe (MIT)"``.
    """

    def __init__(self, name_inst):
        match = re.match(r'([^)]*) \(([^)]*)\)', name_inst)
        if match is None:
            # The entry has no " (institution)" part. Treat the whole text as
            # the name instead of failing on ``None.group``.
            raw_name, raw_institution = name_inst.strip(), ''
        else:
            raw_name, raw_institution = match.groups()
        self.name = canonicalize(raw_name, CANONICAL_NAMES)
        self.institution = canonicalize(raw_institution, CANONICAL_INSTITUTIONS)
def non_empty(seq):
    """Lazily yield the items of ``seq`` whose ``len()`` is not zero."""
    # filter(len, ...) keeps exactly the items with a non-zero length.
    yield from filter(len, seq)
def show_top_n(counter, n=10):
    """Print the ``n`` most common entries of ``counter``, one per line.

    Each line holds the count, formatted with ``{: 3}``, followed by a space
    and the counted value.
    """
    row_template = "{: 3} {}"
    for entry, occurrences in counter.most_common(n):
        print(row_template.format(occurrences, entry))
def take_n(seq, n=10):
    """Lazily yield at most the first ``n`` items of ``seq``.

    Yields nothing when ``n`` is zero or negative. Iteration stops right
    after the ``n``-th item, so no extra element is pulled from ``seq``.
    """
    if n <= 0:
        return
    # Counting from 1 makes ``taken`` the number of items yielded so far.
    for taken, item in enumerate(seq, start=1):
        yield item
        if taken >= n:
            return
# Address of the ICML 2017 accepted-papers page that main() downloads and parses.
ICML_URL = "https://2017.icml.cc/Conferences/2017/AcceptedPapersInitial"
def _print_section(heading, values):
    """Print a separator line, ``heading``, then the ten most common ``values``."""
    print("------------------")
    print(heading)
    show_top_n(collections.Counter(values), 10)


def main():
    """Download the ICML 2017 accepted-papers page and print statistics.

    Prints, in order: the most frequent first authors, last authors,
    institutions of first authors, institutions of last authors,
    institutions over all authors, and the papers with the most authors.
    """
    icml = PapersList(ICML_URL)

    # Walk the page and parse each paper's author entries once, then reuse
    # the result for every report below instead of re-parsing per report.
    parsed = [(paper, list(paper.authors)) for paper in icml.papers]
    author_lists = [authors for _, authors in parsed]

    _print_section(
        "Top first authors:\n",
        (authors[0].name for authors in author_lists))
    _print_section(
        "Top last authors:\n",
        (authors[-1].name for authors in author_lists))
    _print_section(
        "Top institutions by first author:\n",
        non_empty(authors[0].institution for authors in author_lists))
    _print_section(
        "Top institutions by last author:\n",
        non_empty(authors[-1].institution for authors in author_lists))
    _print_section(
        "Top institutions by all authors:\n",
        non_empty(author.institution
                  for authors in author_lists
                  for author in authors))

    print("------------------")
    print("Top papers by number of authors:\n")
    # Negating the key sorts largest first and keeps page order among ties.
    by_author_count = sorted(parsed, key=lambda entry: -len(entry[1]))
    for paper, authors in take_n(by_author_count):
        print("{: 3} {}".format(len(authors), paper.title))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment