Created
May 24, 2017 22:17
-
-
Save anonymous/1d9eced391183c64993fb606e29d4ac1 to your computer and use it in GitHub Desktop.
Stats on ICML 2017 accepted papers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import re | |
import collections | |
# Alias table for author names (lowercase keys and values).  It is empty, so
# no author name is currently rewritten to another spelling.
CANONICAL_NAMES = dict()

# Alias table for institutions.  The pairs below may be written in any case:
# the comprehension lowercases both the alias and the canonical spelling, so
# the finished mapping contains lowercase text only.
CANONICAL_INSTITUTIONS = {
    alias.lower(): canonical.lower()
    for alias, canonical in (
        ('google deepmind', 'deepmind'),
        ('deep mind', 'deepmind'),
        ('Google Research, NY', 'google'),
        ('Google Research', 'google'),
        ('Google Brain', 'google'),
        ('Google Inc.', 'google'),
        ('cmu', 'carnegie mellon university'),
        ('microsoft research, india', 'microsoft research'),
        ('ucl', 'university college london'),
        ('iit kanpur', 'indian institute of technology kanpur'),
        ('duke', 'duke university'),
        ('mit', 'massachusetts institute of technology'),
        ('MIT CSAIL', 'massachusetts institute of technology'),
        ('nyu', 'New York University'),
        ('Georgia Tech', 'Georgia Institute of Technology'),
        ('Ecole Polytechnique de Montreal', 'École Polytechnique de Montréal'),
    )
}
def canonicalize(name, canonical):
    """Return the canonical, lowercase spelling of ``name``.

    ``name`` is lowercased and looked up in the ``canonical`` mapping.  When
    the mapping has an entry for it, that entry is returned; otherwise the
    lowercased name itself is returned.
    """
    key = name.lower()
    if key in canonical:
        return canonical[key]
    return key
class LazyLoadingPage(object):
    """A web page that is downloaded and parsed only when first needed.

    Nothing is fetched at construction time.  The HTTP response is requested
    on the first access to ``req`` and the parsed document is built on the
    first access to ``soup``; both are cached on the instance afterwards.
    """

    def __init__(self, url, timeout=30):
        """Remember where the page lives without fetching it.

        Args:
            url: Address of the page to download.
            timeout: Seconds passed to ``requests.get`` as its ``timeout``
                argument, so a stalled server cannot block the script
                forever.
        """
        self.url = url
        self.timeout = timeout
        self._req = None
        self._soup = None

    @property
    def req(self):
        """HTTP response for ``url``, fetched once and then cached.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        if self._req is None:
            response = requests.get(self.url, timeout=self.timeout)
            # Check the status before caching so that an error page is never
            # stored and later parsed as if it were the real content.
            response.raise_for_status()
            self._req = response
        return self._req

    @property
    def soup(self):
        """Parsed HTML of the response body, built once and then cached."""
        if self._soup is None:
            self._soup = BeautifulSoup(self.req.text, "html.parser")
        return self._soup

    def __str__(self):
        return "{cls}(url: {url})".format(
            cls=self.__class__.__name__,
            url=self.url)
class PapersList(LazyLoadingPage):
    """Page whose ``<p>`` elements each describe one paper."""

    @property
    def papers(self):
        """Lazily yield a ``Paper`` for every ``<p>`` in the listing column.

        The paragraphs are looked up inside the ``div`` with class
        ``col-xs-9`` under the ``<main>`` element.  Each access to this
        property searches the parsed page again and returns a fresh
        generator.
        """
        main_section = self.soup.find('main')
        listing_column = main_section.find('div', class_='col-xs-9')
        paragraphs = listing_column.find_all('p')
        return (Paper(paragraph) for paragraph in paragraphs)
class Paper(object):
    """One paper entry, backed by the parsed element that describes it."""

    def __init__(self, soup):
        self.soup = soup

    @property
    def title(self):
        """Text of the entry's first ``<b>`` element."""
        title_tag = self.soup.find('b')
        return title_tag.text

    @property
    def authors(self):
        """Fresh generator of ``Author`` objects, in listing order.

        The text of the first ``<i>`` element is split on the middle dot;
        each piece is stripped of surrounding whitespace and handed to
        ``Author``.
        """
        byline = self.soup.find('i').text
        pieces = byline.split('·')
        return (Author(piece.strip()) for piece in pieces)

    @property
    def first_author(self):
        """The first ``Author`` produced by ``authors``."""
        author_stream = self.authors
        return next(author_stream)

    @property
    def last_author(self):
        """The last ``Author`` produced by ``authors``."""
        everyone = list(self.authors)
        return everyone[-1]
class Author(object):
    """An author's canonical name and institution.

    Both are parsed from one listing entry of the form ``"Name (Institution)"``.
    """

    # Compiled once for the class instead of on every construction.
    _NAME_INSTITUTION = re.compile(r'([^)]*) \(([^)]*)\)')

    def __init__(self, name_inst):
        """Split ``name_inst`` into ``name`` and ``institution``.

        Args:
            name_inst: Entry text such as ``"Jane Doe (Some University)"``.

        An entry without a parenthesised institution is kept as a name with
        an empty institution rather than raising ``AttributeError`` on the
        failed match.
        """
        match = self._NAME_INSTITUTION.match(name_inst)
        if match is None:
            # re.match returns None when the pattern does not apply; use the
            # whole entry as the name and leave the institution blank.
            raw_name, raw_institution = name_inst, ''
        else:
            raw_name, raw_institution = match.group(1), match.group(2)
        self.name = canonicalize(raw_name, CANONICAL_NAMES)
        self.institution = canonicalize(raw_institution, CANONICAL_INSTITUTIONS)
def non_empty(seq):
    """Lazily yield the items of ``seq`` whose length is not zero."""
    # filter(len, ...) keeps exactly the items for which len(item) is non-zero.
    yield from filter(len, seq)
def show_top_n(counter, n=10):
    """Print the ``n`` most common entries of ``counter``, one per line.

    Each line shows the count (padded to three columns) followed by the
    counted value.
    """
    row_template = "{: 3} {}"
    for entry in counter.most_common(n):
        value, count = entry
        print(row_template.format(count, value))
def take_n(seq, n=10):
    """Lazily yield at most the first ``n`` items of ``seq``.

    Args:
        seq: Any iterable.
        n: Maximum number of items to yield; zero or less yields nothing.

    The loop stops right after the ``n``-th item has been yielded, so exactly
    ``n`` items come out (not ``n + 1``) and no further element is pulled
    from ``seq``.
    """
    if n <= 0:
        return
    for position, item in enumerate(seq, start=1):
        yield item
        if position >= n:
            break
ICML_URL = "https://2017.icml.cc/Conferences/2017/AcceptedPapersInitial"


def main():
    """Print ranking tables for the papers listed at ``ICML_URL``."""
    icml = PapersList(ICML_URL)
    rule = "------------------"

    # (heading, selector) pairs.  Each selector turns the stream of papers
    # into the stream of values that gets counted for that table.
    rankings = (
        ("Top first authors:\n",
         lambda papers: (paper.first_author.name for paper in papers)),
        ("Top last authors:\n",
         lambda papers: (paper.last_author.name for paper in papers)),
        ("Top institutions by first author:\n",
         lambda papers: non_empty(
             paper.first_author.institution for paper in papers)),
        ("Top institutions by last author:\n",
         lambda papers: non_empty(
             paper.last_author.institution for paper in papers)),
        ("Top institutions by all authors:\n",
         lambda papers: non_empty(
             author.institution
             for paper in papers for author in paper.authors)),
    )
    for heading, select in rankings:
        print(rule)
        print(heading)
        # The paper list is only read after the heading has been printed.
        tally = collections.Counter(select(icml.papers))
        show_top_n(tally, 10)

    print(rule)
    print("Top papers by number of authors:\n")

    def negated_author_count(paper):
        # Negative count so that an ascending sort lists the largest
        # author lists first.
        return -len(list(paper.authors))

    for paper in take_n(sorted(icml.papers, key=negated_author_count)):
        author_count = len(list(paper.authors))
        print("{: 3} {}".format(author_count, paper.title))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment