Last active
August 29, 2015 14:21
-
-
Save senderle/327f52fe094e15af3a8c to your computer and use it in GitHub Desktop.
Citation Simulator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A citation simulator that tries to replicate academic citation
# patterns. Using PyPy is recommended. MIT License.
import collections | |
import random | |
import argparse | |
class Article(object):
    """One published article together with its running citation tallies.

    ``journal`` and ``year`` are stored exactly as passed in.  Both tallies
    start at 1 rather than 0 (presumably so that a brand-new article still
    carries a non-zero weight wherever the tallies are used as weights --
    confirm against the callers).
    """

    def __init__(self, journal, year):
        self.journal, self.year = journal, year
        # Both counters deliberately begin at one, not zero.
        self.citations_received = 1
        self.citations_given = 1

    def accept_citation(self, article):
        """Record that this article was cited; ``article`` itself is unused."""
        self.citations_received = self.citations_received + 1

    def give_citation(self, article):
        """Cite ``article``: bump our outgoing tally, then notify the target."""
        self.citations_given = self.citations_given + 1
        article.accept_citation(self)

    # Read-only views over the two counters.
    n_citations_given = property(
        lambda self: self.citations_given,
        doc="Current value of ``citations_given``.")
    n_citations_received = property(
        lambda self: self.citations_received,
        doc="Current value of ``citations_received``.")
class Journal(object):
    """A journal that publishes issues of articles inside a field.

    ``field`` must provide ``generate_citations(article, n)`` and
    ``add_article(article)``; both are called for every new article.
    """

    def __init__(self, field, citations_per_article, articles_per_issue):
        self.articles_per_issue = articles_per_issue
        self.citations_per_article = citations_per_article
        self.field = field
        # Maps year -> list of issues; each issue is a list of articles.
        self.issues = collections.defaultdict(list)

    def new_issue(self, year):
        """Publish one issue for ``year`` containing ``articles_per_issue`` articles.

        Each new article first has its citations generated by the field and
        is only then registered with the field.
        """
        issue = []
        # ``range`` (not ``xrange``) keeps this runnable on Python 2 and 3.
        for _ in range(self.articles_per_issue):
            new_article = Article(self, year)
            self.field.generate_citations(new_article,
                                          self.citations_per_article)
            self.field.add_article(new_article)
            issue.append(new_article)
        self.issues[year].append(issue)

    def _recent_articles(self, n_years):
        """Return every article from the ``n_years`` most recent years with issues."""
        years = sorted(self.issues, reverse=True)[:n_years]
        return [ar for y in years
                for issue in self.issues[y]
                for ar in issue]

    def get_h_index(self, n_years):
        """Return the h-index over the ``n_years`` most recent publication years.

        The result is the largest ``h`` such that at least ``h`` articles in
        the window have ``n_citations_received >= h``; 0 when the window is
        empty.
        """
        ranked = sorted((ar.n_citations_received
                         for ar in self._recent_articles(n_years)),
                        reverse=True)
        h = 0
        # Walk articles from most to least cited.  The article at 1-based
        # rank r supports an h-index of r exactly when it has >= r citations.
        # Ranking individual articles (rather than whole citation tiers)
        # avoids discarding a tier that only partially fits under the
        # threshold, e.g. seven articles with five citations each give 5.
        for rank, cites in enumerate(ranked, 1):
            if cites < rank:
                break
            h = rank
        return h

    def get_n_cites(self, n_years):
        """Return the summed ``n_citations_received`` over the same window."""
        return sum(ar.n_citations_received
                   for ar in self._recent_articles(n_years))
class Field(object):
    """A scholarly field: a set of journals plus the pool of citable articles.

    ``bias`` shapes how strongly sampling favours highly cited articles
    (see ``up_bias`` / ``down_bias``); ``decay`` is the per-update
    probability that an article is dropped from the citable pool;
    ``h_index`` is the window, in years, passed to the journals' h-index
    and citation-count queries.
    """

    def __init__(self, num_journals, citations_per_article,
                 articles_per_issue, issues_per_year, h_index,
                 bias, decay):
        self.issues_per_year = issues_per_year
        self.articles_per_issue = articles_per_issue
        # NOTE(review): add_article() grows this by citations *given* while
        # _thin_field() recomputes it from citations *received*; nothing in
        # this class reads it back.
        self.total_citations = 0
        self.current_year = 0
        self.bias = bias
        self.decay = decay
        self.h_index = h_index
        self.articles = []
        # Snapshot of ``articles`` taken at the last update_sample_space().
        self._active_articles = []
        # Each active article repeated in proportion to its sampling weight.
        self._article_sample_space = []
        self.journals = []
        # ``range`` (not ``xrange``) keeps this runnable on Python 2 and 3.
        for _ in range(num_journals):
            self.journals.append(Journal(self,
                                         citations_per_article,
                                         articles_per_issue))

    @property
    def up_bias(self):
        """``bias`` when it is at least 1, otherwise 1."""
        return 1 if self.bias < 1 else self.bias

    @property
    def down_bias(self):
        """``bias`` when it is at most 1, otherwise 1."""
        return 1 if self.bias > 1 else self.bias

    def update_sample_space(self):
        """Thin the pool, re-rank it, and rebuild the weighted sample space.

        After this call ``articles`` is sorted by citations received,
        most cited first, and the sample space follows the same order.
        """
        self._thin_field()
        self.articles.sort(reverse=True, key=lambda x: x.n_citations_received)
        self._active_articles[:] = self.articles
        ss = self._article_sample_space
        ss[:] = []
        for a in self._active_articles:
            # Weight = citations ** down_bias, truncated to an integer.
            ss.extend([a] * int(a.n_citations_received ** self.down_bias))

    def _thin_field(self):
        """Keep each article only when a uniform draw exceeds ``decay``."""
        self.articles = [a for a in self.articles if
                         random.random() > self.decay]
        self.total_citations = sum(a.n_citations_received
                                   for a in self.articles)

    def add_article(self, article):
        """Add ``article`` to the pool; it becomes citable at the next update."""
        self.articles.append(article)
        self.total_citations += article.n_citations_given

    def sample(self, n_articles):
        """Draw ``n_articles`` times from the sample space; return the distinct hits.

        The returned set can be smaller than ``n_articles`` because repeated
        draws of the same article collapse.  Raises ``IndexError`` when the
        sample space is empty and ``n_articles`` is positive.
        """
        ss_len = len(self._article_sample_space)
        samples = set()
        for _ in range(n_articles):
            # Raising a draw from [0, 1) to a power >= 1 pushes it towards 0,
            # i.e. towards the front (most cited end) of the sample space.
            select = random.random() ** self.up_bias
            select = int(select * ss_len)
            samples.add(self._article_sample_space[select])
        return samples

    def generate_citations(self, article, n_citations):
        """Make ``article`` cite up to ``n_citations`` distinct active articles.

        When the active pool holds no more than ``n_citations`` articles,
        every active article is cited.
        """
        if n_citations >= len(self._active_articles):
            to_cite = self._active_articles
        else:
            selected = set()
            # Re-sample only the shortfall until enough distinct targets exist.
            while len(selected) < n_citations:
                new_citations = n_citations - len(selected)
                selected.update(self.sample(new_citations))
            to_cite = selected
        for a in to_cite:
            article.give_citation(a)

    def simulate_year(self):
        """Publish ``issues_per_year`` rounds of issues, then advance the year.

        The sample space is refreshed after each round, so articles become
        citable one round after they are published.
        """
        for _ in range(self.issues_per_year):
            for journal in self.journals:
                journal.new_issue(self.current_year)
            self.update_sample_space()
        self.current_year += 1

    def citation_rate(self):
        """Return window citations divided by the nominal window article count.

        NOTE(review): the denominator assumes a full ``h_index``-year window
        of articles; before that many years have been simulated fewer
        articles exist, so early values are lower than citations per
        existing article -- confirm this is intended.  Raises
        ``ZeroDivisionError`` when any factor of the denominator is zero.
        """
        total_cites = sum(journal.get_n_cites(self.h_index)
                          for journal in self.journals)
        total_articles = (len(self.journals) *
                          self.issues_per_year *
                          self.articles_per_issue *
                          self.h_index)
        return float(total_cites) / total_articles

    def top_h_indices(self, n=None):
        """Return ``(h_index, journal)`` pairs, highest h first.

        Returns the first ``n`` pairs, or all of them when ``n`` is None.
        """
        h_indices = [(journal.get_h_index(self.h_index), journal)
                     for journal in self.journals]
        # Sort on the h value only.  Sorting the raw tuples falls back to
        # comparing Journal objects on ties, which is arbitrary on Python 2
        # and a TypeError on Python 3.  The sort is stable, so tied journals
        # keep their original relative order.
        h_indices.sort(key=lambda pair: pair[0], reverse=True)
        return h_indices[0:n]
def simulate(n_years, *args, **kwargs):
    """Run a field for ``n_years`` years, printing a report after each year.

    ``*args`` and ``**kwargs`` are forwarded unchanged to ``Field``.  After
    the last year the per-year top, median and mean journal h-indices are
    averaged over ``n_years`` and printed.  Nothing is returned.
    """
    field_one = Field(*args, **kwargs)
    top_h5_sum = 0.0
    med_h5_sum = 0.0
    mean_h5_sum = 0.0
    # ``range`` and single-argument ``print(...)`` behave the same on
    # Python 2 and Python 3.
    for year in range(n_years):
        field_one.simulate_year()
        h_indices = [h for h, j in field_one.top_h_indices()]
        top_h5_sum += h_indices[0]
        # Floor division: a list index must be an int on Python 3 as well.
        med_h5_sum += h_indices[len(h_indices) // 2]
        mean_h5_sum += float(sum(h_indices)) / len(h_indices)
        print("Year {}".format(year))
        print("Citation rate for field: {}".format(field_one.citation_rate()))
        print("Top H Indices:")
        print(h_indices[0:50])
        print("Most citations per article within each decile of active articles:")
        n_active = len(field_one.articles)
        if n_active:
            # First article of each decile of the field's article list.
            sample_indices = [int(n_active * 0.1 * i) for i in range(10)]
            print([field_one.articles[i].n_citations_received
                   for i in sample_indices])
        else:
            # With decay == 1.0 every article is thinned out, so there is
            # nothing to index; report an empty row instead of crashing.
            print([])
    if n_years <= 0:
        # No year was simulated, so there is nothing to average.
        return
    print("Average yearly top journal H index: {}".format(top_h5_sum / n_years))
    print("Average yearly median journal H index: {}".format(med_h5_sum / n_years))
    print("Average yearly mean journal H index: {}".format(mean_h5_sum / n_years))
if __name__ == '__main__':
    # Command-line entry point: parse the field parameters and run the
    # simulation.  Every option has a default, so no argument is required.
    parser = argparse.ArgumentParser(description="Citation simulator.")
    parser.add_argument('-j', '--num-journals', type=int, help='Number of journals.')
    parser.add_argument('-c', '--citations-per-article', type=int, help='Citations per article.')
    parser.add_argument('-a', '--articles-per-issue', type=int, help='Articles per issue.')
    parser.add_argument('-i', '--issues-per-year', type=int, help='Issues per year.')
    parser.add_argument('-x', '--h-index', type=int, help='H index value. Size of the citation '
                        'window in years.')
    parser.add_argument('-b', '--bias', type=float, help='Bias towards highly-cited articles. '
                        'Should be a positive floating point number. 1.0 is a standard rich-get-richer bias; '
                        'lower is less biased towards rich, higher is more biased.')
    parser.add_argument('-d', '--decay', type=float, help='Probability that an article will '
                        'drop out of the field of citable articles next year. Should be a value between '
                        '0.0 and 1.0.')
    parser.add_argument('-y', '--years', type=int, help='Number of years to simulate.')
    parser.set_defaults(num_journals=500, citations_per_article=10, articles_per_issue=10,
                        issues_per_year=5, h_index=5, bias=1, decay=0.75, years=30)
    args = parser.parse_args()
    # A negative bias turns the sampling weight of every article with more
    # than one citation into zero, which can leave too few distinct articles
    # to cite and stall or crash citation sampling.
    if args.bias < 0:
        parser.error('--bias must not be negative')
    if args.years < 1:
        parser.error('--years must be at least 1')
    # Copy before popping so the parsed namespace itself is left intact;
    # everything except ``years`` is forwarded to the field constructor.
    options = dict(vars(args))
    n_years = options.pop('years')
    simulate(n_years, **options)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment