Skip to content

Instantly share code, notes, and snippets.

@senderle
Last active August 29, 2015 14:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save senderle/327f52fe094e15af3a8c to your computer and use it in GitHub Desktop.
Citation Simulator
# A Citation Simulator that tries to replicate academic citation
# patterns. Using pypy is recommended. MIT License.
import collections
import random
import argparse
class Article(object):
    """One published article together with its running citation tallies.

    Attributes:
        journal: The journal object the article was created for.
        year: The simulation year in which the article was published.
        citations_given: Tally of outgoing citations; starts at 1.
        citations_received: Tally of incoming citations; starts at 1.
    """

    def __init__(self, journal, year):
        self.journal, self.year = journal, year
        # Both tallies are seeded with 1 rather than 0.
        # NOTE(review): presumably so that an uncited article still gets a
        # nonzero weight when the field builds its sample space -- confirm.
        self.citations_received = 1
        self.citations_given = 1

    @property
    def n_citations_received(self):
        """Current value of the incoming-citation tally."""
        return self.citations_received

    @property
    def n_citations_given(self):
        """Current value of the outgoing-citation tally."""
        return self.citations_given

    def accept_citation(self, article):
        """Record one incoming citation.

        The citing `article` is accepted for symmetry with
        `give_citation` but is not stored anywhere.
        """
        self.citations_received = self.citations_received + 1

    def give_citation(self, article):
        """Cite `article`: bump our outgoing tally and its incoming tally."""
        self.citations_given = self.citations_given + 1
        article.accept_citation(self)
class Journal(object):
    """A journal that publishes fixed-size issues into a shared field.

    Attributes:
        field: The owning field; it supplies `generate_citations` and
            `add_article` for every newly created article.
        citations_per_article: Number of citations requested for each new
            article.
        articles_per_issue: Number of articles created per issue.
        issues: Mapping of year -> list of issues, where each issue is a
            list of articles.
    """

    def __init__(self, field, citations_per_article, articles_per_issue):
        self.articles_per_issue = articles_per_issue
        self.citations_per_article = citations_per_article
        self.field = field
        self.issues = collections.defaultdict(list)

    def new_issue(self, year):
        """Publish one issue dated `year` and file it under that year.

        Each article hands out its citations before it is added to the
        field's article list.
        """
        issue = []
        # `range` instead of `xrange` keeps this runnable on Python 2 and 3.
        for _ in range(self.articles_per_issue):
            new_article = Article(self, year)
            self.field.generate_citations(new_article,
                                          self.citations_per_article)
            self.field.add_article(new_article)
            issue.append(new_article)
        self.issues[year].append(issue)

    def _recent_articles(self, n_years):
        """Return every article from the `n_years` most recent years on file."""
        recent_years = sorted(self.issues, reverse=True)[:n_years]
        return [article
                for year in recent_years
                for issue in self.issues[year]
                for article in issue]

    def get_h_index(self, n_years):
        """Return the h-index over the `n_years` most recent publication years.

        The h-index is the largest h such that at least h articles have a
        received-citation tally of at least h. Returns 0 when the journal
        has no articles in the window.
        """
        tiers = collections.Counter(
            article.n_citations_received
            for article in self._recent_articles(n_years))
        h_index = 0
        cumulative = 0  # articles with at least `n_cites` citations
        for n_cites in sorted(tiers, reverse=True):
            cumulative += tiers[n_cites]
            # A tier that overshoots still contributes: `n_cites` of its
            # articles qualify. Skipping it entirely would make, e.g.,
            # seven articles with 5 citations each score 0 instead of 5.
            h_index = max(h_index, min(n_cites, cumulative))
            if cumulative >= n_cites:
                # Lower tiers have fewer citations than the h already found.
                break
        return h_index

    def get_n_cites(self, n_years):
        """Return the summed received-citation tallies over the same window."""
        return sum(article.n_citations_received
                   for article in self._recent_articles(n_years))
class Field(object):
    """An academic field: its journals plus the pool of citable articles.

    Args:
        num_journals: Number of journals created for the field.
        citations_per_article: Citations each new article tries to give.
        articles_per_issue: Articles per journal issue.
        issues_per_year: Issues every journal publishes per simulated year.
        h_index: Size, in years, of the window used for h-index and
            citation-rate statistics.
        bias: Rich-get-richer strength. Values above 1 skew the random
            draw towards the front of the sample space (`up_bias`);
            values below 1 flatten the per-article weights (`down_bias`).
        decay: Threshold used when thinning; an article survives a
            thinning pass only when `random.random() > decay`.
    """

    def __init__(self, num_journals, citations_per_article,
                 articles_per_issue, issues_per_year, h_index,
                 bias, decay):
        self.issues_per_year = issues_per_year
        self.articles_per_issue = articles_per_issue
        self.total_citations = 0
        self.current_year = 0
        self.bias = bias
        self.decay = decay
        self.h_index = h_index
        self.articles = []
        self._active_articles = []
        self._article_sample_space = []
        self.journals = []
        # `range` instead of `xrange` keeps this runnable on Python 2 and 3.
        for _ in range(num_journals):
            self.journals.append(Journal(self,
                                         citations_per_article,
                                         articles_per_issue))

    @property
    def up_bias(self):
        """Exponent applied to the random draw in `sample`; never below 1."""
        return 1 if self.bias < 1 else self.bias

    @property
    def down_bias(self):
        """Exponent applied to per-article weights; never above 1."""
        return 1 if self.bias > 1 else self.bias

    def update_sample_space(self):
        """Thin the field, then rebuild the weighted sample space.

        Articles are ordered by received citations, highest first, and
        each one appears `int(n_citations_received ** down_bias)` times.
        Both cached lists are refreshed in place.
        """
        self._thin_field()
        self.articles.sort(reverse=True, key=lambda x: x.n_citations_received)
        self._active_articles[:] = self.articles
        exponent = self.down_bias
        sample_space = self._article_sample_space
        del sample_space[:]
        for article in self._active_articles:
            weight = int(article.n_citations_received ** exponent)
            sample_space.extend([article] * weight)

    def _thin_field(self):
        """Randomly drop articles and recompute `total_citations`.

        This runs on every `update_sample_space` call, which
        `simulate_year` makes once per issue round rather than once per
        year.
        """
        self.articles = [a for a in self.articles if
                         random.random() > self.decay]
        # Recomputed from citations *received*, whereas `add_article`
        # increments by citations *given*.
        self.total_citations = sum(a.n_citations_received
                                   for a in self.articles)

    def add_article(self, article):
        """Add `article` to the field and count its outgoing citations."""
        self.articles.append(article)
        self.total_citations += article.n_citations_given

    def sample(self, n_articles):
        """Draw `n_articles` times from the sample space.

        Returns:
            The set of distinct articles drawn, so it may hold fewer than
            `n_articles` items.
        """
        sample_space = self._article_sample_space
        ss_len = len(sample_space)
        exponent = self.up_bias
        samples = set()
        for _ in range(n_articles):
            # Raising a draw in [0, 1) to a power >= 1 pushes it towards 0,
            # i.e. towards the most-cited end of the sorted sample space.
            position = int(random.random() ** exponent * ss_len)
            samples.add(sample_space[position])
        return samples

    def generate_citations(self, article, n_citations):
        """Make `article` cite `n_citations` distinct active articles.

        When the active pool holds no more than `n_citations` articles,
        every active article is cited instead.
        """
        if n_citations >= len(self._active_articles):
            to_cite = self._active_articles
        else:
            selected = set()
            # Redraw until enough *distinct* articles have been collected.
            while len(selected) < n_citations:
                new_citations = n_citations - len(selected)
                selected.update(self.sample(new_citations))
            to_cite = selected
        for cited in to_cite:
            article.give_citation(cited)

    def simulate_year(self):
        """Publish one year of issues for every journal, then advance the year.

        The sample space is rebuilt after each round of issues.
        """
        for _ in range(self.issues_per_year):
            for journal in self.journals:
                journal.new_issue(self.current_year)
            self.update_sample_space()
        self.current_year += 1

    def citation_rate(self):
        """Return mean received citations per article over the h-index window.

        The denominator counts only years that have actually been
        simulated, so the rate is not diluted while fewer than `h_index`
        years exist. Returns 0.0 when the window holds no articles.
        """
        total_cites = sum(journal.get_n_cites(self.h_index)
                          for journal in self.journals)
        years_in_window = min(self.h_index, self.current_year)
        total_articles = (len(self.journals) *
                          self.issues_per_year *
                          self.articles_per_issue *
                          years_in_window)
        if total_articles <= 0:
            return 0.0
        return float(total_cites) / total_articles

    def top_h_indices(self, n=None):
        """Return `(h_index, journal)` pairs, highest h-index first.

        Args:
            n: Maximum number of pairs to return; `None` returns all.
        """
        h_indices = [(journal.get_h_index(self.h_index), journal)
                     for journal in self.journals]
        # Sort on the h-index alone. Comparing whole tuples would fall back
        # to comparing Journal objects on ties, which is arbitrary on
        # Python 2 and a TypeError on Python 3.
        h_indices.sort(key=lambda pair: pair[0], reverse=True)
        return h_indices[0:n]
def simulate(n_years, *args, **kwargs):
    """Run the simulation for `n_years` and print per-year statistics.

    Args:
        n_years: Number of years to simulate.
        *args, **kwargs: Passed straight through to `Field`.

    After every year this prints the field's citation rate, up to 50 of
    the highest journal h-indices, and the received-citation tally at the
    start of each decile of the active article list. After the last year
    it prints the yearly averages of the top, median and mean journal
    h-index.
    """
    field_one = Field(*args, **kwargs)
    top_h5_sum = 0.0
    med_h5_sum = 0.0
    mean_h5_sum = 0.0
    # `range`, `//` and single-argument `print(...)` calls behave the same
    # on Python 2 and Python 3.
    for year in range(n_years):
        field_one.simulate_year()
        h_indices = [h for h, _ in field_one.top_h_indices()]
        top_h5_sum += h_indices[0]
        # Floor division: a list index must be an int on Python 3.
        med_h5_sum += h_indices[len(h_indices) // 2]
        mean_h5_sum += float(sum(h_indices)) / len(h_indices)
        print("Year {}".format(year))
        print("Citation rate for field: {}".format(field_one.citation_rate()))
        print("Top H Indices:")
        print(h_indices[0:50])
        n_active = len(field_one.articles)
        # With no active articles left (e.g. decay=1.0) there is nothing
        # to index, so report an empty list instead of raising IndexError.
        if n_active:
            sample_indices = [int(n_active * 0.1 * i) for i in range(10)]
        else:
            sample_indices = []
        print("Most citations per article within each decile of active articles:")
        print([field_one.articles[i].n_citations_received
               for i in sample_indices])
    print("Average yearly top journal H index: {}".format(
        top_h5_sum / n_years))
    print("Average yearly median journal H index: {}".format(
        med_h5_sum / n_years))
    print("Average yearly mean journal H index: {}".format(
        mean_h5_sum / n_years))
if __name__ == '__main__':
    # Command-line entry point: every Field constructor argument is exposed
    # as an option, and the simulation always runs for 30 years.
    cli = argparse.ArgumentParser(description="Citation simulator.")

    # (short flag, long flag, value type, help text) for each option.
    option_table = (
        ('-j', '--num-journals', int, 'Number of journals.'),
        ('-c', '--citations-per-article', int, 'Citations per article.'),
        ('-a', '--articles-per-issue', int, 'Articles per issue.'),
        ('-i', '--issues-per-year', int, 'Issues per year.'),
        ('-x', '--h-index', int,
         'H index value. Size of the citation window in years.'),
        ('-b', '--bias', float,
         'Bias towards highly-cited articles. Should be a positive '
         'floating point number. 1.0 is a standard rich-get-richer bias; '
         'lower is less biased towards rich, higher is more biased.'),
        ('-d', '--decay', float,
         'Probability that an article will drop out of the field of '
         'citable articles next year. Should be a value between 0.0 '
         'and 1.0.'),
    )
    for short_flag, long_flag, value_type, help_text in option_table:
        cli.add_argument(short_flag, long_flag, type=value_type,
                         help=help_text)

    # Values used for any option not given on the command line.
    fallback_values = {
        'num_journals': 500,
        'citations_per_article': 10,
        'articles_per_issue': 10,
        'issues_per_year': 5,
        'h_index': 5,
        'bias': 1,
        'decay': 0.75,
    }
    cli.set_defaults(**fallback_values)

    options = cli.parse_args()
    simulate(30, **vars(options))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment