Created
November 24, 2022 11:17
-
-
Save skojaku/8494552b3012d047f6555b5f322e3eaf to your computer and use it in GitHub Desktop.
Long-term citation model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# @Author: Sadamori Kojaku | |
# @Date: 2022-11-22 21:18:08 | |
# @Last Modified by: Sadamori Kojaku | |
# @Last Modified time: 2022-11-24 06:17:51 | |
from functools import partial | |
import numpy as np | |
import pandas as pd | |
from scipy import optimize, sparse, stats | |
from tqdm import tqdm | |
class LongTermCitationModel: | |
"""Python implementation of the long-term citation model. | |
Usage: | |
>>A = sparse.csr_matrix(citation) # citation network, where A[i,j]=1 indicates a citation from paper i to paper j. | |
>>model = LongTermCitationModel(min_ct=10) # create the model | |
>>model.fit(A, t_pubs) # fitting. t_pubs is a numpy array of publication years | |
>>print(model.mu, model.sigma) # Print the estimated parameters | |
>>model.predict(A, t_pubs, 2010, 2020, m=30) # predict citation count from 2010 to 2020 | |
Note: | |
While the original implementation is based on a grid search, | |
I used a gradient descent algorithm to fit this model. | |
Having cross-checked the log-likelihood function, the gradient descent | |
algorithm yields a better fit. | |
References: | |
1. Wang, Dashun, Chaoming Song, and Albert-László Barabási. “Quantifying Long-Term Scientific Impact.” Science 342, no. 6154 (October 4, 2013): 127–32. https://doi.org/10.1126/science.1237825. | |
2. 2015 - Ke - Qualifying Exam Report, Part 1: On the universal rescaled citation dynamics of individual papers | |
""" | |
def __init__(self, min_ct=10, min_sigma=1e-2): | |
"""LongTermCitation Model | |
:param min_ct: minimum number of citations needed to make reliable inference, defaults to 10 | |
:type min_ct: int, optional | |
:param min_sigma: lower bound of the concentration parameter to prevent divergence, defaults to 1e-2 | |
:type min_sigma: float, optional | |
""" | |
self.ltcm = LongTermCitationModelForPaper(min_ct=min_ct, min_sigma=min_sigma) | |
self.mu = None | |
self.sigma = None | |
self.eta = None | |
self.q = None | |
def fit(self, net, t_pub, **params): | |
"""_summary_ | |
:param net: Citation network. net[i,j] = 1 indicates a citation from i to j. | |
:type net: sparse.csr_matrix | |
:param t_pub: t_pub[i] indicates the publication year of paper i | |
:type t_pub: numpy.narray | |
:return: self | |
:rtype: self | |
""" | |
n_nodes = net.shape[0] | |
src, trg, _ = sparse.find(net) | |
dt = t_pub[src] - t_pub[trg] | |
dt = np.round(dt).astype(int) | |
s = dt >= 0 | |
src, trg, dt = src[s], trg[s], dt[s] | |
# Citation event time matrix in csr_matrix format | |
# This is to exploit the csr_matrx structure to speed up the | |
# access to the precomputed paper age sequence. Sepcifically, | |
# - ET.data[ET.indptr[i]:ET.indptr[i+1]] will give the sequence of paper ages when cited. | |
# - ET.indices[ET.indptr[i]:ET.indptr[i+1]] will give the sequence of papers that cite i. | |
ET = sparse.csr_matrix((dt, (trg, src)), shape=net.shape) | |
# | |
# Main | |
# | |
mu, sigma, eta, q = ( | |
np.zeros(n_nodes), | |
np.zeros(n_nodes), | |
np.zeros(n_nodes), | |
np.zeros(n_nodes), | |
) | |
for i in tqdm(range(n_nodes)): | |
dt = ET.data[ET.indptr[i] : ET.indptr[i + 1]] | |
mu[i], sigma[i], eta[i], q[i] = self.ltcm.fit(dt, t_pub=0) | |
self.mu = mu | |
self.sigma = sigma | |
self.eta = eta | |
self.q = q | |
return self | |
def predict(self, net, t_pub, t_pred_start, t_pred_end, m=30, **params): | |
"""Predict citations by the long-term citation model | |
:param net: network | |
:type net: scipy.sparse | |
:param t_pub: Publication year | |
:type t_pub: numpy.ndarray | |
:param t_pred_start: Starting time of the simulation | |
:type t_pred_start: float | |
:param t_pred_end: Ending time of the simulation | |
:type t_pred_end: float | |
:param mu: mu parameter for the log-normal distribution | |
:type mu: numpy.ndarray | |
:param sigma: concentration parameter for the log-normal distribution | |
:type sigma: numpy.ndarray | |
:param eta: Fitness | |
:type eta: numpy.ndarray | |
:param m: Initial attractiveness, defaults to 30 | |
:type m: int, optional | |
:return: Predicted number of citations and timestamps | |
:rtype: pred_ct: (num_nodes, num_time) sparse.csr_matrix, timestamp: (num_time), numpy.ndarray | |
""" | |
ct = np.array(net.sum(axis=0)).reshape(-1) | |
n_nodes = net.shape[0] | |
cited, t_cited = [], [] | |
for i in range(n_nodes): | |
if pd.isna(self.mu[i]): | |
continue | |
dt_pred = self.ltcm.predict( | |
t_pub=t_pub[i], | |
t_pred_start=t_pred_start, | |
t_pred_end=t_pred_end, | |
ct=ct[i], | |
eta=self.eta[i], | |
mu=self.mu[i], | |
sigma=self.sigma[i], | |
m=m, | |
) | |
if len(dt_pred) == 0: | |
continue | |
dt_pred = np.round(dt_pred).astype(int) | |
t_cited.append(dt_pred) | |
cited.append(np.ones_like(dt_pred) * i) | |
n_t = int(t_pred_end - t_pred_start) + 1 | |
timestamp = np.arange(int(t_pred_start), int(t_pred_end) + 1) | |
if len(cited) == 0: | |
pred_ct = sparse.csr_matrix( | |
([0], (0, 0)), | |
shape=(n_nodes, n_t), | |
) | |
return pred_ct, timestamp | |
cited = np.concatenate(cited) | |
t_cited = np.concatenate(t_cited) | |
# ut_cited, tids = np.unique(t_cited, return_inverse=True) | |
tids = (t_cited - t_pred_start).astype(int) | |
# n_t = len(ut_cited) | |
pred_ct = sparse.csr_matrix( | |
(np.ones_like(tids), (cited, tids)), | |
shape=(n_nodes, n_t), | |
) | |
return pred_ct, timestamp | |
class LongTermCitationModelForPaper: | |
"""Python implementation of the long-term citation model. | |
References: | |
- https://www.science.org/doi/full/10.1126/science.1237825 | |
- 2015 - Ke - Qualifying Exam Report, Part 1: On the universal rescaled citation dynamics of individual papers | |
""" | |
def __init__(self, min_ct=10, min_sigma=1e-2): | |
self.min_ct = min_ct | |
self.min_sigma = min_sigma | |
def fit(self, ts, t_pub): | |
"""Fit the long-term citation model | |
:param ts: Time of citation events | |
:type ts: numpy.ndarray | |
:param t_pub: Publication year | |
:type t_pub: float | |
:return: mu, sigma, eta, q | |
:rtype: _type_ | |
""" | |
dt = ts - t_pub | |
dt = dt[dt >= 0] | |
N = len(dt) # total number of citations | |
if N < self.min_ct: | |
return ( | |
np.nan, | |
np.nan, | |
np.nan, | |
np.nan, | |
) # special case: not enough citations to fit | |
tis = np.sort(dt) | |
tis = np.maximum(1e-5, tis) # avoid T=0 in logarithm | |
m = 30 # initial number of citations (can set to 1?) | |
# | |
# Optimize | |
# | |
params = {"tis": tis, "tis": tis, "m": m, "N": N} | |
obj = partial(self._calc_negative_loglikelihood, **params) | |
x0 = np.array([1, -1]) # initial | |
res = optimize.minimize( | |
obj, | |
x0, | |
method="nelder-mead", | |
options={"xatol": 1e-8, "disp": False}, | |
) | |
# | |
# Retrieve result | |
# | |
mu_opt, log_sig_opt = res.x | |
q = -self._calc_negative_loglikelihood(res.x, **params) | |
sig_opt = ( | |
np.exp(log_sig_opt) + self.min_sigma | |
) # transform sigma back to the normal scale | |
eta = self._calc_opt_eta(mu_opt, sig_opt, tis, N, m) # eq. (40) | |
return mu_opt, sig_opt, eta, q | |
def predict(self, t_pub, t_pred_start, t_pred_end, ct, eta, mu, sigma, m=30): | |
"""Predict citations | |
:param t_pub: Publication year | |
:type t_pub: float | |
:param t_pred_start: Starting time of the simulation | |
:type t_pred_start: float | |
:param t_pred_end: Ending time of the simulation | |
:type t_pred_end: float | |
:param ct: number of citations at the starting time of the simulation | |
:type ct: int | |
:param mu: mu parameter for the log-normal distribution | |
:type mu: float | |
:param sigma: concentration parameter for the log-normal distribution | |
:type sigma: float | |
:param eta: Fitness | |
:type eta: float | |
:param m: Initial attractiveness, defaults to 30 | |
:type m: int, optional | |
:return: _description_ | |
:rtype: _type_ | |
:param t_pub: Publication year | |
:type t_pub: numpy.ndarray | |
:param t_pred_start: Starting time of the simulation | |
:type t_pred_start: float | |
:return: Predicted number of citations and timestamps | |
:rtype: pred_ct: (num_nodes, num_time) sparse.csr_matrix, timestamp: (num_time), numpy.ndarray | |
""" | |
dts_pred = self.simulate_poisson_process_LTCM( | |
eta=eta, | |
log_mu=mu, | |
sigma=sigma, | |
m=m, | |
t_end=t_pred_end - t_pub, | |
t_start=t_pred_start - t_pub, | |
ct=ct, | |
) | |
return dts_pred + t_pub | |
def _calc_opt_eta(self, mu, sig, tis, N, m): | |
""" "Calculate the optimal Lagurange param. lambda using eq. 40. | |
:param mu: mu | |
:type mu: flaot | |
:param sig: sigma | |
:type sig: float | |
:param tis: Rescaled dates when the case got cited | |
:type tis: pd.DateTime | |
:param N: total number of citations | |
:type N: int | |
:param m: initial number of citations | |
:type m: int | |
:return: optimal param value | |
:rtype: float | |
""" | |
Phi = stats.norm.cdf # equation (21) in [1] | |
Log = lambda x: np.log( | |
np.maximum(1e-9, x) | |
) # regularized version of log, to avoid log(0) | |
lam_lkl = 1 / ( | |
(1 + m / N) - sum(Phi((Log(tis) - mu) / sig)) / N | |
) # Assume that T = \infty | |
return lam_lkl | |
def _calc_negative_loglikelihood(self, x, tis, N, m): | |
""" "implement equation (38) in [1] | |
:param x: (mu, log(sig)). Notice that log is applied to sig, which is to ensure sig to be positive | |
:type x: np.ndarray | |
:param tis: Rescaled dates when the case got cited | |
:type tis: pd.DateTime | |
:param N: total number of citations | |
:type N: int | |
:param m: initial number of citations | |
:type m: int | |
:return: negative log likelihood | |
:rtype: float | |
""" | |
mu, sig = x[0], np.exp(x[1]) | |
sig += self.min_sigma | |
# Pre-define functions | |
Phi = stats.norm.cdf # equation (21) in [1] | |
P = lambda t, mu, sig: stats.norm.pdf(np.log(t), mu, sig) | |
Log = lambda x: np.log( | |
np.maximum(1e-9, x) | |
) # regularized version of log, to avoid log(0) | |
lam_lkl = self._calc_opt_eta(mu, sig, tis, N, m) # eq. (40) | |
lkl1 = Log(lam_lkl) # term 1 in eq (38) in [1] | |
lkl2 = ( | |
np.sum(Log(P(tis, mu, sig)) + lam_lkl * Phi((Log(tis) - mu) / sig)) / N | |
) # term 2 | |
lkl3 = lam_lkl * (1 + m / N) # term 3. Assume that T = \infty | |
lkl = lkl1 + lkl2 - lkl3 | |
return -lkl | |
def simulate_poisson_process_LTCM( | |
self, eta, log_mu, sigma, m, t_end, t_start=0, ct=0 | |
): | |
"""Simulate the long-term citation model. | |
We use a rejection sampling called the thinning | |
method to simulate the inhomogeneous Poisson process: | |
https://www.math.fsu.edu/~ychen/research/Thinning%20algorithm.pdf | |
""" | |
# ct = 0 | |
sm = t_start | |
epsilon = 1e-12 | |
S = lambda t: np.exp( | |
-((np.log(t + epsilon) - log_mu) ** 2) / (2 * sigma**2) | |
) / ((t + epsilon) * np.sqrt(2 * sigma * np.pi)) | |
ts_list = [] | |
while sm < t_end: | |
Smax = 1 / ((sm + epsilon) * np.sqrt(2 * sigma * np.pi)) | |
lam_max = eta * (ct + m) * Smax | |
u = np.random.rand() | |
w = -np.log(u) / lam_max | |
sm += w | |
if sm > t_end: | |
break | |
u2 = np.random.rand() | |
if u2 < (eta * (ct + m) * S(sm) / lam_max): | |
ts_list.append(sm) | |
ct += 1 | |
return ts_list |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment