Long-term citation model
# -*- coding: utf-8 -*-
# @Author: Sadamori Kojaku
# @Date: 2022-11-22 21:18:08
# @Last Modified by: Sadamori Kojaku
# @Last Modified time: 2022-11-24 06:17:51
from functools import partial
import numpy as np
import pandas as pd
from scipy import optimize, sparse, stats
from tqdm import tqdm
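

# A brief sketch of the model implemented below (based on ref. [1] in the
# docstrings and on the code itself): the citation rate of paper i at age t is
#
#     lambda_i(t) = eta_i * (c_i(t) + m) * P(t; mu_i, sigma_i),
#
# where eta_i is the paper's fitness, c_i(t) is its citation count at age t,
# m is the initial attractiveness, and P is the log-normal aging function
#
#     P(t; mu, sigma) = exp(-(log(t) - mu)^2 / (2 sigma^2)) / (t * sigma * sqrt(2 * pi)).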
class LongTermCitationModel:
    """Python implementation of the long-term citation model.

    Usage:
        >>> A = sparse.csr_matrix(citation)  # citation network, where A[i,j]=1 indicates a citation from paper i to paper j
        >>> model = LongTermCitationModel(min_ct=10)  # create the model
        >>> model.fit(A, t_pubs)  # fit the model; t_pubs is a numpy array of publication years
        >>> print(model.mu, model.sigma)  # print the estimated parameters
        >>> model.predict(A, t_pubs, 2010, 2020, m=30)  # predict citation counts from 2010 to 2020

    Note:
        While the original implementation is based on a grid search,
        this implementation fits the model with a numerical optimizer
        (the Nelder-Mead simplex method). Having cross-checked the
        log-likelihood function, I find that the numerical optimizer
        yields a better fit.

    References:
        1. Wang, Dashun, Chaoming Song, and Albert-László Barabási. “Quantifying Long-Term Scientific Impact.” Science 342, no. 6154 (October 4, 2013): 127–32. https://doi.org/10.1126/science.1237825.
        2. 2015 - Ke - Qualifying Exam Report, Part 1: On the universal rescaled citation dynamics of individual papers.
    """

    def __init__(self, min_ct=10, min_sigma=1e-2):
        """LongTermCitationModel

        :param min_ct: minimum number of citations needed to make a reliable inference, defaults to 10
        :type min_ct: int, optional
        :param min_sigma: lower bound of the concentration parameter to prevent divergence, defaults to 1e-2
        :type min_sigma: float, optional
        """
        self.ltcm = LongTermCitationModelForPaper(min_ct=min_ct, min_sigma=min_sigma)
        self.mu = None
        self.sigma = None
        self.eta = None
        self.q = None
    def fit(self, net, t_pub, **params):
        """Fit the long-term citation model to a citation network.

        :param net: Citation network. net[i,j] = 1 indicates a citation from paper i to paper j.
        :type net: sparse.csr_matrix
        :param t_pub: t_pub[i] indicates the publication year of paper i
        :type t_pub: numpy.ndarray
        :return: self
        :rtype: self
        """
        n_nodes = net.shape[0]
        src, trg, _ = sparse.find(net)
        dt = t_pub[src] - t_pub[trg]
        dt = np.round(dt).astype(int)
        s = dt >= 0
        src, trg, dt = src[s], trg[s], dt[s]

        # Citation event time matrix in csr_matrix format.
        # This exploits the csr_matrix structure to speed up access to the
        # precomputed paper age sequences. Specifically,
        # - ET.data[ET.indptr[i]:ET.indptr[i+1]] gives the sequence of paper ages at which paper i was cited.
        # - ET.indices[ET.indptr[i]:ET.indptr[i+1]] gives the papers that cite paper i.
        ET = sparse.csr_matrix((dt, (trg, src)), shape=net.shape)

        #
        # Main
        #
        mu, sigma, eta, q = (
            np.zeros(n_nodes),
            np.zeros(n_nodes),
            np.zeros(n_nodes),
            np.zeros(n_nodes),
        )
        for i in tqdm(range(n_nodes)):
            dt = ET.data[ET.indptr[i] : ET.indptr[i + 1]]
            mu[i], sigma[i], eta[i], q[i] = self.ltcm.fit(dt, t_pub=0)
        self.mu = mu
        self.sigma = sigma
        self.eta = eta
        self.q = q
        return self
    def predict(self, net, t_pub, t_pred_start, t_pred_end, m=30, **params):
        """Predict citations with the long-term citation model.

        The fitted per-paper parameters (self.mu, self.sigma, self.eta) are
        used for the prediction; papers without a reliable fit (NaN
        parameters) are skipped.

        :param net: citation network
        :type net: scipy.sparse matrix
        :param t_pub: publication years
        :type t_pub: numpy.ndarray
        :param t_pred_start: starting time of the simulation
        :type t_pred_start: float
        :param t_pred_end: ending time of the simulation
        :type t_pred_end: float
        :param m: initial attractiveness, defaults to 30
        :type m: int, optional
        :return: Predicted number of citations and timestamps
        :rtype: pred_ct: (num_nodes, num_time) sparse.csr_matrix, timestamp: (num_time,) numpy.ndarray
        """
        ct = np.array(net.sum(axis=0)).reshape(-1)  # citations received so far
        n_nodes = net.shape[0]
        cited, t_cited = [], []
        for i in range(n_nodes):
            if pd.isna(self.mu[i]):
                continue
            dt_pred = self.ltcm.predict(
                t_pub=t_pub[i],
                t_pred_start=t_pred_start,
                t_pred_end=t_pred_end,
                ct=ct[i],
                eta=self.eta[i],
                mu=self.mu[i],
                sigma=self.sigma[i],
                m=m,
            )
            if len(dt_pred) == 0:
                continue
            dt_pred = np.round(dt_pred).astype(int)
            t_cited.append(dt_pred)
            cited.append(np.ones_like(dt_pred) * i)

        n_t = int(t_pred_end - t_pred_start) + 1
        timestamp = np.arange(int(t_pred_start), int(t_pred_end) + 1)
        if len(cited) == 0:  # no citation events were generated
            pred_ct = sparse.csr_matrix((n_nodes, n_t))
            return pred_ct, timestamp
        cited = np.concatenate(cited)
        t_cited = np.concatenate(t_cited)
        tids = (t_cited - t_pred_start).astype(int)  # column index per citation event
        pred_ct = sparse.csr_matrix(
            (np.ones_like(tids), (cited, tids)),
            shape=(n_nodes, n_t),
        )
        return pred_ct, timestamp
class LongTermCitationModelForPaper:
    """Python implementation of the long-term citation model for a single paper.

    References:
        - https://www.science.org/doi/full/10.1126/science.1237825
        - 2015 - Ke - Qualifying Exam Report, Part 1: On the universal rescaled citation dynamics of individual papers
    """

    def __init__(self, min_ct=10, min_sigma=1e-2):
        self.min_ct = min_ct
        self.min_sigma = min_sigma
    def fit(self, ts, t_pub):
        """Fit the long-term citation model.

        :param ts: times of citation events
        :type ts: numpy.ndarray
        :param t_pub: publication year
        :type t_pub: float
        :return: mu, sigma, eta, q
        :rtype: tuple of floats
        """
        dt = ts - t_pub
        dt = dt[dt >= 0]
        N = len(dt)  # total number of citations
        if N < self.min_ct:
            # special case: not enough citations for a reliable fit
            return (np.nan, np.nan, np.nan, np.nan)
        tis = np.sort(dt)
        tis = np.maximum(1e-5, tis)  # avoid t=0 in the logarithm
        m = 30  # initial number of citations (can set to 1?)

        #
        # Optimize
        #
        params = {"tis": tis, "m": m, "N": N}
        obj = partial(self._calc_negative_loglikelihood, **params)
        x0 = np.array([1, -1])  # initial guess for (mu, log(sigma))
        res = optimize.minimize(
            obj,
            x0,
            method="nelder-mead",
            options={"xatol": 1e-8, "disp": False},
        )

        #
        # Retrieve result
        #
        mu_opt, log_sig_opt = res.x
        q = -self._calc_negative_loglikelihood(res.x, **params)
        sig_opt = (
            np.exp(log_sig_opt) + self.min_sigma
        )  # transform sigma back to the normal scale
        eta = self._calc_opt_eta(mu_opt, sig_opt, tis, N, m)  # eq. (40)
        return mu_opt, sig_opt, eta, q
    def predict(self, t_pub, t_pred_start, t_pred_end, ct, eta, mu, sigma, m=30):
        """Predict citations for a single paper.

        :param t_pub: publication year
        :type t_pub: float
        :param t_pred_start: starting time of the simulation
        :type t_pred_start: float
        :param t_pred_end: ending time of the simulation
        :type t_pred_end: float
        :param ct: number of citations at the starting time of the simulation
        :type ct: int
        :param eta: fitness
        :type eta: float
        :param mu: mu parameter of the log-normal distribution
        :type mu: float
        :param sigma: concentration parameter of the log-normal distribution
        :type sigma: float
        :param m: initial attractiveness, defaults to 30
        :type m: int, optional
        :return: predicted citation times (absolute years)
        :rtype: numpy.ndarray
        """
        dts_pred = self.simulate_poisson_process_LTCM(
            eta=eta,
            mu=mu,
            sigma=sigma,
            m=m,
            t_end=t_pred_end - t_pub,
            t_start=t_pred_start - t_pub,
            ct=ct,
        )
        return dts_pred + t_pub
    def _calc_opt_eta(self, mu, sig, tis, N, m):
        """Calculate the optimal Lagrange parameter lambda using eq. (40) in [1].

        :param mu: mu
        :type mu: float
        :param sig: sigma
        :type sig: float
        :param tis: paper ages at which the paper was cited
        :type tis: numpy.ndarray
        :param N: total number of citations
        :type N: int
        :param m: initial number of citations
        :type m: int
        :return: optimal parameter value
        :rtype: float
        """
        Phi = stats.norm.cdf  # equation (21) in [1]
        Log = lambda x: np.log(
            np.maximum(1e-9, x)
        )  # regularized version of log, to avoid log(0)
        lam_lkl = 1 / (
            (1 + m / N) - sum(Phi((Log(tis) - mu) / sig)) / N
        )  # assumes that T = infinity
        return lam_lkl
    def _calc_negative_loglikelihood(self, x, tis, N, m):
        """Implement equation (38) in [1].

        :param x: (mu, log(sigma)). The log is applied to sigma to ensure that sigma stays positive.
        :type x: numpy.ndarray
        :param tis: paper ages at which the paper was cited
        :type tis: numpy.ndarray
        :param N: total number of citations
        :type N: int
        :param m: initial number of citations
        :type m: int
        :return: negative log-likelihood
        :rtype: float
        """
        mu, sig = x[0], np.exp(x[1])
        sig += self.min_sigma

        # Pre-define functions
        Phi = stats.norm.cdf  # equation (21) in [1]
        P = lambda t, mu, sig: stats.norm.pdf(np.log(t), mu, sig)
        Log = lambda x: np.log(
            np.maximum(1e-9, x)
        )  # regularized version of log, to avoid log(0)

        lam_lkl = self._calc_opt_eta(mu, sig, tis, N, m)  # eq. (40)
        lkl1 = Log(lam_lkl)  # term 1 in eq. (38) in [1]
        lkl2 = (
            np.sum(Log(P(tis, mu, sig)) + lam_lkl * Phi((Log(tis) - mu) / sig)) / N
        )  # term 2
        lkl3 = lam_lkl * (1 + m / N)  # term 3; assumes that T = infinity
        lkl = lkl1 + lkl2 - lkl3
        return -lkl
    def simulate_poisson_process_LTCM(
        self, eta, mu, sigma, m, t_end, t_start=0, ct=0
    ):
        """Simulate the long-term citation model.

        We use a rejection sampling technique called the thinning
        method to simulate the inhomogeneous Poisson process:
        https://www.math.fsu.edu/~ychen/research/Thinning%20algorithm.pdf
        """
        sm = t_start
        epsilon = 1e-12
        # S(t): log-normal aging function (log-normal pdf of the citation age)
        S = lambda t: np.exp(
            -((np.log(t + epsilon) - mu) ** 2) / (2 * sigma**2)
        ) / ((t + epsilon) * sigma * np.sqrt(2 * np.pi))
        ts_list = []
        while sm < t_end:
            # Upper bound of S(t) for t >= sm, since the exponential factor is <= 1
            Smax = 1 / ((sm + epsilon) * sigma * np.sqrt(2 * np.pi))
            lam_max = eta * (ct + m) * Smax
            u = np.random.rand()
            w = -np.log(u) / lam_max  # waiting time of the dominating process
            sm += w
            if sm > t_end:
                break
            u2 = np.random.rand()
            if u2 < (eta * (ct + m) * S(sm) / lam_max):
                ts_list.append(sm)
                ct += 1
        return np.array(ts_list)  # array, so callers can shift by t_pub
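

# A minimal usage sketch on synthetic data (illustrative only; the network and
# publication years below are randomly generated, not real citation data):
if __name__ == "__main__":
    np.random.seed(42)
    n = 200
    t_pubs = np.random.randint(1990, 2010, size=n)
    # Papers cite strictly older papers with a small probability
    rows, cols = np.where(
        (t_pubs[:, None] > t_pubs[None, :]) & (np.random.rand(n, n) < 0.05)
    )
    A = sparse.csr_matrix((np.ones_like(rows), (rows, cols)), shape=(n, n))

    model = LongTermCitationModel(min_ct=10)
    model.fit(A, t_pubs)  # estimates mu, sigma, eta, q for each paper
    pred_ct, timestamp = model.predict(A, t_pubs, 2010, 2020, m=30)
    print(pred_ct.shape, timestamp)  # (200, 11) and the years 2010..2020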