skojaku/long_term_citation_model.py

## long_term_citation_model.py
# -*- coding: utf-8 -*-
# @Author: Sadamori Kojaku
# @Date:   2022-11-22 21:18:08
# @Last Modified by:   Sadamori Kojaku
# @Last Modified time: 2022-11-24 06:17:51
from functools import partial
import numpy as np
import pandas as pd
from scipy import optimize, sparse, stats
from tqdm import tqdm


class LongTermCitationModel:
    """Python implementation of the long-term citation model.

    Usage:
    >>A = sparse.csr_matrix(citation) # citation network, where A[i,j]=1 indicates a citation from paper i to paper j.
    >>model = LongTermCitationModel(min_ct=10) # create the model
    >>model.fit(A, t_pubs) # fitting. t_pubs is a numpy array of publication years
    >>print(model.mu, model.sigma) # Print the estimated parameters
    >>model.predict(A, t_pubs, 2010, 2020, m=30) # predict citation count from 2010 to 2020

    Note:
    While the original implementation is based on a grid search,
    I used a gradient descent algorithm to fit this model.
    Having cross-checked the log-likelihood function, the gradient descent
    algorithm yields a better fit.

    References:
    1. Wang, Dashun, Chaoming Song, and Albert-László Barabási. “Quantifying Long-Term Scientific Impact.” Science 342, no. 6154 (October 4, 2013): 127–32. https://doi.org/10.1126/science.1237825.
    2. 2015 - Ke - Qualifying Exam Report, Part 1: On the universal rescaled citation dynamics of individual papers
    """

    def __init__(self, min_ct=10, min_sigma=1e-2):
        """LongTermCitation Model

        :param min_ct: minimum number of citations needed to make reliable inference, defaults to 10
        :type min_ct: int, optional
        :param min_sigma: lower bound of the concentration parameter to prevent divergence, defaults to 1e-2
        :type min_sigma: float, optional
        """
        self.ltcm = LongTermCitationModelForPaper(min_ct=min_ct, min_sigma=min_sigma)
        self.mu = None
        self.sigma = None
        self.eta = None
        self.q = None

    def fit(self, net, t_pub, **params):
        """_summary_

        :param net: Citation network. net[i,j] = 1 indicates a citation from i to j.
        :type net: sparse.csr_matrix
        :param t_pub: t_pub[i] indicates the publication year of paper i
        :type t_pub: numpy.narray
        :return: self
        :rtype: self
        """
        n_nodes = net.shape[0]

        src, trg, _ = sparse.find(net)
        dt = t_pub[src] - t_pub[trg]
        dt = np.round(dt).astype(int)
        s = dt >= 0
        src, trg, dt = src[s], trg[s], dt[s]

        # Citation event time matrix in csr_matrix format
        # This is to exploit the csr_matrx structure to speed up the
        # access to the precomputed paper age sequence. Sepcifically,
        # - ET.data[ET.indptr[i]:ET.indptr[i+1]] will give the sequence of paper ages when cited.
        # - ET.indices[ET.indptr[i]:ET.indptr[i+1]] will give the sequence of papers that cite i.
        ET = sparse.csr_matrix((dt, (trg, src)), shape=net.shape)

        #
        # Main
        #
        mu, sigma, eta, q = (
            np.zeros(n_nodes),
            np.zeros(n_nodes),
            np.zeros(n_nodes),
            np.zeros(n_nodes),
        )
        for i in tqdm(range(n_nodes)):
            dt = ET.data[ET.indptr[i] : ET.indptr[i + 1]]
            mu[i], sigma[i], eta[i], q[i] = self.ltcm.fit(dt, t_pub=0)

        self.mu = mu
        self.sigma = sigma
        self.eta = eta
        self.q = q
        return self

    def predict(self, net, t_pub, t_pred_start, t_pred_end, m=30, **params):
        """Predict citations by the long-term citation model

        :param net: network
        :type net: scipy.sparse
        :param t_pub: Publication year
        :type t_pub: numpy.ndarray
        :param t_pred_start: Starting time of the simulation
        :type t_pred_start: float
        :param t_pred_end: Ending time of the simulation
        :type t_pred_end: float
        :param mu: mu parameter for the log-normal distribution
        :type mu: numpy.ndarray
        :param sigma: concentration parameter for the log-normal distribution
        :type sigma: numpy.ndarray
        :param eta: Fitness
        :type eta: numpy.ndarray
        :param m: Initial attractiveness, defaults to 30
        :type m: int, optional
        :return: Predicted number of citations and timestamps
        :rtype: pred_ct: (num_nodes, num_time) sparse.csr_matrix, timestamp: (num_time), numpy.ndarray
        """
        ct = np.array(net.sum(axis=0)).reshape(-1)

        n_nodes = net.shape[0]
        cited, t_cited = [], []
        for i in range(n_nodes):
            if pd.isna(self.mu[i]):
                continue

            dt_pred = self.ltcm.predict(
                t_pub=t_pub[i],
                t_pred_start=t_pred_start,
                t_pred_end=t_pred_end,
                ct=ct[i],
                eta=self.eta[i],
                mu=self.mu[i],
                sigma=self.sigma[i],
                m=m,
            )
            if len(dt_pred) == 0:
                continue
            dt_pred = np.round(dt_pred).astype(int)
            t_cited.append(dt_pred)
            cited.append(np.ones_like(dt_pred) * i)

        n_t = int(t_pred_end - t_pred_start) + 1
        timestamp = np.arange(int(t_pred_start), int(t_pred_end) + 1)
        if len(cited) == 0:

            pred_ct = sparse.csr_matrix(
                ([0], (0, 0)),
                shape=(n_nodes, n_t),
            )
            return pred_ct, timestamp
        cited = np.concatenate(cited)
        t_cited = np.concatenate(t_cited)

        # ut_cited, tids = np.unique(t_cited, return_inverse=True)
        tids = (t_cited - t_pred_start).astype(int)
        # n_t = len(ut_cited)
        pred_ct = sparse.csr_matrix(
            (np.ones_like(tids), (cited, tids)),
            shape=(n_nodes, n_t),
        )
        return pred_ct, timestamp


class LongTermCitationModelForPaper:
    """Python implementation of the long-term citation model.

    References:
    - https://www.science.org/doi/full/10.1126/science.1237825
    - 2015 - Ke - Qualifying Exam Report, Part 1: On the universal rescaled citation dynamics of individual papers
    """

    def __init__(self, min_ct=10, min_sigma=1e-2):
        self.min_ct = min_ct
        self.min_sigma = min_sigma

    def fit(self, ts, t_pub):
        """Fit the long-term citation model

        :param ts: Time of citation events
        :type ts: numpy.ndarray
        :param t_pub: Publication year
        :type t_pub: float
        :return: mu, sigma, eta, q
        :rtype: _type_
        """

        dt = ts - t_pub
        dt = dt[dt >= 0]
        N = len(dt)  # total number of citations
        if N < self.min_ct:
            return (
                np.nan,
                np.nan,
                np.nan,
                np.nan,
            )  # special case: not enough citations to fit

        tis = np.sort(dt)
        tis = np.maximum(1e-5, tis)  # avoid T=0 in logarithm
        m = 30  # initial number of citations (can set to 1?)

        #
        # Optimize
        #
        params = {"tis": tis, "tis": tis, "m": m, "N": N}
        obj = partial(self._calc_negative_loglikelihood, **params)

        x0 = np.array([1, -1])  # initial
        res = optimize.minimize(
            obj,
            x0,
            method="nelder-mead",
            options={"xatol": 1e-8, "disp": False},
        )

        #
        # Retrieve result
        #
        mu_opt, log_sig_opt = res.x
        q = -self._calc_negative_loglikelihood(res.x, **params)
        sig_opt = (
            np.exp(log_sig_opt) + self.min_sigma
        )  # transform sigma back to the normal scale
        eta = self._calc_opt_eta(mu_opt, sig_opt, tis, N, m)  # eq. (40)
        return mu_opt, sig_opt, eta, q

    def predict(self, t_pub, t_pred_start, t_pred_end, ct, eta, mu, sigma, m=30):
        """Predict citations

        :param t_pub: Publication year
        :type t_pub: float
        :param t_pred_start: Starting time of the simulation
        :type t_pred_start: float
        :param t_pred_end: Ending time of the simulation
        :type t_pred_end: float
        :param ct: number of citations at the starting time of the simulation
        :type ct: int
        :param mu: mu parameter for the log-normal distribution
        :type mu: float
        :param sigma: concentration parameter for the log-normal distribution
        :type sigma: float
        :param eta: Fitness
        :type eta: float
        :param m: Initial attractiveness, defaults to 30
        :type m: int, optional
        :return: _description_
        :rtype: _type_

        :param t_pub: Publication year
        :type t_pub: numpy.ndarray
        :param t_pred_start: Starting time of the simulation
        :type t_pred_start: float
        :return: Predicted number of citations and timestamps
        :rtype: pred_ct: (num_nodes, num_time) sparse.csr_matrix, timestamp: (num_time), numpy.ndarray
        """
        dts_pred = self.simulate_poisson_process_LTCM(
            eta=eta,
            log_mu=mu,
            sigma=sigma,
            m=m,
            t_end=t_pred_end - t_pub,
            t_start=t_pred_start - t_pub,
            ct=ct,
        )
        return dts_pred + t_pub

    def _calc_opt_eta(self, mu, sig, tis, N, m):
        """ "Calculate the optimal Lagurange param. lambda using eq. 40.

        :param mu: mu
        :type mu: flaot
        :param sig: sigma
        :type sig: float
        :param tis: Rescaled dates when the case got cited
        :type tis: pd.DateTime
        :param N: total number of citations
        :type N: int
        :param m: initial number of citations
        :type m: int
        :return: optimal param value
        :rtype: float
        """
        Phi = stats.norm.cdf  # equation (21) in [1]
        Log = lambda x: np.log(
            np.maximum(1e-9, x)
        )  # regularized version of log, to avoid log(0)
        lam_lkl = 1 / (
            (1 + m / N) - sum(Phi((Log(tis) - mu) / sig)) / N
        )  # Assume that T = \infty
        return lam_lkl

    def _calc_negative_loglikelihood(self, x, tis, N, m):
        """ "implement equation (38) in [1]

        :param x: (mu, log(sig)). Notice that log is applied to sig, which is to ensure sig to be positive
        :type x: np.ndarray
        :param tis: Rescaled dates when the case got cited
        :type tis: pd.DateTime
        :param N: total number of citations
        :type N: int
        :param m: initial number of citations
        :type m: int
        :return: negative log likelihood
        :rtype: float
        """

        mu, sig = x[0], np.exp(x[1])
        sig += self.min_sigma

        # Pre-define functions
        Phi = stats.norm.cdf  # equation (21) in [1]
        P = lambda t, mu, sig: stats.norm.pdf(np.log(t), mu, sig)
        Log = lambda x: np.log(
            np.maximum(1e-9, x)
        )  # regularized version of log, to avoid log(0)

        lam_lkl = self._calc_opt_eta(mu, sig, tis, N, m)  # eq. (40)

        lkl1 = Log(lam_lkl)  # term 1 in eq (38) in [1]
        lkl2 = (
            np.sum(Log(P(tis, mu, sig)) + lam_lkl * Phi((Log(tis) - mu) / sig)) / N
        )  # term 2
        lkl3 = lam_lkl * (1 + m / N)  # term 3.  Assume that T = \infty
        lkl = lkl1 + lkl2 - lkl3
        return -lkl

    def simulate_poisson_process_LTCM(
        self, eta, log_mu, sigma, m, t_end, t_start=0, ct=0
    ):
        """Simulate the long-term citation model.

        We use a rejection sampling called the thinning
        method to simulate the inhomogeneous Poisson process:
        https://www.math.fsu.edu/~ychen/research/Thinning%20algorithm.pdf
        """
        # ct = 0
        sm = t_start
        epsilon = 1e-12
        S = lambda t: np.exp(
            -((np.log(t + epsilon) - log_mu) ** 2) / (2 * sigma**2)
        ) / ((t + epsilon) * np.sqrt(2 * sigma * np.pi))
        ts_list = []
        while sm < t_end:
            Smax = 1 / ((sm + epsilon) * np.sqrt(2 * sigma * np.pi))
            lam_max = eta * (ct + m) * Smax
            u = np.random.rand()
            w = -np.log(u) / lam_max
            sm += w

            if sm > t_end:
                break

            u2 = np.random.rand()
            if u2 < (eta * (ct + m) * S(sm) / lam_max):
                ts_list.append(sm)
                ct += 1
        return ts_list
	# -- coding: utf-8 --
	# @Author: Sadamori Kojaku
	# @Date: 2022-11-22 21:18:08
	# @Last Modified by: Sadamori Kojaku
	# @Last Modified time: 2022-11-24 06:17:51
	from functools import partial
	import numpy as np
	import pandas as pd
	from scipy import optimize, sparse, stats
	from tqdm import tqdm


	class LongTermCitationModel:
	"""Python implementation of the long-term citation model.

	Usage:
	>>A = sparse.csr_matrix(citation) # citation network, where A[i,j]=1 indicates a citation from paper i to paper j.
	>>model = LongTermCitationModel(min_ct=10) # create the model
	>>model.fit(A, t_pubs) # fitting. t_pubs is a numpy array of publication years
	>>print(model.mu, model.sigma) # Print the estimated parameters
	>>model.predict(A, t_pubs, 2010, 2020, m=30) # predict citation count from 2010 to 2020

	Note:
	While the original implementation is based on a grid search,
	I used a gradient descent algorithm to fit this model.
	Having cross-checked the log-likelihood function, the gradient descent
	algorithm yields a better fit.

	References:
	1. Wang, Dashun, Chaoming Song, and Albert-László Barabási. “Quantifying Long-Term Scientific Impact.” Science 342, no. 6154 (October 4, 2013): 127–32. https://doi.org/10.1126/science.1237825.
	2. 2015 - Ke - Qualifying Exam Report, Part 1: On the universal rescaled citation dynamics of individual papers
	"""

	def __init__(self, min_ct=10, min_sigma=1e-2):
	"""LongTermCitation Model

	:param min_ct: minimum number of citations needed to make reliable inference, defaults to 10
	:type min_ct: int, optional
	:param min_sigma: lower bound of the concentration parameter to prevent divergence, defaults to 1e-2
	:type min_sigma: float, optional
	"""
	self.ltcm = LongTermCitationModelForPaper(min_ct=min_ct, min_sigma=min_sigma)
	self.mu = None
	self.sigma = None
	self.eta = None
	self.q = None

	def fit(self, net, t_pub, **params):
	"""_summary_

	:param net: Citation network. net[i,j] = 1 indicates a citation from i to j.
	:type net: sparse.csr_matrix
	:param t_pub: t_pub[i] indicates the publication year of paper i
	:type t_pub: numpy.narray
	:return: self
	:rtype: self
	"""
	n_nodes = net.shape[0]

	src, trg, _ = sparse.find(net)
	dt = t_pub[src] - t_pub[trg]
	dt = np.round(dt).astype(int)
	s = dt >= 0
	src, trg, dt = src[s], trg[s], dt[s]

	# Citation event time matrix in csr_matrix format
	# This is to exploit the csr_matrx structure to speed up the
	# access to the precomputed paper age sequence. Sepcifically,
	# - ET.data[ET.indptr[i]:ET.indptr[i+1]] will give the sequence of paper ages when cited.
	# - ET.indices[ET.indptr[i]:ET.indptr[i+1]] will give the sequence of papers that cite i.
	ET = sparse.csr_matrix((dt, (trg, src)), shape=net.shape)

	#
	# Main
	#
	mu, sigma, eta, q = (
	np.zeros(n_nodes),
	np.zeros(n_nodes),
	np.zeros(n_nodes),
	np.zeros(n_nodes),
	)
	for i in tqdm(range(n_nodes)):
	dt = ET.data[ET.indptr[i] : ET.indptr[i + 1]]
	mu[i], sigma[i], eta[i], q[i] = self.ltcm.fit(dt, t_pub=0)

	self.mu = mu
	self.sigma = sigma
	self.eta = eta
	self.q = q
	return self

	def predict(self, net, t_pub, t_pred_start, t_pred_end, m=30, **params):
	"""Predict citations by the long-term citation model

	:param net: network
	:type net: scipy.sparse
	:param t_pub: Publication year
	:type t_pub: numpy.ndarray
	:param t_pred_start: Starting time of the simulation
	:type t_pred_start: float
	:param t_pred_end: Ending time of the simulation
	:type t_pred_end: float
	:param mu: mu parameter for the log-normal distribution
	:type mu: numpy.ndarray
	:param sigma: concentration parameter for the log-normal distribution
	:type sigma: numpy.ndarray
	:param eta: Fitness
	:type eta: numpy.ndarray
	:param m: Initial attractiveness, defaults to 30
	:type m: int, optional
	:return: Predicted number of citations and timestamps
	:rtype: pred_ct: (num_nodes, num_time) sparse.csr_matrix, timestamp: (num_time), numpy.ndarray
	"""
	ct = np.array(net.sum(axis=0)).reshape(-1)

	n_nodes = net.shape[0]
	cited, t_cited = [], []
	for i in range(n_nodes):
	if pd.isna(self.mu[i]):
	continue

	dt_pred = self.ltcm.predict(
	t_pub=t_pub[i],
	t_pred_start=t_pred_start,
	t_pred_end=t_pred_end,
	ct=ct[i],
	eta=self.eta[i],
	mu=self.mu[i],
	sigma=self.sigma[i],
	m=m,
	)
	if len(dt_pred) == 0:
	continue
	dt_pred = np.round(dt_pred).astype(int)
	t_cited.append(dt_pred)
	cited.append(np.ones_like(dt_pred) * i)

	n_t = int(t_pred_end - t_pred_start) + 1
	timestamp = np.arange(int(t_pred_start), int(t_pred_end) + 1)
	if len(cited) == 0:

	pred_ct = sparse.csr_matrix(
	([0], (0, 0)),
	shape=(n_nodes, n_t),
	)
	return pred_ct, timestamp
	cited = np.concatenate(cited)
	t_cited = np.concatenate(t_cited)

	# ut_cited, tids = np.unique(t_cited, return_inverse=True)
	tids = (t_cited - t_pred_start).astype(int)
	# n_t = len(ut_cited)
	pred_ct = sparse.csr_matrix(
	(np.ones_like(tids), (cited, tids)),
	shape=(n_nodes, n_t),
	)
	return pred_ct, timestamp


	class LongTermCitationModelForPaper:
	"""Python implementation of the long-term citation model.

	References:
	- https://www.science.org/doi/full/10.1126/science.1237825
	- 2015 - Ke - Qualifying Exam Report, Part 1: On the universal rescaled citation dynamics of individual papers
	"""

	def __init__(self, min_ct=10, min_sigma=1e-2):
	self.min_ct = min_ct
	self.min_sigma = min_sigma

	def fit(self, ts, t_pub):
	"""Fit the long-term citation model

	:param ts: Time of citation events
	:type ts: numpy.ndarray
	:param t_pub: Publication year
	:type t_pub: float
	:return: mu, sigma, eta, q
	:rtype: _type_
	"""

	dt = ts - t_pub
	dt = dt[dt >= 0]
	N = len(dt) # total number of citations
	if N < self.min_ct:
	return (
	np.nan,
	np.nan,
	np.nan,
	np.nan,
	) # special case: not enough citations to fit

	tis = np.sort(dt)
	tis = np.maximum(1e-5, tis) # avoid T=0 in logarithm
	m = 30 # initial number of citations (can set to 1?)

	#
	# Optimize
	#
	params = {"tis": tis, "tis": tis, "m": m, "N": N}
	obj = partial(self._calc_negative_loglikelihood, **params)

	x0 = np.array([1, -1]) # initial
	res = optimize.minimize(
	obj,
	x0,
	method="nelder-mead",
	options={"xatol": 1e-8, "disp": False},
	)

	#
	# Retrieve result
	#
	mu_opt, log_sig_opt = res.x
	q = -self._calc_negative_loglikelihood(res.x, **params)
	sig_opt = (
	np.exp(log_sig_opt) + self.min_sigma
	) # transform sigma back to the normal scale
	eta = self._calc_opt_eta(mu_opt, sig_opt, tis, N, m) # eq. (40)
	return mu_opt, sig_opt, eta, q

	def predict(self, t_pub, t_pred_start, t_pred_end, ct, eta, mu, sigma, m=30):
	"""Predict citations

	:param t_pub: Publication year
	:type t_pub: float
	:param t_pred_start: Starting time of the simulation
	:type t_pred_start: float
	:param t_pred_end: Ending time of the simulation
	:type t_pred_end: float
	:param ct: number of citations at the starting time of the simulation
	:type ct: int
	:param mu: mu parameter for the log-normal distribution
	:type mu: float
	:param sigma: concentration parameter for the log-normal distribution
	:type sigma: float
	:param eta: Fitness
	:type eta: float
	:param m: Initial attractiveness, defaults to 30
	:type m: int, optional
	:return: _description_
	:rtype: _type_

	:param t_pub: Publication year
	:type t_pub: numpy.ndarray
	:param t_pred_start: Starting time of the simulation
	:type t_pred_start: float
	:return: Predicted number of citations and timestamps
	:rtype: pred_ct: (num_nodes, num_time) sparse.csr_matrix, timestamp: (num_time), numpy.ndarray
	"""
	dts_pred = self.simulate_poisson_process_LTCM(
	eta=eta,
	log_mu=mu,
	sigma=sigma,
	m=m,
	t_end=t_pred_end - t_pub,
	t_start=t_pred_start - t_pub,
	ct=ct,
	)
	return dts_pred + t_pub

	def _calc_opt_eta(self, mu, sig, tis, N, m):
	""" "Calculate the optimal Lagurange param. lambda using eq. 40.

	:param mu: mu
	:type mu: flaot
	:param sig: sigma
	:type sig: float
	:param tis: Rescaled dates when the case got cited
	:type tis: pd.DateTime
	:param N: total number of citations
	:type N: int
	:param m: initial number of citations
	:type m: int
	:return: optimal param value
	:rtype: float
	"""
	Phi = stats.norm.cdf # equation (21) in [1]
	Log = lambda x: np.log(
	np.maximum(1e-9, x)
	) # regularized version of log, to avoid log(0)
	lam_lkl = 1 / (
	(1 + m / N) - sum(Phi((Log(tis) - mu) / sig)) / N
	) # Assume that T = \infty
	return lam_lkl

	def _calc_negative_loglikelihood(self, x, tis, N, m):
	""" "implement equation (38) in [1]

	:param x: (mu, log(sig)). Notice that log is applied to sig, which is to ensure sig to be positive
	:type x: np.ndarray
	:param tis: Rescaled dates when the case got cited
	:type tis: pd.DateTime
	:param N: total number of citations
	:type N: int
	:param m: initial number of citations
	:type m: int
	:return: negative log likelihood
	:rtype: float
	"""

	mu, sig = x[0], np.exp(x[1])
	sig += self.min_sigma

	# Pre-define functions
	Phi = stats.norm.cdf # equation (21) in [1]
	P = lambda t, mu, sig: stats.norm.pdf(np.log(t), mu, sig)
	Log = lambda x: np.log(
	np.maximum(1e-9, x)
	) # regularized version of log, to avoid log(0)

	lam_lkl = self._calc_opt_eta(mu, sig, tis, N, m) # eq. (40)

	lkl1 = Log(lam_lkl) # term 1 in eq (38) in [1]
	lkl2 = (
	np.sum(Log(P(tis, mu, sig)) + lam_lkl * Phi((Log(tis) - mu) / sig)) / N
	) # term 2
	lkl3 = lam_lkl * (1 + m / N) # term 3. Assume that T = \infty
	lkl = lkl1 + lkl2 - lkl3
	return -lkl

	def simulate_poisson_process_LTCM(
	self, eta, log_mu, sigma, m, t_end, t_start=0, ct=0
	):
	"""Simulate the long-term citation model.

	We use a rejection sampling called the thinning
	method to simulate the inhomogeneous Poisson process:
	https://www.math.fsu.edu/~ychen/research/Thinning%20algorithm.pdf
	"""
	# ct = 0
	sm = t_start
	epsilon = 1e-12
	S = lambda t: np.exp(
	-((np.log(t + epsilon) - log_mu) ** 2) / (2 * sigma**2)
	) / ((t + epsilon) * np.sqrt(2 * sigma * np.pi))
	ts_list = []
	while sm < t_end:
	Smax = 1 / ((sm + epsilon) * np.sqrt(2 * sigma * np.pi))
	lam_max = eta * (ct + m) * Smax
	u = np.random.rand()
	w = -np.log(u) / lam_max
	sm += w

	if sm > t_end:
	break

	u2 = np.random.rand()
	if u2 < (eta * (ct + m) * S(sm) / lam_max):
	ts_list.append(sm)
	ct += 1
	return ts_list