Skip to content

Instantly share code, notes, and snippets.

@skojaku
Last active November 29, 2022 01:02
Show Gist options
  • Save skojaku/cee26755645b133a69d6630c79307cde to your computer and use it in GitHub Desktop.
Save skojaku/cee26755645b133a69d6630c79307cde to your computer and use it in GitHub Desktop.
Fast Python code for calculating the disruption index
import numpy as np
from scipy import sparse
from tqdm.auto import tqdm
def calc_disruption_index(net, batch_size=None):
    """Calculate the disruption index of every paper in a citation network.

    The disruption index of a focal paper is

        DI = (NF - NB) / (NR + NB + NF),

    where
    - NF: number of papers citing the focal paper BUT NOT citing any of the
      references of the focal paper,
    - NB: number of papers citing the focal paper AND citing at least one
      reference of the focal paper,
    - NR: number of papers not citing the focal paper but citing at least one
      reference of the focal paper.

    Parameters
    ----------
    net : scipy sparse matrix (or anything ``sparse.csr_matrix`` accepts)
        Square citation network; ``net[i, j] = 1`` if paper i cites paper j.
    batch_size : None or int, default None
        Maximum number of focal papers processed at once. A larger batch_size
        makes the computation faster at the expense of memory. If None, all
        papers are processed in a single pass.

    Returns
    -------
    numpy.ndarray of shape (n_nodes,)
        Disruption index of each paper.

    Raises
    ------
    ValueError
        If ``batch_size`` is given but is smaller than 1.

    References
    ----------
    - Funk, R. J. & Owen-Smith, J. A dynamic network measure of technological
      change. Manage. Sci. 63, 791–817 (2017).
      http://russellfunk.org/cdindex/static/funk_ms_2016.pdf
    - Wu, L., Wang, D. & Evans, J.A. Large teams develop and small teams
      disrupt science and technology. Nature 566, 378–382 (2019).
      https://doi.org/10.1038/s41586-019-0941-9
    """
    if batch_size is None:
        return _calc_disruption_index(net)

    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None.")

    # Homogenize the input data type
    net = sparse.csr_matrix(net)
    n_nodes = net.shape[0]
    DI = np.zeros(n_nodes)
    if n_nodes == 0:
        return DI

    # Ceiling division: always at least one chunk (even if batch_size exceeds
    # n_nodes) and no chunk larger than batch_size.
    n_chunks = -(-n_nodes // batch_size)
    chunks = np.array_split(np.arange(n_nodes), n_chunks)

    # Row i of netT lists the papers citing paper i, so "who cites X" becomes
    # a cheap CSR row slice instead of a column slice of net.
    netT = sparse.csr_matrix(net.T)

    for focal_node_ids in tqdm(chunks):
        # Papers needed to evaluate the focal papers, found from the sparsity
        # structure (independent of the stored weights):
        # the references of the focal papers, ...
        references = net[focal_node_ids, :].indices
        # ... the papers citing the focal papers, ...
        citers = netT[focal_node_ids, :].indices
        # ... and the papers citing at least one of those references.
        cociters = netT[np.unique(references), :].indices

        # Sorted, de-duplicated, and without the focal papers themselves.
        supp_node_ids = np.setdiff1d(
            np.concatenate([references, citers, cociters]), focal_node_ids
        )

        # Focal papers come first so their scores are the leading entries.
        node_ids = np.concatenate([focal_node_ids, supp_node_ids])
        subnet = net[node_ids, :][:, node_ids]
        subnet.sort_indices()

        dindex = _calc_disruption_index(subnet)
        DI[focal_node_ids] = dindex[: len(focal_node_ids)]
    return DI
def _calc_disruption_index(net):
"""Calculate the Disruption index given by
DI = (NF - NB) / (NR + NB + NF),
where
- NF: Number of papers citing a focal paper BUT NOT citing any of the reference of the focal paper
- NB: Number of papers citing a focal paper AND citing at least one reference of the focal paper
- NR: Number of papers not citing a focal paper but citing at least one reference of the focal paper
net: sparse scipy matrix of a citation network. net[i,j] = 1 if i cites j.
params: sparse.csr_matrix
Reference:
- Funk, R. J. & Owen-Smith, J. A dynamic network measure of technological change. Manage. Sci. 63, 791–817 (2017).
- Wu, L., Wang, D. & Evans, J.A. Large teams develop and small teams disrupt science and technology. Nature 566, 378–382 (2019). https://doi.org/10.1038/s41586-019-0941-9
http://russellfunk.org/cdindex/static/funk_ms_2016.pdf
"""
# Homogenize the input data type
net = sparse.csr_matrix(net)
net.data = net.data * 0 + 1
#
# Calculate quantities for calculating the disruption index
#
# AAT[i,j] = 1 if papers i and j cocite at least one paper.
AAT = net @ net.T
AAT.data = np.ones_like(AAT.data)
AAT.setdiag(0)
AAT.eliminate_zeros()
# AT[i,j] = 1 if papers i is cited by paper j.
AT = sparse.csr_matrix(net.copy().T)
AT.data = np.ones_like(AT.data)
# NB[i,j] = AT[i,j] * AAT[i,j], which means
# NB[i,j] = 1 if i is cited by j, and i and j co-cite at least one paper
NB = AT.multiply(AAT)
# NF[i,j] = 1 if AT[i,j] = 1 and NB[i,j] = 0, which means
# j is a paper citing a focal paper i but not cociting i's reference
NF = AT - NB
# AAT[i,j] = 1 if AAT[i,j] - NB[i,j], which means
# i and j are co-citing at least one paper and but j does not cite i
NR = AAT - NB
# Calculate the disruption
DI = (NF.sum(axis=1) - NB.sum(axis=1)) / np.maximum(
NR.sum(axis=1) + NB.sum(axis=1) + NF.sum(axis=1), 1
)
DI = np.array(DI).reshape(-1)
return DI
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment