Skip to content

Instantly share code, notes, and snippets.

@luispedro
Created November 18, 2024 00:22
Show Gist options
  • Save luispedro/b5c4a9a3dd71a54595209656861734e3 to your computer and use it in GitHub Desktop.
Save luispedro/b5c4a9a3dd71a54595209656861734e3 to your computer and use it in GitHub Desktop.
%matplotlib qt
from scipy.stats import pearsonr,spearmanr
from scipy import stats
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# Read the third worksheet (sheet_name is zero-based) and keep only its
# first 12 columns.
data = pd.read_excel('./062109-1.xlsx', sheet_name=2).iloc[:, :12]
# The first retained column becomes the row index; empty cells are then
# treated as a count of 0.
data = data.set_index(data.columns[0]).fillna(0)
# Numeric version of the 'Citations' index column, positionally aligned with
# the rows of `data`.  The open-ended '100+' label is mapped to 100 so the
# column can be cast to int.
citations = (
    data.reset_index()['Citations']
    .replace('100+', 100)
    .astype(int)
)
# Per-column weighted mean of the citation values, using the column's own
# entries as weights.
# NOTE(review): presumably each column is one journal and each entry is the
# number of papers with that many citations -- confirm against the workbook.
jif_est = {
    journal: (data[journal].values * citations).sum() / data[journal].sum()
    for journal in data.columns
}
# Running totals down the rows of every column; used for weighted sampling.
cumdata = data.cumsum(axis='index')
def random_select(col, cumulative=None):
    """Draw one row position at random, weighted by the counts behind ``col``.

    An integer ``p`` is drawn uniformly from ``[0, total)`` where ``total`` is
    the last cumulative value of the column, and the position of the first
    row whose cumulative count is strictly greater than ``p`` is returned.
    Each row is therefore chosen with probability ``count / total`` and rows
    with a count of zero are never chosen.

    Parameters
    ----------
    col
        Column label to sample from.
    cumulative
        Optional table of cumulative counts indexable as ``cumulative[col]``.
        Defaults to the module-level ``cumdata``.

    Returns
    -------
    The zero-based row position of the selected bin.
    NOTE(review): callers treat this position as a citation count, which
    assumes the rows are ordered 0, 1, 2, ... citations -- confirm.

    Raises
    ------
    ValueError
        If the column's total count is not positive.
    """
    if cumulative is None:
        cumulative = cumdata
    running_totals = cumulative[col]
    # The totals may be stored as floats (e.g. after filling missing cells),
    # so coerce to a plain int before handing it to the RNG.
    total = int(running_totals.iloc[-1])
    if total <= 0:
        raise ValueError(f'Column {col!r} has no counts to sample from')
    p = np.random.randint(total)
    # side='right' finds the first bin whose cumulative count exceeds p.  The
    # default side='left' would put a draw that equals a bin boundary into
    # the previous bin, biasing the sample towards lower rows and allowing
    # empty leading bins to be selected when p == 0.
    return np.searchsorted(running_totals, p, side='right')
def pair_compare(col1, col2):
    """Sample one value from each column and compare them.

    Draws via ``random_select`` from ``col1`` first, then from ``col2``.

    Returns a true value when the draw from ``col1`` is strictly smaller than
    the draw from ``col2``; ties count as false.
    """
    first_draw, second_draw = random_select(col1), random_select(col2)
    return first_draw < second_draw
def pair_compare_many(col1, col2, n):
    """Estimate how often a draw from ``col1`` is below a draw from ``col2``.

    Runs ``pair_compare(col1, col2)`` ``n`` times and returns the fraction of
    runs that came out true.
    """
    wins = 0
    for _ in range(n):
        wins += pair_compare(col1, col2)
    return wins / n
# Split the columns into three tiers by estimated JIF:
#   low: below 5, medium: from 5 up to (not including) 15, high: 15 or more.
low, medium, high = [], [], []
for journal in data.columns:
    estimate = jif_est[journal]
    if estimate < 5:
        low.append(journal)
    elif 5 <= estimate < 15:
        medium.append(journal)
    elif estimate >= 15:
        high.append(journal)
# For every unordered pair of columns (each pair visited once, in column
# order), estimate from 10,000 simulated draws how often the first column's
# draw is strictly below the second column's draw.
compares = pd.DataFrame(
    [
        (first, second, pair_compare_many(first, second, 10_000))
        for position, first in enumerate(data.columns)
        for second in data.columns[position + 1:]
    ],
    columns=['Journal 1', 'Journal 2', 'P(J1->citations)'],
)
print(compares)
def select_category(cat):
    """Pick one journal at random from the named JIF tier.

    ``cat`` must be ``'low'``, ``'medium'`` or ``'high'``; the journal is
    drawn with ``np.random.choice`` from the matching module-level list.

    Raises
    ------
    ValueError
        If ``cat`` is not one of the three known tier names.
    """
    if cat == 'low':
        pool = low
    elif cat == 'medium':
        pool = medium
    elif cat == 'high':
        pool = high
    else:
        raise ValueError(f'Unknown category {cat}')
    return np.random.choice(pool)
def compare_category(cat1, cat2):
    """Compare one random journal from ``cat1`` with one from ``cat2``.

    A journal is picked from each tier (``cat1`` first), then
    ``pair_compare_many`` is run on that pair with 10,000 trials and its
    result is returned.

    NOTE(review): the same two journals are used for all 10,000 trials, so
    the value reflects that particular pair rather than the tiers as a whole
    -- confirm this is intended.
    """
    journal_a = select_category(cat1)
    journal_b = select_category(cat2)
    return pair_compare_many(journal_a, journal_b, 10_000)
# Run compare_category for all nine ordered tier combinations (including a
# tier against itself) and tabulate the results.
cat_comps = pd.DataFrame(
    [
        (first, second, compare_category(first, second))
        for first in ['low', 'medium', 'high']
        for second in ['low', 'medium', 'high']
    ],
    columns=['Category 1', 'Category 2', 'P(JIF->citations)'],
)
print(cat_comps)
# For every unordered pair of columns, record
#   column 0: estimated JIF of the second journal minus that of the first
#   column 1: simulated fraction of trials where the first journal's draw is
#             strictly below the second journal's draw (10,000 trials)
# The simulation is re-run here rather than reusing earlier results.
jif_prob = np.array([
    (jif_est[later] - jif_est[earlier],
     pair_compare_many(earlier, later, 10_000))
    for position, earlier in enumerate(data.columns)
    for later in data.columns[position + 1:]
])
# Correlation between JIF difference and the simulated fraction.  The result
# is neither stored nor printed, so it is only visible in an interactive
# session.
np.corrcoef(jif_prob[:, 0], jif_prob[:, 1])
# Scatter plot of JIF difference (x) against the simulated fraction (y),
# written out as both PNG and SVG.
fig, ax = plt.subplots()
ax.scatter(jif_prob[:, 0], jif_prob[:, 1])
ax.set(
    xlabel='JIF difference (estimated)',
    ylabel='P(random paper in higher JIF journal has more citations)',
)
sns.despine(fig=fig, trim=True)
fig.tight_layout()
for outfile in ('jif_prob.png', 'jif_prob.svg'):
    fig.savefig(outfile)
# Sanity check: for each column, draw 10,000 samples and put their mean and
# median next to the JIF estimate computed directly from the table.
sanity = []
for journal in data.columns:
    draws = [random_select(journal) for _ in range(10_000)]
    mean_draw = np.mean(draws)
    median_draw = np.median(draws)
    sanity.append((journal, jif_est[journal], mean_draw, median_draw))
sanity = pd.DataFrame(
    sanity,
    columns=[
        'Journal',
        'JIF estimate',
        'Random citation count',
        'Median citation count',
    ],
)
print(sanity)
# Rank correlation between the simulated median and mean; the result is not
# stored or printed, so it only shows up in an interactive session.
stats.spearmanr(
    sanity['Median citation count'],
    sanity['Random citation count'],
)
# Draw 100 samples per column and pair each one with that column's JIF
# estimate, giving one row per simulated paper.
jif_cites = pd.DataFrame(
    [
        (journal, jif_est[journal], random_select(journal))
        for journal in data.columns
        for _ in range(100)
    ],
    columns=['Journal', 'JIF estimate', 'Citations'],
)
# Rank correlation between a journal's JIF estimate and the sampled values;
# the result is not stored or printed (interactive use only).
spearmanr(jif_cites['JIF estimate'], jif_cites['Citations'])
# Scatter of JIF estimate against the sampled values; the low alpha lets
# overlapping points show up as darker regions.
fig, ax = plt.subplots()
# NOTE(review): clearing a freshly created axes looks redundant -- possibly
# left over from interactive re-plotting.
ax.clear()
ax.scatter(
    x=jif_cites['JIF estimate'],
    y=jif_cites['Citations'],
    alpha=0.1,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment