Created
November 18, 2024 00:22
-
-
Save luispedro/b5c4a9a3dd71a54595209656861734e3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%matplotlib qt | |
from scipy.stats import pearsonr,spearmanr | |
from scipy import stats | |
from matplotlib import pyplot as plt | |
import pandas as pd | |
import numpy as np | |
import seaborn as sns | |
# Read the third worksheet (sheet_name is a 0-based position) and keep only
# its first twelve columns.
data = pd.read_excel('./062109-1.xlsx', sheet_name=2)
data = data[data.columns[:12]]

# The first kept column becomes the row index; missing cells are filled with 0.
data = data.set_index(data.columns[0]).fillna(0)

# Numeric citation value for every row, in row order. The open-ended '100+'
# label is collapsed to exactly 100 so the column can be cast to int.
citations = (
    data.reset_index()['Citations']
    .replace('100+', 100)
    .astype(int)
)
# For each column: mean of `citations` weighted by that column's values,
# i.e. sum(value * citations) / sum(value). Used below as an estimate of the
# journal's impact factor. Because '100+' is counted as 100, this is
# presumably a lower bound for columns with entries in that row.
jif_est = {
    journal: (data[journal].values * citations).sum() / data[journal].sum()
    for journal in data.columns
}

# Running totals down each column; random_select() searches these to pick a row.
cumdata = data.cumsum(axis=0)
def random_select(col):
    """Draw one row position from column *col*, weighted by the column's values.

    A uniform integer ``p`` is drawn from ``[0, total)``, where ``total`` is the
    last entry of ``cumdata[col]``, and mapped to the first row whose running
    total is strictly greater than ``p``. Row ``i`` therefore owns exactly
    ``data[col][i]`` of the ``total`` possible draws, and rows whose value is 0
    are never returned.

    Parameters
    ----------
    col : label of a column in the module-level ``cumdata`` frame.

    Returns
    -------
    The 0-based position of the selected row.
    NOTE(review): callers treat this position as a citation count, which
    presumably relies on the rows being ordered 0, 1, 2, ... citations --
    confirm against the spreadsheet.

    Raises
    ------
    ValueError
        If the column total is not positive (raised by ``np.random.randint``).
    """
    running_totals = cumdata[col]
    # The totals may be stored as floats; randint expects an integer bound.
    total = int(running_totals.iloc[-1])
    p = np.random.randint(total)
    # side='right' -> first index with running_totals[i] > p. The default
    # (side='left') uses >=, which maps p == running_totals[i] to row i instead
    # of row i + 1: draws are biased downward and p == 0 can select a leading
    # row whose count is 0.
    return np.searchsorted(running_totals, p, side='right')
def pair_compare(col1, col2):
    """Draw one row from each column and report whether col1's draw is lower.

    ``col1`` is sampled first, then ``col2``; the result is the truth value of
    ``random_select(col1) < random_select(col2)`` (ties count as False).
    """
    return random_select(col1) < random_select(col2)
def pair_compare_many(col1, col2, n):
    """Return the fraction of *n* trials in which pair_compare(col1, col2) held.

    Each trial draws a fresh pair of rows, so the result is a Monte Carlo
    estimate of P(draw from col1 < draw from col2).

    Raises ZeroDivisionError when ``n`` is 0.
    """
    wins = 0
    for _ in range(n):
        # A true comparison adds 1, a false one adds 0.
        wins += pair_compare(col1, col2)
    return wins / n
# Bucket the journals by estimated impact factor:
#   low:    estimate < 5
#   medium: 5 <= estimate < 15
#   high:   estimate >= 15
# A NaN estimate fails every comparison and lands in no bucket.
low, medium, high = [], [], []
for journal in data.columns:
    estimate = jif_est[journal]
    if estimate < 5:
        low.append(journal)
    elif estimate < 15:
        medium.append(journal)
    elif estimate >= 15:
        high.append(journal)
# One row per unordered pair of journals (each pair visited once, in column
# order). The third field is the estimated probability that a random draw
# from 'Journal 1' is strictly lower than a random draw from 'Journal 2'.
compares = pd.DataFrame(
    [
        (first, second, pair_compare_many(first, second, 10_000))
        for pos, first in enumerate(data.columns)
        for second in data.columns[pos + 1:]
    ],
    columns=['Journal 1', 'Journal 2', 'P(J1->citations)'],
)
print(compares)
def select_category(cat):
    """Pick one journal at random from the named impact-factor bucket.

    Parameters
    ----------
    cat : 'low', 'medium' or 'high', selecting the module-level list of the
        same name.

    Returns
    -------
    One element of that list, chosen by ``np.random.choice``.

    Raises
    ------
    ValueError
        If ``cat`` is not one of the three known names.
    """
    for label, journals in (('low', low), ('medium', medium), ('high', high)):
        if cat == label:
            return np.random.choice(journals)
    raise ValueError(f'Unknown category {cat}')
def compare_category(cat1, cat2):
    """Compare one random journal from *cat1* with one random journal from *cat2*.

    A journal is picked from each category (cat1 first), then
    pair_compare_many() is run on that fixed pair with 10,000 trials.

    NOTE(review): only a single journal pair is sampled per call, so the
    result reflects that pair rather than the categories as a whole --
    confirm this is intended.
    """
    journal1 = select_category(cat1)
    journal2 = select_category(cat2)
    return pair_compare_many(journal1, journal2, 10_000)
# Every ordered pair of categories, including a category against itself
# (3 x 3 = 9 rows), each scored with compare_category().
cat_comps = pd.DataFrame(
    [
        (first, second, compare_category(first, second))
        for first in ['low', 'medium', 'high']
        for second in ['low', 'medium', 'high']
    ],
    columns=['Category 1', 'Category 2', 'P(JIF->citations)'],
)
print(cat_comps)
# One (x, y) point per unordered journal pair:
#   x = estimated JIF of the second journal minus that of the first
#   y = estimated P(random draw from first < random draw from second)
# The probabilities are re-simulated here rather than taken from `compares`.
jif_prob = np.array([
    (jif_est[second] - jif_est[first], pair_compare_many(first, second, 10_000))
    for pos, first in enumerate(data.columns)
    for second in data.columns[pos + 1:]
])

# Pearson correlation matrix between the two columns. The value is not
# stored; it is only visible when run interactively.
np.corrcoef(jif_prob[:, 0], jif_prob[:, 1])
# Scatter of JIF difference against the simulated comparison probability,
# written out as both PNG and SVG.
# NOTE(review): the y value is P(first journal's draw < second journal's
# draw), so the y-axis label only reads literally where the x value is
# positive -- confirm the columns are ordered by increasing JIF.
fig, ax = plt.subplots()
ax.scatter(jif_prob[:, 0], jif_prob[:, 1])
ax.set(
    xlabel='JIF difference (estimated)',
    ylabel='P(random paper in higher JIF journal has more citations)',
)
sns.despine(fig, trim=True)
fig.tight_layout()
for out_path in ('jif_prob.png', 'jif_prob.svg'):
    fig.savefig(out_path)
def _sanity_row(journal):
    """Simulate 10,000 draws for *journal* and summarise them next to its JIF estimate."""
    simul = [random_select(journal) for _ in range(10_000)]
    return (journal, jif_est[journal], np.mean(simul), np.median(simul))


# Sanity check: the mean of the simulated draws ('Random citation count')
# should track the JIF estimate computed directly from the table.
sanity = pd.DataFrame(
    [_sanity_row(journal) for journal in data.columns],
    columns=['Journal', 'JIF estimate', 'Random citation count', 'Median citation count'],
)
print(sanity)

# Rank correlation between the simulated median and mean. The value is not
# stored; it is only visible when run interactively.
stats.spearmanr(sanity['Median citation count'], sanity['Random citation count'])
# 100 simulated draws per journal, each stored alongside the journal's JIF
# estimate.
jif_cites = pd.DataFrame(
    [
        (journal, jif_est[journal], random_select(journal))
        for journal in data.columns
        for _ in range(100)
    ],
    columns=['Journal', 'JIF estimate', 'Citations'],
)

# Rank correlation between a journal's JIF estimate and individual draws.
# The value is not stored; it is only visible when run interactively.
spearmanr(jif_cites['JIF estimate'], jif_cites['Citations'])

# Translucent markers so that overlapping draws show up as darker regions.
fig, ax = plt.subplots()
ax.clear()
ax.scatter(jif_cites['JIF estimate'], jif_cites['Citations'], alpha=0.1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment