# Analysis code for Paper on Women Publishing in Philosophy
import re, os, itertools
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import researchpy as rp
import statsmodels.api as sm
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.formula.api import glm
from matplotlib.lines import Line2D
from unicodedata import normalize

BASE_FOLDER = "data/"
IMAGES_FOLDER = "graphs/"
STATS_FOLDER = "stats/"
SUMMARIES_FOLDER = "summaries/"
TYPE_ORDER = ("Top-Philosophy", "Non-Top Philosophy", "Interdisciplinary")
REVIEW_ORDER = ("Non-Anonymous", "Double Anonymous", "Triple Anonymous")
AOS_ORDER = ("G", "H", "LEMM", "S", "V")

pd.set_option('display.max_columns', None)
pd.set_option('expand_frame_repr', False)
plt.rcParams['figure.dpi'] = 300

def calc(*argv):
    for df in argv:
        df["Proportion Female"] = df["# Female"] / (df["# Male"] + df["# Female"])
#%% Load data
print("Reading files from '%s'" % BASE_FOLDER)
authors = pd.read_csv(BASE_FOLDER + 'authors.csv', low_memory=False, encoding="cp1252").drop_duplicates(subset="aID").set_index("aID").sort_index()
papers = pd.read_csv(BASE_FOLDER + 'papers.csv', low_memory=False, encoding="cp1252").drop_duplicates(subset="pID").set_index("pID").sort_index()
journals = pd.read_csv(BASE_FOLDER + 'journals_divided.csv', low_memory=False, encoding="cp1252")
editors = pd.read_csv(BASE_FOLDER + 'editors.csv', low_memory=False, encoding="cp1252")
surveys = pd.read_csv(BASE_FOLDER + 'surveys.csv', low_memory=False, encoding="cp1252")
west_all_fields = pd.read_csv(BASE_FOLDER + 'west_allfields.csv', low_memory=False, encoding="cp1252")
west_humanities = pd.read_csv(BASE_FOLDER + 'west_humanities.csv', low_memory=False, encoding="cp1252")
#%% Merge data
print("Processing Data...")
# Decode Journal Type
journals.loc[journals.Top == "Y", "Type"] = "Top-Philosophy"
journals.loc[journals["Non Top"] == "Y", "Type"] = "Non-Top Philosophy"
journals.loc[journals["Interdisciplinary"] == "Y", "Type"] = "Interdisciplinary"
journals.drop(columns=["Top", "Non Top", "Interdisciplinary", "In Old Paper"], inplace=True)
# Rename review types
journals.loc[journals["Review type"] == "Not Blind", "Review type"] = "Non-Anonymous"
journals.loc[journals["Review type"] == "Double Blind", "Review type"] = "Double Anonymous"
journals.loc[journals["Review type"] == "Triple Blind", "Review type"] = "Triple Anonymous"
# Convert gender codes into indicator columns
authors = authors[authors.gender < 10]  # Drop unknown gender
authors["# Male"] = authors.gender == 0
authors["# Female"] = authors.gender == 1
authors["Number Authors"] = 1
# Make author names all upper case
authors.firstname = authors.firstname.str.upper()
authors.lastname = authors.lastname.str.upper()
# Drop unused columns & years
authors.drop(columns=["gender", "sequence"], inplace=True)
papers.drop(columns=["cluster", "jID", "EF", "firstauthor"], inplace=True)
papers = papers[(papers.year >= 1900) & (papers.year < 2010)]
# Create a decades variable
papers["Decade"] = (papers.year // 10) * 10
west_all_fields["Decade"] = (west_all_fields.year // 10) * 10
# Build full authors table
authors_by_pid = authors.groupby("pID").sum()
authors = authors.merge(papers, left_on="pID", right_index=True)
authors = authors.merge(journals, left_on="jtitle", right_on="jtitle")
# Build full papers table
papers = papers.merge(authors_by_pid, left_index=True, right_index=True)
papers = papers.merge(journals, left_on="jtitle", right_on="jtitle")
papers[["# Male", "# Female", "Number Authors"]] = papers[["# Male", "# Female", "Number Authors"]].astype('int64')
# Create commonly used summary tables
journals_byYear = papers.groupby(["jtitle", "year", "Decade", "Type", "AOS", "Review type"], as_index=False).sum()
journals_byDecade = papers.groupby(["jtitle", "Decade", "Type", "AOS", "Review type"], as_index=False).sum().drop(columns=["year"])
journals_all = papers.groupby(["jtitle", "Type", "AOS", "Review type"], as_index=False).sum().drop(columns=["year", "Decade"])
type_byDecade = papers.groupby(["Decade", "Type"], as_index=False).sum().drop(columns=["year"])
# Calculate Proportion Female for each Group
calc(journals_byYear, journals_all, journals_byDecade, type_byDecade)
#%% Save Summary files
print("Saving Data...")
if not os.path.exists(IMAGES_FOLDER):
    os.makedirs(IMAGES_FOLDER)
if not os.path.exists(STATS_FOLDER):
    os.makedirs(STATS_FOLDER)
if not os.path.exists(SUMMARIES_FOLDER):
    os.makedirs(SUMMARIES_FOLDER)
authors.to_csv(SUMMARIES_FOLDER + "authors_all.csv")
papers.to_csv(SUMMARIES_FOLDER + "papers_all.csv")
journals_all.to_csv(SUMMARIES_FOLDER + "journal_all_1900to2010.csv")
journals_byYear.to_csv(SUMMARIES_FOLDER + "journal_by_year.csv")
journals_byDecade.to_csv(SUMMARIES_FOLDER + "journal_by_decade.csv")
type_byDecade.to_csv(SUMMARIES_FOLDER + "type_by_decade.csv")
#%% Look at unique authorship
unique_authors = authors.groupby(["firstname", "lastname"]).sum()
unique_authors_m = unique_authors[unique_authors["# Male"] > 0]
unique_authors_f = unique_authors[unique_authors["# Female"] > 0]
#%% Generate Graphs
print("Saving Graphs...")
sns.set(rc={'figure.figsize': (9, 5)}, font="Calibri")
sns.set_style("whitegrid", {'axes.grid': False})
sns.set_palette(sns.color_palette(["#2a70a2", "#8abfdb", "#c1daf0", "#701070"]))
#%% Function To Print Sample Size
def print_sample_size(ax, df_sample_size, order, hue_order=None, offset=0.27, x_adjust=0.0, y_adjust=1.07):
    ax.text(-0.55, y_adjust, "n=", horizontalalignment='center', size='small', fontweight="bold")
    if hue_order:
        for c, g1 in enumerate(order):
            x = c - offset + x_adjust
            for g2 in hue_order:
                s = df_sample_size[(g1, g2)] if (g1, g2) in df_sample_size else ""
                ax.text(x, y_adjust, s, horizontalalignment='center', size='small')
                x += offset
    else:
        for x, g1 in enumerate(order):
            s = df_sample_size[g1] if g1 in df_sample_size else ""
            ax.text(x, y_adjust, s, horizontalalignment='center', size='small')
#%% Plot highest and lowest philosophy journals in the 2000s (Fig 1)
dfs = [journals_byDecade[(journals_byDecade.Type != "Interdisciplinary") & (journals_byDecade.Decade == 2000)].nlargest(10, "Proportion Female").sort_values(
           by=['Proportion Female'], ascending=False),
       journals_byDecade[(journals_byDecade.Type != "Interdisciplinary") & (journals_byDecade.Decade == 2000)].nsmallest(10, "Proportion Female").sort_values(
           by=['Proportion Female'], ascending=False)
       ]
fig, axs = plt.subplots(2, sharex="all")
fig.set_size_inches(9, 8)
for i in range(0, 2):
    # Hack to trim journal names for the graph
    dfs[i].loc[dfs[i].jtitle == "Erkenntnis (1975-)", "jtitle"] = "Erkenntnis"
    dfs[i].loc[dfs[i].jtitle == "Apeiron: A Journal for Ancient Philosophy and Science", "jtitle"] = "Apeiron"
    ax = axs[i]
    sns.barplot(ax=ax, x="Proportion Female", y="jtitle", hue="Type", dodge=False, data=dfs[i], hue_order=TYPE_ORDER[0:2])
plt.gcf().text(0, .5, "Journals", fontsize=12, rotation=90)
axs[0].set_title('Highest 10 Philosophy Journals (2000s)', fontsize=13, fontweight="bold")
axs[0].set(xlabel=None, ylabel=None)
axs[1].set_title('Lowest 10 Philosophy Journals (2000s)', fontsize=13, fontweight="bold")
axs[1].set(ylabel=None, xlabel="Proportion of Women Authorships", xlim=(0, 1))
axs[0].legend(loc='upper center', bbox_to_anchor=(0.6, 1.25), fancybox=False, shadow=False, frameon=False, ncol=3)
axs[1].get_legend().remove()
fig.align_ylabels(axs[:])
plt.subplots_adjust(wspace=0, hspace=0)
fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 1 - Phil Journal Comparison.tif")
plt.show()
#%% Plot journal performance across all time and in the 2000s (Fig 2)
dfs = [journals_all.nlargest(10, "Proportion Female").sort_values(by=['Proportion Female'], ascending=False),
       journals_all.nsmallest(10, "Proportion Female").sort_values(by=['Proportion Female'], ascending=False),
       journals_byDecade[journals_byDecade.Decade == 2000].nlargest(10, "Proportion Female").sort_values(by=['Proportion Female'], ascending=False),
       journals_byDecade[journals_byDecade.Decade == 2000].nsmallest(10, "Proportion Female").sort_values(by=['Proportion Female'], ascending=False),
       ]
fig, axs = plt.subplots(4, sharex="all")
fig.set_size_inches(9, 10)
for i in range(0, 4):
    # Hack to trim journal name for graph
    dfs[i].loc[dfs[i].jtitle == "Erkenntnis (1975-)", "jtitle"] = "Erkenntnis"
    ax = axs[i]
    sns.barplot(ax=ax, x="Proportion Female", y="jtitle", hue="Type", dodge=False, data=dfs[i], hue_order=TYPE_ORDER)
    ax.set(xlabel=None, ylabel="Highest 10 Journals \n(1900-2009)")
    ax.get_legend().remove()
    # Add number of samples to the right-hand side
    ax_right = ax.twinx()
    ax_right.yaxis.set_ticks(np.arange(0.5, 10, 1))
    ax_right.set_ylim(0, 10)
    ax_right.set_yticklabels(dfs[i]['Number Authors'])
    ax.tick_params(axis='both', which='both', length=0)
    ax_right.tick_params(axis='both', which='both', length=0)
axs[0].set(ylabel="Highest 10 Journals \n(1900 - 2009)", xlabel=None)
axs[1].set(ylabel="Lowest 10 Journals \n(1900 - 2009)", xlabel=None)
axs[2].set(ylabel="Highest 10 Journals \n(2000s)", xlabel=None)
axs[3].set(xlabel="Proportion of Women Authorships", ylabel="Lowest 10 Journals \n(2000s)", xlim=(0, 1))
axs[0].legend(loc='upper center', bbox_to_anchor=(0.37, 1.20), fancybox=False, shadow=False, frameon=False, ncol=3)
# Add dividing line
line = Line2D([50, 2590], [1513, 1513], color='black')
fig.lines.append(line)
plt.gcf().text(0.945, .963, "n =", fontsize=13, fontweight="bold")
fig.align_ylabels(axs[:])
plt.subplots_adjust(wspace=0, hspace=0)
fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 2 - Outliers.tif")
plt.show()
#%% BoxPlot of total proportion women 1950-2010 by type
df = journals_byYear.groupby(["Type", "jtitle", "year"], as_index=False).sum().copy()
calc(df)
ax = sns.boxplot(x="Type", y="Proportion Female", data=df, order=TYPE_ORDER, notch=True, medianprops={"color": "cyan", "lw": 4})
ax.set(xlim=(-0.5, 3.2), xlabel=None, ylabel='Proportion of Women Authorships')
df_sample_size = journals_byYear.groupby(["Type"]).size()
print_sample_size(ax, df_sample_size, order=TYPE_ORDER)
fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 3 - Category Distributions.tif")
plt.show()
#%% Plot Types by Decade (Line plot)
df_phil = journals_byYear.groupby(["Decade", "Type"], as_index=False).sum().copy()
calc(df_phil)
fig, axs = plt.subplots(2, sharex="all", gridspec_kw={'height_ratios': [1, 3]})
sns.lineplot(ax=axs[0], x="Decade", y="Number Authors", hue="Type", data=df_phil, hue_order=TYPE_ORDER, ci=None, markers=True, dashes=True, style="Type", lw=3)
sns.lineplot(ax=axs[1], x="Decade", y="Proportion Female", hue="Type", data=df_phil, hue_order=TYPE_ORDER, ci=None, markers=True, dashes=True, style="Type", lw=3)
handles, labels = axs[0].get_legend_handles_labels()
axs[0].legend(handles=handles[1:], labels=labels[1:], loc='upper center', bbox_to_anchor=(0.5, 1.35), fancybox=False, shadow=False, frameon=False, ncol=4)
axs[1].get_legend().remove()
axs[0].set(ylim=(0, 5800), yticks=[0, 2500, 5000], ylabel="Number of \n Authorships", xlabel=None)
axs[1].set(xlim=(1900, 2000), xticks=df_phil.Decade.unique(), ylabel="Proportion of \nWomen Authorships")
fig.align_ylabels(axs[:])
plt.subplots_adjust(wspace=0, hspace=0)
fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 4 - Lineplot by Decade.tif")
plt.show()
#%% Plot Types by Decade 1950-2000s (BoxPlot)
df = journals_byYear[(journals_byYear.Decade >= 1950)].copy()
calc(df)
ax = sns.boxplot(x="Decade", y="Proportion Female", hue="Type", data=df, hue_order=TYPE_ORDER, notch=True, medianprops={"color": "cyan", "lw": 4})
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.17), fancybox=False, shadow=False, frameon=False, ncol=3)
ax.set(ylabel="Proportion of Women Authorships")
df_sample_size = journals_byYear.groupby(["Decade", "Type"]).size()
print_sample_size(ax, df_sample_size, order=range(1950, 2009, 10), hue_order=TYPE_ORDER)
fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 5 - Boxplot By Decade.tif")
plt.show()
#%% BoxPlot of total proportion women 2000s by type
df = journals_byYear[(journals_byYear.Decade == 2000)].groupby(["Type", "jtitle", "year"], as_index=False).sum().copy()
calc(df)
ax = sns.boxplot(x="Type", y="Proportion Female", data=df, order=TYPE_ORDER, notch=True, medianprops={"color": "cyan", "lw": 4})
ax.set(xlabel=None, ylabel='Proportion of Women Authorships')
ax.axhline(y=0.2201996477, xmin=0.01, xmax=0.99, color="#701070", linestyle="--", lw=4)  # All Programs
handles, _ = ax.get_legend_handles_labels()
handles.append(Line2D([0], [0], color="#701070", linestyle="--", label='Proportion of Women Faculty at 98 Programs in 2010'))
ax.legend(handles=handles, loc='upper center', bbox_to_anchor=(0.5, 1.15), fancybox=False, shadow=False, frameon=False, ncol=4)
df_sample_size = journals_byYear[journals_byYear.Decade == 2000].groupby("Type").size()
print_sample_size(ax, df_sample_size, order=TYPE_ORDER)
fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 7 - BoxPlot 2000s vs Faculty 2010.tif")
plt.show()
#%% Boxplot AOS (2000s)
df = journals_byYear[journals_byYear.Decade == 2000].groupby(["Type", "jtitle", "year", "AOS"], as_index=False).sum().copy()
df = df[df.Type != "Interdisciplinary"]
calc(df)
types = ["Top-Philosophy", "Non-Top Philosophy"]
ax = sns.boxplot(x="AOS", y="Proportion Female", hue="Type", data=df, order=AOS_ORDER, hue_order=types, notch=True, medianprops={"color": "cyan", "lw": 4})
ax.axhline(y=0.245, xmin=0.01, xmax=0.19, color="#b319b3", linestyle="--", lw=4)  # Faculty G
ax.axhline(y=0.297, xmin=0.21, xmax=0.39, color="#b319b3", linestyle="--", lw=4)  # Faculty H
ax.axhline(y=0.196, xmin=0.41, xmax=0.59, color="#b319b3", linestyle="--", lw=4)  # Faculty LEMM
ax.axhline(y=0.156, xmin=0.61, xmax=0.79, color="#b319b3", linestyle="--", lw=4)  # Faculty S
ax.axhline(y=0.338, xmin=0.81, xmax=0.99, color="#b319b3", linestyle="--", lw=4)  # Faculty V
ax.set(ylabel='Proportion of Women Authorships', xlabel="Area of Specialization")
handles, _ = ax.get_legend_handles_labels()
handles.append(Line2D([0], [0], color="#b319b3", linestyle="--", label='Faculty (2014)'))
ax.legend(handles=handles, loc='upper center', bbox_to_anchor=(0.5, 1.15), fancybox=False, shadow=False, frameon=False, ncol=4)
df_sample_size = journals_byYear[journals_byYear.Decade == 2000].groupby(["AOS", "Type"]).size()
print_sample_size(ax, df_sample_size, order=AOS_ORDER, hue_order=types, offset=0.35, x_adjust=0.2)
fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 8 - Boxplot 2000s AOS vs Faculty 2014 AOS.tif")
plt.show()
#%% Boxplot review type by Journal 2000s
df = journals_byYear[journals_byYear.Decade == 2000].groupby(["Type", "jtitle", "year", "Review type"], as_index=False).sum().copy()
calc(df)
ax = sns.boxplot(x="Review type", y="Proportion Female", hue="Type", data=df, order=REVIEW_ORDER, hue_order=TYPE_ORDER, notch=True, medianprops={"color": "cyan", "lw": 4})
ax.set(ylabel='Proportion of Women Authorships', xlabel="Review Type")
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), fancybox=False, shadow=False, frameon=False, ncol=4)
df_sample_size = journals_byYear[journals_byYear.Decade == 2000].groupby(["Review type", "Type"]).size()
print_sample_size(ax, df_sample_size, order=REVIEW_ORDER, hue_order=TYPE_ORDER)
fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 10 - Boxplot 2000s Review Type.tif")
plt.show()
#%%% Setup stats func
def stats_uni(data, group1, filename, group2="Type", norm_value="Proportion Female", nb_value="# Female", useOffset=True):
    def slugify(text, delim=''):
        _punctuation_re = re.compile(r'[\t !"#$%&\'()*\-/<=>?@\[\\\]^_`{|},.]+')
        result = []
        for word in _punctuation_re.split(text.lower()):
            word = normalize('NFKD', word) \
                .encode('ascii', 'ignore') \
                .decode('utf-8')
            if word:
                result.append(word)
        return delim.join(result)

    def lrtest(llmin, llmax):
        lr = 2 * (llmax - llmin)
        p = stats.chi2.sf(lr, 1)  # llmax has 1 dof more than llmin
        return lr, p
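    # lrtest() above is a standard likelihood-ratio comparison for nested models:
    # LR = 2 * (llf_full - llf_reduced), with the p-value taken from a chi-squared
    # distribution (1 degree of freedom, per the comment above).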
    NB_V = slugify(nb_value)
    NORM_V = slugify(norm_value)
    G1 = slugify(group1)
    G2 = slugify(group2)
    totalResults = pd.DataFrame()
    for jType in TYPE_ORDER:
        file = open(f"{STATS_FOLDER}{filename}_for_{jType}.txt", 'w')
        df = data[data.Type == jType].copy()
        calc(df)
        # Relabel with model-friendly variable names
        df.rename(columns={norm_value: NORM_V, nb_value: NB_V, group1: G1}, inplace=True)
        keys = pd.DataFrame(data={G1: df[G1].unique()})
        offset = np.log(df["Number Authors"]) if useOffset else None
        # Print quick stats summary
        print(f"{filename} :: Grouped by G1='{group1}' \n\n", file=file)
        summary = rp.summary_cont(df[[NORM_V, G1]].groupby([G1]))
        print(summary, end="\n\n", file=file)
        # Run GLM using a negative binomial distribution with log(# authors) as offset
        model_glm_nb = glm(f" {NB_V} ~ C({G1})", data=df, family=sm.families.NegativeBinomial(), offset=offset).fit()
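        # With a log link and offset = log(Number Authors), the NB model above estimates
        # counts of women per authorship, so exponentiated coefficients can be read as
        # rate ratios between groups (hence the printed note about exponentiation).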
print("Note: NB model coeficients need to be exponentiated before use", file=file) | |
print(model_glm_nb.summary(), end="\n\n", file=file) | |
print(model_glm_nb.t_test_pairwise(f"C({G1})").result_frame, end="\n\n", file=file) | |
results = model_glm_nb.get_prediction(keys).summary_frame().merge(keys, left_index=True, right_index=True) | |
results["Type"] = jType | |
results["Model"] = "GLM_NB" | |
totalResults = totalResults.append(results) | |
# RUN GLM using a Gaussian distribution | |
model_glm_norm = glm(f" {NORM_V} ~ C({G1})", data=df).fit() | |
print(model_glm_norm.summary(), end="\n\n", file=file) | |
print(model_glm_norm.t_test_pairwise(f"C({G1})").result_frame, end="\n\n", file=file) | |
results = model_glm_norm.get_prediction(keys).summary_frame().merge(keys, left_index=True, right_index=True) | |
results["Type"] = jType | |
results["Model"] = "GLM_NORM" | |
totalResults = totalResults.append(results) | |
# RUN OLS using a Gaussian distribution | |
model_ols_norm = ols(f" {NORM_V} ~ C({G1})", data=df).fit() | |
print(model_ols_norm.summary(), end="\n\n", file=file) | |
print(model_ols_norm.t_test_pairwise(f"C({G1})").result_frame, end="\n\n", file=file) | |
results = model_ols_norm.get_prediction(keys).summary_frame().merge(keys, left_index=True, right_index=True) | |
results["Type"] = jType | |
results["Model"] = "OLS_NORM" | |
totalResults = totalResults.append(results) | |
# RUN basic Anova Tests | |
print("\n--[Anova Table]--:", file=file) | |
aov = sm.stats.anova_lm(model_ols_norm, typ=3) | |
aov['mean_sq'] = aov[:]['sum_sq'] / aov[:]['df'] | |
aov['eta_sq (R2)'] = aov[:-1]['sum_sq'] / aov['sum_sq'].sum() | |
aov['omega_sq'] = (aov[:-1]['sum_sq'] - (aov[:-1]['df'] * aov['mean_sq'][-1])) / (aov['sum_sq'].sum() + aov['mean_sq'][-1]) | |
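        # Effect sizes follow the usual ANOVA definitions (treating the last row as the
        # residual): eta_sq = SS_effect / SS_total and
        # omega_sq = (SS_effect - df_effect * MS_error) / (SS_total + MS_error).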
        print(aov, file=file)
        w, pvalue = stats.shapiro(model_ols_norm.resid)
        print("\n--[Shapiro-Wilk]--\n w=%f p=%f" % (w, pvalue), file=file)
        file.close()
    # Run GLM without stratification
    file = open(f"{STATS_FOLDER}{filename}_multi.txt", 'w')
    df = data.copy()
    calc(df)
    # Relabel with model-friendly variable names
    df.rename(columns={norm_value: NORM_V, nb_value: NB_V, group1: G1, group2: G2}, inplace=True)
    offset = np.log(df["Number Authors"]) if useOffset else None
    # Fit NB GLMs with both factors and with each factor alone, then compare them by likelihood ratio
    model_glm_mixed = glm(f" {NB_V} ~ C({G1}) + C({G2})", data=df, family=sm.families.NegativeBinomial(), offset=offset).fit()
    model_glm_g1 = glm(f" {NB_V} ~ C({G1})", data=df, family=sm.families.NegativeBinomial(), offset=offset).fit()
    model_glm_g2 = glm(f" {NB_V} ~ C({G2})", data=df, family=sm.families.NegativeBinomial(), offset=offset).fit()
    print("Note: NB model coefficients need to be exponentiated before use", file=file)
    print(f"{filename} :: Grouped by '{group1}' * '{group2}' \n\n", file=file)
    print(model_glm_mixed.summary(), end="\n\n", file=file)
    print(f"{filename} :: Grouped by '{group1}' \n\n", file=file)
    print(model_glm_g1.summary(), end="\n\n", file=file)
    lr, p = lrtest(model_glm_g1.llf, model_glm_mixed.llf)
    print('LR test, p value: {:.2f}, {:.4f}'.format(lr, p), end="\n\n", file=file)
    print(f"{filename} :: Grouped by '{group2}' \n\n", file=file)
    print(model_glm_g2.summary(), end="\n\n", file=file)
    lr, p = lrtest(model_glm_g2.llf, model_glm_mixed.llf)
    print('LR test, p value: {:.2f}, {:.4f}'.format(lr, p), end="\n\n", file=file)
    totalResults.to_csv(f"{STATS_FOLDER}{filename}_Estimates.csv")
    return totalResults
#%%% Run Stats on Journal x Year Pairs
df = journals_byYear[(journals_byYear.Decade >= 1950)].copy()
model_proportion_female = stats_uni(df, "Decade", "Journals_by_Decade")
model_num_female = stats_uni(df, "Decade", "Women_by_Decade", norm_value="# Female", nb_value="# Female", useOffset=False)
model_num_male = stats_uni(df, "Decade", "Men_by_Decade", norm_value="# Male", nb_value="# Male", useOffset=False)
#%%% Run Stats on Paper x Year Pairs
df = papers[(papers.Decade == 2000)].copy()
model_aos = stats_uni(df, "AOS", "Papers_by_AOS")
model_review = stats_uni(df, "Review type", "Papers_by_Review_type")
#%%% Graphs of GLM Model Proportion Female
model = "GLM_NB"
fig, ax = plt.subplots()
df = model_proportion_female[model_proportion_female.Model == model].sort_values(by=['decade'])
sns.lineplot(ax=ax, x="decade", y="mean", hue="Type", data=df, hue_order=TYPE_ORDER, markers=True, dashes=True, style="Type", lw=3)
ax.set(xlim=(1950, 2000), ylim=(0, 0.30), ylabel="Proportion of Women Authorships (Estimated)", xlabel="Decade")
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles[1:], labels=labels[1:], loc='upper center', bbox_to_anchor=(0.5, 1.1), fancybox=False,
          shadow=False, frameon=False, ncol=3)
for jType in TYPE_ORDER:
    series = df.loc[df.Type == jType]
    ax.fill_between(series["decade"], series["mean_ci_lower"], series["mean_ci_upper"], alpha=.2)
fig.tight_layout()
plt.savefig(f"{IMAGES_FOLDER}Fig 6 - {model} Proportion By Decade.tif")
plt.show()
#%%% Graphs of GLM Model Number of Women vs Men
model = "GLM_NB"
fig, axs = plt.subplots(2, sharex="all", gridspec_kw={'height_ratios': [1, 3]})
fig.set_size_inches(9, 7)
df_female = model_num_female[model_num_female.Model == model].sort_values(by=['decade'])
df_male = model_num_male[model_num_male.Model == model].sort_values(by=['decade'])
sns.lineplot(ax=axs[0], x="decade", y="mean", hue="Type", data=df_female, hue_order=TYPE_ORDER, markers=True, dashes=True, style="Type", lw=3)
sns.lineplot(ax=axs[1], x="decade", y="mean", hue="Type", data=df_male, hue_order=TYPE_ORDER, markers=True, dashes=True, style="Type", lw=3)
for jType in TYPE_ORDER:
    series_f = df_female.loc[df_female.Type == jType]
    series_m = df_male.loc[df_male.Type == jType]
    axs[0].fill_between(series_f["decade"], series_f["mean_ci_lower"], series_f["mean_ci_upper"], alpha=.2)
    axs[1].fill_between(series_m["decade"], series_m["mean_ci_lower"], series_m["mean_ci_upper"], alpha=.2)
handles, labels = axs[0].get_legend_handles_labels()
axs[0].legend(handles=handles[1:], labels=labels[1:], loc='upper center', bbox_to_anchor=(0.5, 1.25),
              fancybox=False, shadow=False, frameon=False, ncol=3)
axs[1].get_legend().remove()
axs[0].set(ylim=(0, 10), yticks=[0, 5, 10], ylabel="Estimated Average\n Number of Women")
axs[1].set(ylim=(0, 30), xlim=(1950, 2000), ylabel="Estimated Average\n Number of Men", xlabel="Decade")
fig.align_ylabels(axs[:])
plt.subplots_adjust(wspace=0, hspace=0)
fig.tight_layout()
plt.savefig(f"{IMAGES_FOLDER}Fig 13 - {model} Number Women vs Men.tif")
plt.show()
#%%% Graphs of GLM Model for AOS
model = "GLM_NB"
fig, ax = plt.subplots()
df = model_aos[(model_aos.Model == model) & (model_aos.Type != "Interdisciplinary")].sort_values(by=["Type"], ascending=False).sort_values(by=['aos'])
df['aos'] = df['aos'].replace(to_replace={'G': 'General\n Specialization (G)', 'H': 'Historical\n Philosophy (H)', 'LEMM': 'Language,\n Epistemology,\n Metaphysics,\n and Mind (LEMM)', 'S': 'Logic and \nPhilosophy of \nScience (S)', 'V': 'Value Theory (V)'})
types = ["Top-Philosophy", "Non-Top Philosophy"]
sns.barplot(ax=ax, x="aos", y="mean", hue="Type", data=df, hue_order=types)
ax.set(ylabel="Proportion of Women Authorships (Estimated)", xlabel="", ylim=(0, 0.52))
ax.axhline(y=0.245, xmin=0.05, xmax=0.19, color="#b319b3", linestyle="--", lw=4)  # Faculty G
ax.axhline(y=0.297, xmin=0.25, xmax=0.39, color="#b319b3", linestyle="--", lw=4)  # Faculty H
ax.axhline(y=0.196, xmin=0.42, xmax=0.58, color="#b319b3", linestyle="--", lw=4)  # Faculty LEMM
ax.axhline(y=0.156, xmin=0.61, xmax=0.75, color="#b319b3", linestyle="--", lw=4)  # Faculty S
ax.axhline(y=0.338, xmin=0.81, xmax=0.95, color="#b319b3", linestyle="--", lw=4)  # Faculty V
handles, _ = ax.get_legend_handles_labels()
handles.append(Line2D([0], [0], color="#b319b3", linestyle="--", label='Faculty (2014)'))
ax.legend(handles=handles, loc='upper center', bbox_to_anchor=(0.5, 1.15), fancybox=False,
          shadow=False, frameon=False, ncol=3)
# Hack to get error bars to print right on dodge bar plot
yerr = [df['mean'] - df['mean_ci_lower'], df['mean_ci_upper'] - df['mean']]
print("Warning!! using a workaround to generate CI on the bar plot, please double check numbers")
ax.errorbar([-0.2, 0.2, 1.2, 1.8, 2.2, 2.8, 3.8, 4.2], df["mean"], yerr=yerr, fmt='none', c='black', capsize=5)
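# The hard-coded x positions above approximate seaborn's dodge offsets for grouped
# bars (a total width of about 0.8 split across the hue levels, i.e. roughly +/-0.2
# with two hues), skipping category/hue combinations with no data; if the categories
# or hue levels change, these positions would need to be recomputed.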
df_sample_size = papers[(papers.Decade == 2000)].groupby(["AOS", "Type"]).size()
print_sample_size(ax, df_sample_size, order=AOS_ORDER, hue_order=types, y_adjust=0.53, offset=0.35, x_adjust=0.2)
fig.tight_layout()
plt.savefig(f"{IMAGES_FOLDER}Fig 9 - {model} 2000s vs AOS Faculty 2014.tif")
plt.show()
#%%% Graphs of GLM Model for Review Type
model = "GLM_NB"
fig, ax = plt.subplots()
df = model_review[(model_review.Model == model)].copy()
df.loc[df.reviewtype == "Non-Anonymous", "sort"] = 1
df.loc[df.reviewtype == "Double Anonymous", "sort"] = 2
df.loc[df.reviewtype == "Triple Anonymous", "sort"] = 3
df = df.sort_values(by=["Type"], ascending=False).sort_values(by=["sort"])
sns.barplot(ax=ax, x="reviewtype", y="mean", hue="Type", data=df, hue_order=TYPE_ORDER)
ax.set(ylabel="Proportion of Women Authorships (Estimated)", xlabel="Review Type")
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), fancybox=False, shadow=False, frameon=False, ncol=3)
# Hack to get error bars to print right on dodge bar plot
yerr = [df['mean'] - df['mean_ci_lower'], df['mean_ci_upper'] - df['mean']]
print("Warning!! using a workaround to generate CI on the bar plot, please double check numbers")
ax.errorbar([-0.27, 0, 0.27, 0.73, 1, 1.27, 1.73, 2.27], df["mean"], yerr=yerr, fmt='none', c='black', capsize=5)
df_sample_size = papers[(papers.Decade == 2000)].groupby(["Review type", "Type"]).size()
print_sample_size(ax, df_sample_size, order=REVIEW_ORDER, hue_order=TYPE_ORDER, y_adjust=0.53)
fig.tight_layout()
plt.savefig(f"{IMAGES_FOLDER}Fig 11 - {model} 2000s Review Type.tif")
plt.show()
#%% Plot All Humanities vs Phil
df_west_humanities = west_humanities.copy()
df_west_all = west_all_fields.groupby(["Decade"], as_index=False).sum()
df_phil_combined = journals_byYear.groupby(["Decade"], as_index=False).sum().copy()
calc(df_phil_combined, df_west_all)
fig, ax = plt.subplots()
ax.plot(df_phil_combined["Decade"], df_phil_combined["Proportion Female"], label="All Philosophy", color="blue", linestyle="-", marker="o")
ax.plot(df_west_all["Decade"], df_west_all["Proportion Female"], label="All Fields", color="green", linestyle=":", marker="s")
ax.plot(df_west_humanities["Decade"], df_west_humanities["Proportion Female"], label="Humanities", color="goldenrod", linestyle="--", marker="d")
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), fancybox=False, shadow=False, frameon=False, ncol=3)
ax.set(xlim=(1900, 2000), xticks=df_phil.Decade.unique(), ylabel="Proportion of Women Authorships")
fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 12 - Humanities vs Phil.tif")
plt.show()
#%% Plot CDF of the number of papers in each (journal, year) pair
fig, axs = plt.subplots(2)
fig.set_size_inches(9, 7)
axs[0].set(ylabel="Probability", xlabel="Number of Papers in (Journal,Year) Pairs", xlim=(0, 120))
axs[0].hist(papers.groupby(["jtitle", "year"], as_index=False).count()["title"], bins=112, range=(0, 150), density=True, cumulative=True, histtype='step')
axs[1].set(ylabel="Probability", xlabel="Number of Papers in (Journal,Decade) Pairs", xlim=(0, 600))
axs[1].hist(papers[(papers.Decade >= 1950)].groupby(["jtitle", "Decade"], as_index=False).count()["title"], bins=595, range=(0, 650), density=True, cumulative=True, histtype='step')
fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 14 - CDF Papers in Year Pairs.tif")
plt.show()
#%% Plot histogram of the number of papers in each (journal, year) pair
fig, axs = plt.subplots(2)
fig.set_size_inches(9, 7)
axs[0].set(ylabel="Count", xlabel="Number of Papers in (Journal,Year) Pairs 1900-2009")
axs[0].hist(papers.groupby(["jtitle", "year"], as_index=False).count()["title"], bins=100, label="Papers in journal in a year", range=(0, 112), histtype='step')
axs[1].set(ylabel="Count", xlabel="Number of Papers in (Journal,Decade) Pairs 1900-2009")
axs[1].hist(papers.groupby(["jtitle", "Decade"], as_index=False).count()["title"], bins=100, label="Papers in journal in a decade", range=(0, 595), histtype='step')
fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 14 (alt) - Hist Papers in Year Pairs.tif")
plt.show()