mikrasov/jstor_demographics_analysis.py

## jstor_demographics_analysis.py
import re, os, itertools
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import researchpy as rp
import statsmodels.api as sm
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.formula.api import glm
from matplotlib.lines import Line2D
from unicodedata import normalize

BASE_FOLDER = "data/"
IMAGES_FOLDER = "graphs/"
STATS_FOLDER = "stats/"
SUMMARIES_FOLDER = "summaries/"

TYPE_ORDER = ("Top-Philosophy","Non-Top Philosophy","Interdisciplinary")
REVIEW_ORDER = ("Non-Anonymous","Double Anonymous","Triple Anonymous")
AOS_ORDER = ("G","H", "LEMM","S","V")

pd.set_option('display.max_columns', None)
pd.set_option('expand_frame_repr', False)

plt.rcParams['figure.dpi'] = 300

def calc(*argv):
    for df in argv:
        df["Proportion Female"] = (df["# Female"] )/ ( df["# Male"] + df["# Female"])

#%% Load data
print("Reading files from '%s'"%BASE_FOLDER)
authors = pd.read_csv(BASE_FOLDER + 'authors.csv', low_memory=False, encoding="cp1252").drop_duplicates(subset="aID").set_index("aID").sort_index()
papers = pd.read_csv(BASE_FOLDER + 'papers.csv', low_memory=False, encoding="cp1252").drop_duplicates(subset="pID").set_index("pID").sort_index()
journals = pd.read_csv(BASE_FOLDER + 'journals_divided.csv', low_memory=False, encoding="cp1252")
editors = pd.read_csv(BASE_FOLDER + 'editors.csv', low_memory=False, encoding="cp1252")
surveys = pd.read_csv(BASE_FOLDER + 'surveys.csv', low_memory=False, encoding="cp1252")
west_all_fields = pd.read_csv(BASE_FOLDER + 'west_allfields.csv', low_memory=False, encoding="cp1252")
west_humanities = pd.read_csv(BASE_FOLDER + 'west_humanities.csv', low_memory=False, encoding="cp1252")


#%% Merge data
print("Processing Data...")


#Decode Journal Type
journals.loc[journals.Top == "Y","Type"] = "Top-Philosophy"
journals.loc[journals["Non Top"] == "Y","Type"] = "Non-Top Philosophy"
journals.loc[journals["Interdisciplinary"] == "Y","Type"] = "Interdisciplinary"
journals.drop(columns=["Top", "Non Top", "Interdisciplinary", "In Old Paper"], inplace=True)

#Rename review types
journals.loc[journals["Review type"] == "Not Blind","Review type"] = "Non-Anonymous"
journals.loc[journals["Review type"] == "Double Blind","Review type"] = "Double Anonymous"
journals.loc[journals["Review type"] == "Triple Blind","Review type"] = "Triple Anonymous"

#Pull out Gender between rows
authors = authors[authors.gender < 10] # Drop Unknown Gender
authors["# Male"]= authors.gender == 0
authors["# Female"]= authors.gender == 1
authors["Number Authors"] = 1

#Make author names all upper case
authors.firstname=authors.firstname.str.upper()
authors.lastname=authors.lastname.str.upper()

#Drop unused columns & years
authors.drop(columns=["gender", "sequence"], inplace=True)
papers.drop(columns=["cluster", "jID","EF", "firstauthor"], inplace=True)
papers = papers[ (papers.year >= 1900) & (papers.year < 2010)]

#Create a decades variable
papers["Decade"] = (papers.year//10)*10
west_all_fields["Decade"] = (west_all_fields.year//10)*10

#Build full authors table
authors_by_pid = authors.groupby("pID").sum()
authors = authors.merge(papers, left_on="pID", right_index=True)
authors = authors.merge(journals, left_on="jtitle", right_on="jtitle")

#build full papers table
papers = papers.merge(authors_by_pid, left_index=True, right_index=True)
papers = papers.merge(journals, left_on="jtitle", right_on="jtitle")
papers[["# Male","# Female","Number Authors"]] = papers[["# Male","# Female","Number Authors"]].astype('int64')

#Create commonly used summary tables
journals_byYear = papers.groupby(["jtitle","year","Decade","Type","AOS","Review type"], as_index=False).sum()
journals_byDecade = papers.groupby(["jtitle","Decade","Type","AOS","Review type"], as_index=False).sum().drop(columns=["year"])
journals_all = papers.groupby(["jtitle","Type","AOS","Review type"], as_index=False).sum().drop(columns=["year","Decade"])
type_byDecade = papers.groupby(["Decade","Type"], as_index=False).sum().drop(columns=["year"])

#Calculate Proportion Female for each Group
calc(journals_byYear, journals_all, journals_byDecade, type_byDecade)

#%% Save Summary file
print("Saving Data...")
if not os.path.exists(IMAGES_FOLDER):
    os.makedirs(IMAGES_FOLDER)
if not os.path.exists(STATS_FOLDER):
    os.makedirs(STATS_FOLDER)
if not os.path.exists(SUMMARIES_FOLDER):
    os.makedirs(SUMMARIES_FOLDER)
authors.to_csv(SUMMARIES_FOLDER + "authors_all.csv")
papers.to_csv(SUMMARIES_FOLDER + "papers_all.csv")
journals_all.to_csv(SUMMARIES_FOLDER + "journal_all_1900to2010.csv")
journals_byYear.to_csv(SUMMARIES_FOLDER + "journal_by_year.csv")
journals_byDecade.to_csv(SUMMARIES_FOLDER + "journal_by_decade.csv")
type_byDecade.to_csv(SUMMARIES_FOLDER + "type_by_decade.csv")


#%% Look at unique authorship
unique_authors = authors.groupby(["firstname","lastname"]).sum()
unique_authors_m = unique_authors[unique_authors["# Male"] > 0]
unique_authors_f = unique_authors[unique_authors["# Female"] > 0]


#%% Generate Graphs
print("Saving Graphs...")
sns.set(rc={'figure.figsize':(9,5)}, font="Calibri")
sns.set_style("whitegrid", {'axes.grid' : False})
sns.set_palette(sns.color_palette(["#2a70a2","#8abfdb","#c1daf0", "#701070"]))

#%% Function To Print Sample Size
def print_sample_size(ax, df_sample_size, order, hue_order=None, offset=0.27, x_adjust=0.0, y_adjust=1.07):
    ax.text(-0.55, y_adjust, "n=", horizontalalignment='center', size='small', fontweight="bold")

    if hue_order:
        for c, g1 in enumerate(order):
            x = c - offset + x_adjust
            for g2 in hue_order:
                s = df_sample_size[(g1, g2)] if (g1, g2) in df_sample_size else ""
                ax.text(x, y_adjust, s, horizontalalignment='center', size='small')
                x += offset
    else:
        for x, g1 in enumerate(order):
                s = df_sample_size[g1] if g1 in df_sample_size else ""
                ax.text(x, y_adjust, s, horizontalalignment='center', size='small')

#%% Plot Journal performance across time and in 2000s
dfs = [     journals_byDecade[(journals_byDecade.Type !=  "Interdisciplinary") & (journals_byDecade.Decade == 2000)].nlargest(10, "Proportion Female").sort_values(
            by=['Proportion Female'], ascending=False),
        journals_byDecade[(journals_byDecade.Type !=  "Interdisciplinary") & (journals_byDecade.Decade == 2000)].nsmallest(10, "Proportion Female").sort_values(
            by=['Proportion Female'], ascending=False)
      ]

fig,axs = plt.subplots(2, sharex="all")
fig.set_size_inches(9,8)


for i in range(0,2):

    # Hack to trim journal name for graph
    dfs[i].loc[dfs[i].jtitle == "Erkenntnis (1975-)", "jtitle"] = "Erkenntnis"
    dfs[i].loc[dfs[i].jtitle == "Apeiron: A Journal for Ancient Philosophy and Science", "jtitle"] = "Apeiron"

    ax = axs[i]
    sns.barplot(ax=ax, x="Proportion Female", y="jtitle", hue="Type", dodge=False,  data=dfs[i], hue_order=TYPE_ORDER[0:2])

plt.gcf().text(0, .5, "Journals", fontsize=12, rotation=90)
axs[0].set_title('Highest 10 Philosophy Journals (2000s)', fontsize=13, fontweight="bold")
axs[0].set(xlabel=None, ylabel=None)
axs[1].set_title('Lowest 10 Philosophy Journals (2000s)', fontsize=13, fontweight="bold")
axs[1].set(ylabel=None, xlabel="Proportion of Women Authorships", xlim=(0,1))
axs[0].legend(loc='upper center', bbox_to_anchor=(0.6, 1.25), fancybox=False, shadow=False, frameon=False, ncol=3)
axs[1].get_legend().remove()


fig.align_ylabels(axs[:])
plt.subplots_adjust(wspace=0, hspace=0)
fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 1 - Phil Journal Comparison.tif")
plt.show()


#%% Plot Journal performance across time and in 2000s
dfs = [ journals_all.nlargest(10, "Proportion Female").sort_values(by=['Proportion Female'], ascending=False),
        journals_all.nsmallest(10, "Proportion Female").sort_values(by=['Proportion Female'], ascending=False),
        journals_byDecade[journals_byDecade.Decade==2000].nlargest(10, "Proportion Female").sort_values(by=['Proportion Female'], ascending=False),
        journals_byDecade[journals_byDecade.Decade==2000].nsmallest(10, "Proportion Female").sort_values(by=['Proportion Female'], ascending=False),
      ]

fig,axs = plt.subplots(4, sharex="all")
fig.set_size_inches(9,10)


for i in range(0,4):

    # Hack to trim journal name for graph
    dfs[i].loc[dfs[i].jtitle == "Erkenntnis (1975-)", "jtitle"] = "Erkenntnis"

    ax = axs[i]
    sns.barplot(ax=ax, x="Proportion Female", y="jtitle", hue="Type", dodge=False,  data=dfs[i], hue_order=TYPE_ORDER)
    ax.set(xlabel=None, ylabel="Highest 10 Journals  \n(1900-2009)")

    ax.get_legend().remove()

    #Add number of damples to thr tight hand side
    ax_right = ax.twinx()
    ax_right.yaxis.set_ticks(np.arange(0.5, 10, 1))
    ax_right.set_ylim(0,10)
    ax_right.set_yticklabels(dfs[i]['Number Authors'])
    ax.tick_params(axis='both', which='both', length=0)
    ax_right.tick_params(axis='both', which='both', length=0)
axs[0].set(ylabel="Highest 10 Journals \n(1900 - 2009)", xlabel=None)
axs[1].set(ylabel="Lowest 10 Journals \n(1900 - 2009)", xlabel=None)
axs[2].set(ylabel="Highest 10 Journals \n(2000s)", xlabel=None)
axs[3].set(xlabel="Proportion of Women Authorships", ylabel="Lowest 10 Journals \n(2000s)", xlim=(0,1))
axs[0].legend(loc='upper center', bbox_to_anchor=(0.37, 1.20), fancybox=False, shadow=False, frameon=False, ncol=3)


#Add dividing line
line = Line2D([50, 2590], [1513, 1513], color='black',)
fig.lines.append(line)

plt.gcf().text(0.945, .963, "n =", fontsize=13, fontweight="bold")
fig.align_ylabels(axs[:])
plt.subplots_adjust(wspace=0, hspace=0)
fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 2 - Outliers.tif")
plt.show()


#%% BoxPlot of total proportion women 1950-2010 by type
df = journals_byYear.groupby(["Type", "jtitle","year"], as_index=False).sum().copy()
calc(df)
ax = sns.boxplot(x="Type", y="Proportion Female",   data=df, order=TYPE_ORDER, notch= True, medianprops={"color":"cyan", "lw":4})
ax.set(xlim=(-0.5,3.2), xlabel=None,  ylabel='Proportion of Women Authorships')

df_sample_size = journals_byYear.groupby(["Type"]).size()
print_sample_size(ax, df_sample_size, order=TYPE_ORDER)

fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 3 - Category Distributions.tif")
plt.show()


#%% Plot Types by Decade (Line plot)
df_phil = journals_byYear.groupby(["Decade","Type"], as_index=False).sum().copy()
calc(df_phil)

fig,axs = plt.subplots(2, sharex="all", gridspec_kw={'height_ratios': [1, 3]})
sns.lineplot(ax=axs[0], x="Decade", y="Number Authors", hue="Type", data=df_phil, hue_order=TYPE_ORDER, ci=None, markers=True, dashes=True, style="Type", lw=3)
sns.lineplot(ax=axs[1], x="Decade", y="Proportion Female", hue="Type", data=df_phil, hue_order=TYPE_ORDER, ci=None, markers=True, dashes=True, style="Type", lw=3)

handles, labels = axs[0].get_legend_handles_labels()
axs[0].legend(handles=handles[1:], labels=labels[1:], loc='upper center', bbox_to_anchor=(0.5, 1.35), fancybox=False, shadow=False, frameon=False, ncol=4)
axs[1].get_legend().remove()
axs[0].set(ylim=(0,5800), yticks=[0,2500,5000], ylabel="Number of \n Authorships", xlabel=None)
axs[1].set(xlim=(1900,2000), xticks=df_phil.Decade.unique(), ylabel="Proportion of \nWomen Authorships")
fig.align_ylabels(axs[:])
plt.subplots_adjust(wspace=0, hspace=0)
fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 4 - Lineplot by Decade.tif")
plt.show()


#%% Plot Types by Decade 1950-2000s (BoxPlot)
df = journals_byYear[ (journals_byYear.Decade >= 1950)].copy()
calc(df)
ax = sns.boxplot(x="Decade", y="Proportion Female", hue="Type", data=df, hue_order=TYPE_ORDER, notch= True, medianprops={"color":"cyan", "lw":4})
ax.legend( loc='upper center', bbox_to_anchor=(0.5, 1.17), fancybox=False, shadow=False, frameon=False, ncol=3)
ax.set(ylabel="Proportion of Women Authorships")

df_sample_size = journals_byYear.groupby(["Decade","Type"]).size()
print_sample_size(ax, df_sample_size, order=range(1950,2009,10), hue_order=TYPE_ORDER)

fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 5 - Boxplot By Decade.tif")
plt.show()


#%% BoxPlot of total proportion women 2000s by type
df = journals_byYear[ (journals_byYear.Decade == 2000)].groupby(["Type", "jtitle","year"], as_index=False).sum().copy()
calc(df)
ax = sns.boxplot(x="Type", y="Proportion Female",   data=df, order=TYPE_ORDER, notch= True, medianprops={"color":"cyan", "lw":4})
ax.set(xlabel=None,  ylabel='Proportion of Women Authorships')
ax.axhline(y=0.2201996477, xmin=0.01, xmax=0.99, color="#701070",  linestyle="--", lw=4) # All Program

handles, _ = ax.get_legend_handles_labels()
handles.append(Line2D([0], [0], color="#701070", linestyle="--", label='Proportion of Women Faculty at 98 Programs in 2010'))
ax.legend(handles=handles, loc='upper center', bbox_to_anchor=(0.5, 1.15), fancybox=False, shadow=False, frameon=False, ncol=4)

df_sample_size = journals_byYear[ journals_byYear.Decade == 2000 ].groupby("Type").size()
print_sample_size(ax, df_sample_size, order=TYPE_ORDER)

fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 7 - BoxPlot 2000s vs Faculty 2010.tif")
plt.show()


#%% Boxplot AOS (2000s)
df = journals_byYear[ journals_byYear.Decade == 2000 ].groupby(["Type", "jtitle", "year", "AOS"], as_index=False).sum().copy()
df = df[df.Type != "Interdisciplinary"]
calc(df)

types = ["Top-Philosophy","Non-Top Philosophy"]

ax = sns.boxplot(x="AOS", y="Proportion Female", hue="Type", data=df, order=AOS_ORDER, hue_order=types, notch= True, medianprops={"color":"cyan", "lw":4})
ax.axhline(y=0.245, xmin=0.01, xmax=0.19, color="#b319b3", linestyle="--", lw=4) # Faculty G
ax.axhline(y=0.297, xmin=0.21, xmax=0.39, color="#b319b3", linestyle="--", lw=4) # Faculty H
ax.axhline(y=0.196, xmin=0.41, xmax=0.59, color="#b319b3", linestyle="--", lw=4) # Faculty LEMM
ax.axhline(y=0.156, xmin=0.61, xmax=0.79, color="#b319b3", linestyle="--", lw=4) # Faculty S
ax.axhline(y=0.338, xmin=0.81, xmax=0.99, color="#b319b3", linestyle="--", lw=4) # Faculty V
ax.set(ylabel='Proportion of Women Authorships', xlabel="Area of Specialization")

handles, _ = ax.get_legend_handles_labels()
handles.append(Line2D([0], [0], color="#b319b3", linestyle="--",  label='Faculty (2014)'))
ax.legend(handles=handles, loc='upper center', bbox_to_anchor=(0.5, 1.15), fancybox=False, shadow=False, frameon=False, ncol=4)

df_sample_size = journals_byYear[ journals_byYear.Decade == 2000 ].groupby(["AOS","Type"]).size()
print_sample_size(ax, df_sample_size, order=AOS_ORDER, hue_order=types, offset=0.35, x_adjust=0.2)

fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 8 - Boxplot 2000s AOS vs Faculty 2014 AOS.tif")
plt.show()


#%% Boxplot review type by Journal 2000s
df = journals_byYear[ journals_byYear.Decade == 2000 ].groupby(["Type", "jtitle", "year", "Review type"], as_index=False).sum().copy()
calc(df)
ax = sns.boxplot(x="Review type", y="Proportion Female", hue="Type",   data=df, order=REVIEW_ORDER, hue_order=TYPE_ORDER, notch= True, medianprops={"color":"cyan", "lw":4})
ax.set(ylabel='Proportion of Women Authorships', xlabel="Review Type")
ax.legend( loc='upper center', bbox_to_anchor=(0.5, 1.15), fancybox=False, shadow=False, frameon=False, ncol=4)

df_sample_size = journals_byYear[ journals_byYear.Decade == 2000 ].groupby(["Review type","Type"]).size()
print_sample_size(ax, df_sample_size, order=REVIEW_ORDER, hue_order=TYPE_ORDER)

fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 10 - Boxplot 2000s Review Type.tif")
plt.show()


#%%% Setup stats func
def stats_uni(data, group1, filename, group2="Type", norm_value="Proportion Female", nb_value="# Female", useOffset=True ):


    def slugify(text, delim=''):
        _punctuation_re = re.compile(r'[\t !"#$%&\'()*\-/<=>?@\[\\\]^_`{|},.]+')
        result = []
        for word in _punctuation_re.split(text.lower()):
            word = normalize('NFKD', word) \
                .encode('ascii', 'ignore') \
                .decode('utf-8')
            if word:
                result.append(word)

        return delim.join(result)

    def lrtest(llmin, llmax):
        stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

        lr = 2 * (llmax - llmin)
        p = stats.chisqprob(lr, 1)  # llmax has 1 dof more than llmin
        return lr, p

    NB_V = slugify(nb_value)
    NORM_V = slugify(norm_value)
    G1 = slugify(group1)
    G2 = slugify(group2)

    totalResults = pd.DataFrame()
    for jType in TYPE_ORDER:
        file = open(f"{STATS_FOLDER}{filename}_for_{jType}.txt", 'w')
        df = data[data.Type == jType].copy()
        calc(df)

        #Relable for model friendly variable names
        df.rename(columns={norm_value: NORM_V, nb_value:NB_V, group1: G1}, inplace=True)

        keys = pd.DataFrame(data={G1:df[G1].unique()} )
        offset = np.log(df["Number Authors"]) if useOffset else None

        #Print quick stats summary
        print(f"{filename} :: Grouped by G1='{group1}' \n\n", file=file)
        sum = rp.summary_cont(df[[NORM_V, G1]].groupby([G1]))
        print(sum, end="\n\n", file=file)

        #RUN GLM using a negative binomial distribution with log(# authors) as offset
        model_glm_nb = glm(f" {NB_V} ~ C({G1})", data=df, family=sm.families.NegativeBinomial(), offset=offset).fit()
        print("Note: NB model coeficients need to be exponentiated before use",  file=file)
        print(model_glm_nb.summary(), end="\n\n", file=file)
        print(model_glm_nb.t_test_pairwise(f"C({G1})").result_frame, end="\n\n", file=file)
        results =  model_glm_nb.get_prediction(keys).summary_frame().merge(keys, left_index=True, right_index=True)
        results["Type"] = jType
        results["Model"] = "GLM_NB"
        totalResults = totalResults.append(results)


        # RUN GLM using a Gaussian distribution
        model_glm_norm = glm(f" {NORM_V} ~ C({G1})", data=df).fit()
        print(model_glm_norm.summary(), end="\n\n", file=file)
        print(model_glm_norm.t_test_pairwise(f"C({G1})").result_frame, end="\n\n", file=file)
        results = model_glm_norm.get_prediction(keys).summary_frame().merge(keys, left_index=True, right_index=True)
        results["Type"] = jType
        results["Model"] = "GLM_NORM"
        totalResults = totalResults.append(results)

        # RUN OLS using a Gaussian distribution
        model_ols_norm = ols(f" {NORM_V} ~ C({G1})", data=df).fit()
        print(model_ols_norm.summary(), end="\n\n", file=file)
        print(model_ols_norm.t_test_pairwise(f"C({G1})").result_frame, end="\n\n", file=file)
        results = model_ols_norm.get_prediction(keys).summary_frame().merge(keys, left_index=True, right_index=True)
        results["Type"] = jType
        results["Model"] = "OLS_NORM"
        totalResults = totalResults.append(results)

        # RUN basic Anova Tests
        print("\n--[Anova Table]--:", file=file)
        aov = sm.stats.anova_lm(model_ols_norm, typ=3)

        aov['mean_sq'] = aov[:]['sum_sq'] / aov[:]['df']
        aov['eta_sq (R2)'] = aov[:-1]['sum_sq'] / aov['sum_sq'].sum()
        aov['omega_sq'] = (aov[:-1]['sum_sq'] - (aov[:-1]['df'] * aov['mean_sq'][-1])) / (aov['sum_sq'].sum() + aov['mean_sq'][-1])
        print(aov, file=file)

        w, pvalue = stats.shapiro(model_ols_norm.resid)
        print("\n--[Shapiro-Wilk]--\n w=%f p=%f" % (w, pvalue), file=file)
        file.close()

    # RUN GLM using without Stratification
    file = open(f"{STATS_FOLDER}{filename}_multi.txt", 'w')
    df = data.copy()
    calc(df)


    #Relable for model friendly variable names
    df.rename(columns={norm_value: NORM_V, nb_value:NB_V, group1: G1, group2: G2}, inplace=True)

    offset = np.log(df["Number Authors"]) if useOffset else None

    #Print quick stats summary
    model_glm_mixed = glm(f" {NB_V} ~ C({G1}) + C({G2})", data=df, family=sm.families.NegativeBinomial(), offset=offset).fit()
    model_glm_g1 = glm(f" {NB_V} ~ C({G1})", data=df, family=sm.families.NegativeBinomial(), offset=offset).fit()
    model_glm_g2 = glm(f" {NB_V} ~ C({G2})", data=df, family=sm.families.NegativeBinomial(), offset=offset).fit()

    print("Note: NB model coeficients need to be exponentiated before use",  file=file)

    print(f"{filename} :: Grouped by '{group1}' * '{group2}' \n\n", file=file)
    print(model_glm_mixed.summary(), end="\n\n", file=file)

    print(f"{filename} :: Grouped by '{group1}' \n\n", file=file)
    print(model_glm_g1.summary(), end="\n\n", file=file)
    lr, p = lrtest(model_glm_g1.llf, model_glm_mixed.llf)
    print('LR test, p value: {:.2f}, {:.4f}'.format(lr, p), end="\n\n", file=file)

    print(f"{filename} :: Grouped by '{group2}' \n\n", file=file)
    print(model_glm_g2.summary(), end="\n\n", file=file)
    lr, p = lrtest(model_glm_g2.llf, model_glm_mixed.llf)
    print('LR test, p value: {:.2f}, {:.4f}'.format(lr, p), end="\n\n", file=file)

    totalResults.to_csv(f"{STATS_FOLDER}{filename}_Estimates.csv")
    return totalResults


#%%% Run Stats on Journal x Year Pairs
df = journals_byYear[  (journals_byYear.Decade >= 1950)].copy()
model_proportion_female = stats_uni(df, "Decade", "Journals_by_Decade")
model_num_female = stats_uni(df, "Decade", "Women_by_Decade", norm_value="# Female", nb_value="# Female", useOffset=False)
model_num_male = stats_uni(df, "Decade", "Men_by_Decade", norm_value="# Male", nb_value="# Male", useOffset=False)


#%%% Run Stats on Paper x Year Pairs
df = papers[(papers.Decade == 2000)].copy()
model_aos = stats_uni(df, "AOS", "Papers_by_AOS")
model_review = stats_uni(df, "Review type", "Papers_by_Review_type")

#%%% Graphs of GLM Model Proportion Female
model = "GLM_NB"
fig, ax = plt.subplots()

df = model_proportion_female[model_proportion_female.Model == model].sort_values(by=['decade'])
sns.lineplot(ax=ax, x="decade", y="mean", hue="Type", data=df, hue_order=TYPE_ORDER, markers=True, dashes=True, style="Type", lw=3)
ax.set(xlim=(1950, 2000), ylim=(0, 0.30), ylabel="Proportion of Women Authorships (Estimated)", xlabel="Decade")

handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles[1:], labels=labels[1:], loc='upper center', bbox_to_anchor=(0.5, 1.1), fancybox=False,
          shadow=False, frameon=False, ncol=3)

for jType in TYPE_ORDER:
    series = df.loc[df.Type == jType]
    ax.fill_between(series["decade"], series["mean_ci_lower"], series["mean_ci_upper"], alpha=.2)
fig.tight_layout()
plt.savefig(f"{IMAGES_FOLDER}Fig 6 - {model} Proportion By Decade.tif")
plt.show()


#%%% Graphs of GLM Model Number of Male
model = "GLM_NB"
fig, axs = plt.subplots(2, sharex="all", gridspec_kw={'height_ratios': [1, 3]})
fig.set_size_inches(9,7)

df_female = model_num_female[model_num_female.Model == model].sort_values(by=['decade'])
df_male = model_num_male[model_num_male.Model == model].sort_values(by=['decade'])

sns.lineplot(ax=axs[0], x="decade", y="mean", hue="Type", data=df_female, hue_order=TYPE_ORDER, markers=True, dashes=True, style="Type", lw=3)
sns.lineplot(ax=axs[1], x="decade", y="mean", hue="Type", data=df_male, hue_order=TYPE_ORDER, markers=True, dashes=True,  style="Type", lw=3)

for jType in TYPE_ORDER:
    series_f = df_female.loc[df_female.Type == jType]
    series_m = df_male.loc[df_male.Type == jType]
    axs[0].fill_between(series_f["decade"], series_f["mean_ci_lower"], series_f["mean_ci_upper"], alpha=.2)
    axs[1].fill_between(series_m["decade"], series_m["mean_ci_lower"], series_m["mean_ci_upper"], alpha=.2)

handles, labels = axs[0].get_legend_handles_labels()
axs[0].legend(handles=handles[1:], labels=labels[1:], loc='upper center', bbox_to_anchor=(0.5, 1.25),
              fancybox=False, shadow=False, frameon=False, ncol=3)

axs[1].get_legend().remove()
axs[0].set(ylim=(0, 10), yticks=[0,5,10], ylabel="Estimated Average\n Number of Women")
axs[1].set(ylim=(0, 30), xlim=(1950, 2000), ylabel="Estimated Average\n Number of Men", xlabel="Decade")
fig.align_ylabels(axs[:])

plt.subplots_adjust(wspace=0, hspace=0)
fig.tight_layout()
plt.savefig(f"{IMAGES_FOLDER}Fig 13 - {model} Number Women vs Men.tif")
plt.show()


#%%% Graphs of GLM Model for AOS
model = "GLM_NB"
fig, ax = plt.subplots()
df = model_aos[ (model_aos.Model == model) & (model_aos.Type != "Interdisciplinary")].sort_values(by=["Type"], ascending=False).sort_values(by=['aos'])


df['aos'] = df['aos'].replace( to_replace= {'G': 'General\n Specialization (G)', 'H': 'Historical\n Philosophy (H)', 'LEMM': 'Language,\n Epistemology,\n Metaphysics,\n and Mind (LEMM)', 'S': 'Logic and \nPhilosophy of \nScience (S)', 'V':'Value Theory (V)'})

types = ["Top-Philosophy","Non-Top Philosophy"]

sns.barplot(ax=ax, x="aos", y="mean", hue="Type", data=df,  hue_order=types)
ax.set( ylabel="Proportion of Women Authorships (Estimated)", xlabel="", ylim=(0,0.52))

ax.axhline(y=0.245, xmin=0.05, xmax=0.19, color="#b319b3", linestyle="--", lw=4) # Faculty G
ax.axhline(y=0.297, xmin=0.25, xmax=0.39, color="#b319b3", linestyle="--", lw=4) # Faculty H
ax.axhline(y=0.196, xmin=0.42, xmax=0.58, color="#b319b3", linestyle="--", lw=4) # Faculty LEMM
ax.axhline(y=0.156, xmin=0.61, xmax=0.75, color="#b319b3", linestyle="--", lw=4) # Faculty S
ax.axhline(y=0.338, xmin=0.81, xmax=0.95, color="#b319b3", linestyle="--", lw=4) # Faculty V

handles, _ = ax.get_legend_handles_labels()
handles.append(Line2D([0], [0], color="#b319b3", linestyle="--",  label='Faculty (2014)'))
ax.legend(handles=handles, loc='upper center', bbox_to_anchor=(0.5, 1.15), fancybox=False,
          shadow=False, frameon=False, ncol=3)


#Hack to get error bars to print right on dodge bar plot
yerr = [df['mean'] - df['mean_ci_lower'], df['mean_ci_upper'] - df['mean']]
print("Warning!! using a workaround to generate CI on the bar plot, please double check numbers")
ax.errorbar([-0.2,0.2,1.2,1.8,2.2,2.8,3.8,4.2], df["mean"], yerr=yerr, fmt='none',  c= 'black',capsize=5)

df_sample_size = papers[(papers.Decade == 2000)].groupby(["AOS","Type"]).size()
print_sample_size(ax, df_sample_size, order=AOS_ORDER, hue_order=types, y_adjust=0.53, offset=0.35, x_adjust=0.2)

fig.tight_layout()
plt.savefig(f"{IMAGES_FOLDER}Fig 9 - {model} 2000s vs AOS Faculty 2014.tif")
plt.show()

#%%% Graphs of GLM Model for Review Type
model = "GLM_NB"
fig, ax = plt.subplots()
df = model_review[ (model_review.Model == model) ].copy()

df.loc[df.reviewtype == "Non-Anonymous", "sort"] = 1
df.loc[df.reviewtype == "Double Anonymous", "sort"] = 2
df.loc[df.reviewtype == "Triple Anonymous", "sort"] = 3
df = df.sort_values(by=["Type"], ascending=False).sort_values(by=["sort"])

sns.barplot(ax=ax, x="reviewtype", y="mean", hue="Type", data=df,  hue_order=TYPE_ORDER)
ax.set( ylabel="Proportion of Women Authorships (Estimated)", xlabel="Review Type")
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), fancybox=False, shadow=False, frameon=False, ncol=3)

#Hack to get error bars to print right on dodge bar plot
yerr = [df['mean'] - df['mean_ci_lower'], df['mean_ci_upper'] - df['mean']]
print("Warning!! using a workaround to generate CI on the bar plot, please double check numbers")
ax.errorbar([-0.27,0,0.27,0.73,1,1.27,1.73,2.27], df["mean"], yerr=yerr, fmt='none',  c= 'black',capsize=5)

df_sample_size = papers[(papers.Decade == 2000)].groupby(["Review type","Type"]).size()
print_sample_size(ax, df_sample_size, order=REVIEW_ORDER, hue_order=TYPE_ORDER, y_adjust=0.53)

fig.tight_layout()
plt.savefig(f"{IMAGES_FOLDER}Fig 11 - {model} 2000s Review Type.tif")
plt.show()


#%% Plot All Humanities vs Phil
df_west_humanities = west_humanities.copy()
df_west_all = west_all_fields.groupby(["Decade"], as_index=False).sum()
df_phil_combined = journals_byYear.groupby(["Decade"], as_index=False).sum().copy()
calc( df_phil_combined, df_west_all)

fig,ax = plt.subplots()

ax.plot(df_phil_combined["Decade"], df_phil_combined["Proportion Female"], label="All Philosophy", color="blue", linestyle="-", marker="o")
ax.plot(df_west_all["Decade"], df_west_all["Proportion Female"], label="All Fields", color="green", linestyle=":", marker="s")
ax.plot(df_west_humanities["Decade"], df_west_humanities["Proportion Female"], label="Humanities", color="goldenrod", linestyle="--", marker="d")

ax.legend( loc='upper center', bbox_to_anchor=(0.5, 1.1), fancybox=False, shadow=False, frameon=False, ncol=3)
ax.set(xlim=(1900,2000), xticks=df_phil.Decade.unique(), ylabel="Proportion of Women Authorships")

fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 12 - Humanities vs Phil.tif")
plt.show()


#%% Plot CDF of # authors in each journal year paper

fig, axs = plt.subplots(2)
fig.set_size_inches(9,7)

axs[0].set( ylabel="Probability", xlabel="Number of Papers in (Journal,Year) Pairs", xlim=(0,120))
axs[0].hist(papers.groupby(["jtitle","year"], as_index=False).count()["title"], bins=112, range=(0,150), density=True, cumulative=True, histtype='step')


axs[1].set( ylabel="Probability", xlabel="Number of Papers in (Journal,Decade) Pairs", xlim=(0,600))
axs[1].hist(papers[  (papers.Decade >= 1950)].groupby(["jtitle","Decade"], as_index=False).count()["title"], bins=595, range=(0,650),  density=True, cumulative=True, histtype='step')

plt.savefig(IMAGES_FOLDER + "Fig 14 - CDF Papers in Year Pairs.tif")
fig.tight_layout()
plt.show()

#%% Plot histogram  of # authors in each journal year paper
fig, axs = plt.subplots(2)
fig.set_size_inches(9,7)

axs[0].set( ylabel="Count", xlabel="Number of Papers in (Journal,Year) Pairs  1900-2009")
axs[0].hist(papers.groupby(["jtitle","year"], as_index=False).count()["title"], bins=100, label="Papers in journal in a year", range=(0,112), histtype='step')

axs[1].set( ylabel="Count", xlabel="Number of Papers in (Journal,Decade) Pairs 1900-2009")
axs[1].hist(papers.groupby(["jtitle","Decade"], as_index=False).count()["title"], bins=100, label="Papers in journal in a year", range=(0,595), histtype='step')

fig.tight_layout()
plt.savefig(IMAGES_FOLDER + "Fig 14 (alt) - Hist Papers in Year Pairs.tif")
plt.show()