# notionparallax/gaps.py

## gaps.py
# %%
import textwrap
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


# %%
def random_from_distribution(a, b, number_of_sigma=3, as_int=True):
    """Draw one sample from a normal distribution fitted to the range a..b.

    The distribution is centred on the midpoint of ``a`` and ``b``, and its
    standard deviation is chosen so that ``a`` and ``b`` each lie
    ``number_of_sigma`` standard deviations from that centre.  The sample is
    not clipped, so values outside the range can be returned.

    Args:
        a: One end of the range.
        b: The other end of the range.
        number_of_sigma: Number of standard deviations between the midpoint
            and either end of the range.
        as_int: When true the sample is converted with ``int()``, which
            truncates toward zero rather than rounding.

    Returns:
        The sample: an ``int`` when ``as_int`` is true, otherwise the float
        produced by ``np.random.normal``.
    """
    midpoint = (a + b) / 2
    spread = abs(midpoint - a) / number_of_sigma
    sample = np.random.normal(midpoint, spread)
    return int(sample) if as_int else sample


# %% Balancing strategies
# Abbreviation and salary band (s_low, s_high) for every rank.  The bands are
# the same in every strategy, so they are written once here; the order of this
# table is the order in which ranks appear in each strategy.
# "Prinicipal" (sic) is kept as spelled: the key is the rank label that ends
# up in the generated data and on the plots.
# NOTE(review): the Prinicipal band has s_low == s_high, so
# random_from_distribution gets a zero standard deviation and every salary in
# that rank comes out as exactly 80 - confirm that this is intended.
_RANK_BANDS = {
    "Prinicipal": ("P", 80, 80),
    "Senior Practice Director": ("SPD", 150, 250),
    "Practice Director": ("PD", 130, 190),
    "Senior Associate": ("SA", 120, 140),
    "Interior Designer": ("ID", 80, 120),
    "Architect": ("Arch", 85, 120),
    "Architecture Grad": ("Grad", 75, 90),
    "Student": ("Student", 65, 80),
    "Other Streams": ("Other", 70, 170),
    "BIM Manager": ("BM", 110, 150),
    "Technical": ("T", 100, 200),
}

# (male, female) head-count per rank for the "real_BVN_numbers" strategy.
_REAL_HEADCOUNT = {
    "Prinicipal": (11, 6),
    "Senior Practice Director": (16, 7),
    "Practice Director": (16, 13),
    "Senior Associate": (21, 22),
    "Interior Designer": (0, 14),
    "Architect": (23, 16),
    "Architecture Grad": (22, 17),
    "Student": (2, 9),
    "Other Streams": (21, 33),
    "BIM Manager": (5, 1),
    "Technical": (8, 4),
}

# (male, female) head-count per rank with equal numbers of each sex.
_BALANCED_HEADCOUNT = {
    "Prinicipal": (8, 8),
    "Senior Practice Director": (11, 11),
    "Practice Director": (15, 15),
    "Senior Associate": (21, 21),
    "Interior Designer": (7, 7),
    "Architect": (20, 20),
    "Architecture Grad": (19, 19),
    "Student": (5, 5),
    "Other Streams": (26, 26),
    "BIM Manager": (3, 3),
    "Technical": (6, 6),
}


def _build_strategy(headcount):
    """Merge per-rank (male, female) head-counts with the shared salary bands.

    Returns a new dict, with new inner dicts, on every call so that the
    strategies never share mutable state.
    """
    strategy = {}
    for rank, (abbr, s_low, s_high) in _RANK_BANDS.items():
        males, females = headcount[rank]
        strategy[rank] = {
            "m": males,
            "f": females,
            "abbr": abbr,
            "s_low": s_low,
            "s_high": s_high,
        }
    return strategy


# Each strategy maps rank -> {"m", "f", "abbr", "s_low", "s_high"}.
# "male_students" and "fire_the_boys" are the real numbers with a single rank
# changed.
strategies = {
    "real_BVN_numbers": _build_strategy(_REAL_HEADCOUNT),
    "balanced_numbers": _build_strategy(_BALANCED_HEADCOUNT),
    "male_students": _build_strategy({**_REAL_HEADCOUNT, "Student": (16, 0)}),
    "fire_the_boys": _build_strategy(
        {**_REAL_HEADCOUNT, "Senior Practice Director": (10, 13)}
    ),
}
# The strategy used by the rest of the script.
people_meta = strategies["balanced_numbers"]


# %%
def make_people_data():
    """Expand the head-count summary in ``people_meta`` into one row per person.

    For every rank, ``data["m"]`` male rows are generated first and then
    ``data["f"]`` female rows.  Each row gets its own salary drawn with
    ``random_from_distribution(data["s_low"], data["s_high"])``.

    Returns:
        A ``pandas.DataFrame`` built from those rows, with the fields
        ``rank``, ``sex``, ``salary`` and ``abbr``.
    """
    rows = [
        {
            "rank": title,
            "sex": gender,
            "salary": random_from_distribution(band["s_low"], band["s_high"]),
            "abbr": band["abbr"],
        }
        for title, band in people_meta.items()
        for gender in ("m", "f")
        for _ in range(band[gender])
    ]
    return pd.DataFrame(rows)


people_df = make_people_data()


# %%
# Mean and median salary for every (rank, sex) pair.  The aggregations are
# named by string so pandas uses its own group-wise implementations; passing
# the NumPy callables instead triggers a FutureWarning on recent pandas.
mean_salary = people_df.groupby(["rank", "sex"]).agg(
    Mean=("salary", "mean"), Median=("salary", "median")
)
# Move "sex" from the row index into the columns: one row per rank, and
# two-level columns such as ("Mean", "m") and ("Median", "f").
summary_salary_df = mean_salary.unstack()
# Percentage by which the male figure exceeds the female figure within each
# rank (positive means men are paid more).
summary_salary_df["pc_diff_mean"] = (
    (summary_salary_df[("Mean", "m")] - summary_salary_df[("Mean", "f")])
    / summary_salary_df[("Mean", "f")]
) * 100
summary_salary_df["pc_diff_median"] = (
    (summary_salary_df[("Median", "m")] - summary_salary_df[("Median", "f")])
    / summary_salary_df[("Median", "f")]
) * 100
# %%
summary_salary_df


# %%
def mean_and_median_delta(df):
    """Return the overall male-versus-female pay gap as two percentages.

    Args:
        df: DataFrame with a ``sex`` column that contains both ``"m"`` and
            ``"f"``, and a numeric ``salary`` column.

    Returns:
        ``(pc_delta_mean, pc_delta_median)``: how far the male mean (and
        median) salary lies above the female one, as a percentage of the
        female figure.  Positive values mean men are paid more.
    """
    salary_by_sex = df.groupby("sex")["salary"]

    def percent_gap(per_sex):
        # Difference relative to the female figure, in percent.
        male, female = per_sex.loc["m"], per_sex.loc["f"]
        return ((male - female) / female) * 100

    return percent_gap(salary_by_sex.mean()), percent_gap(salary_by_sex.median())


# Overall gap for the single cohort generated above, reported to two decimals.
pc_delta_mean, pc_delta_median = mean_and_median_delta(people_df)
print(
    "\n"
    f"The mean salary gap for this cohort is   {round(pc_delta_mean, 2)}%.\n"
    f"The median salary gap for this cohort is {round(pc_delta_median, 2)}%."
)


# %%
def draw_v_plot(mean_or_median="mean"):
    """Draw a split violin plot of salary by rank and sex, annotated per rank.

    Reads the module-level ``people_df``, ``summary_salary_df``,
    ``pc_delta_mean`` and ``pc_delta_median``.  The figure is displayed with
    ``plt.show()``; nothing is returned.

    Args:
        mean_or_median: ``"mean"`` or ``"median"``.  Chooses which per-sex
            figure is printed under each rank and which colour palette is
            used.  Any other value prints a message and returns without
            drawing.
    """
    if mean_or_median == "mean":
        summary_column = "Mean"
        palette = "summer"
    elif mean_or_median == "median":
        summary_column = "Median"
        palette = "spring"
    else:
        print("we're only supporting [mean|median] at the moment, use one of those two")
        return

    # The theme only applies to axes created after it is set, so set it
    # before the figure is built.
    sns.set_theme(style="whitegrid")
    _, ax = plt.subplots(figsize=(16, 9))
    sns.violinplot(
        ax=ax,
        data=people_df,
        x="rank",
        y="salary",
        hue="sex",
        palette=palette,
        split=True,
    )
    ax.set(
        xlabel="",
        ylabel="Salary ($)",
        title=(
            "Salary distribution violin plot, grouped by title and sex\n"
            "Overall gender pay gap for this group is "
            f"{round(pc_delta_mean, 2)}% (mean) and {round(pc_delta_median, 2)}% (median).\n"
            f"Using {summary_column} as summary method"
        ),
        ylim=(0, 275),
    )

    wrap_point = 12
    # Rank names in axis order, read once while the labels are still
    # unwrapped; the position in this list is the x coordinate of the rank.
    tick_names = [tick.get_text() for tick in ax.get_xticklabels()]
    for name, d in summary_salary_df.iterrows():
        mx = round(d[summary_column]["m"], 1)
        fx = round(d[summary_column]["f"], 1)
        # Selecting a gap column from the row yields a one-element Series;
        # take its value by position.
        delta_pc_mean = round(d["pc_diff_mean"].iloc[0], 1)
        delta_pc_median = round(d["pc_diff_median"].iloc[0], 1)
        rank_name = textwrap.fill(name, wrap_point)

        annotation = "\n".join(
            [
                rank_name,
                f"mX̄: ${mx}",
                f"fX̄: ${fx}",
                f"X̄∆: {delta_pc_mean}%",
                f"x͂∆: {delta_pc_median}%",
            ]
        )
        # y=5 places the text just above the bottom of the 0-275 salary axis.
        ax.text(tick_names.index(name), 5, annotation, fontsize=10, ha="left")

    ax.set_xticklabels([textwrap.fill(label, wrap_point) for label in tick_names])
    sns.despine()
    plt.tight_layout()
    plt.show()


# Render the violin plot once per summary method, one notebook cell each.
draw_v_plot("mean")
# %%
draw_v_plot("median")
# %%
# TODO: filter out rank|sex that have fewer than N members. E.g. don't show the salary of the one female bim manager
# TODO: Move the summary text over to the left. I'm not sure if it can be aligned on the colon
# TODO: align the colours to the brand colours
# ✔ TODO: show overall sex pay gap
# TODO: show mean and median on the same hist


# %%
def gap_distribution(runs=10000):
    """Simulate the cohort ``runs`` times and collect each run's pay gap.

    Every run builds a fresh data set with ``make_people_data()`` and
    measures it with ``mean_and_median_delta()``.

    Returns:
        Two lists of length ``runs``, in run order: the mean-based gap
        percentages and the median-based gap percentages.
    """
    per_run = [mean_and_median_delta(make_people_data()) for _ in range(runs)]
    gap_list_mean = [mean_gap for mean_gap, _ in per_run]
    gap_list_median = [median_gap for _, median_gap in per_run]
    return gap_list_mean, gap_list_median


def draw_histogram(gap_list_mean, gap_list_median, sim_run_count=10000):
    """Overlay step histograms of the simulated mean and median pay gaps.

    Args:
        gap_list_mean: Mean-based gap percentages, one per simulation run.
        gap_list_median: Median-based gap percentages, one per simulation run.
        sim_run_count: Number of runs quoted in the title.  It is used for
            the title text only and is not checked against the list lengths.
    """
    _, ax = plt.subplots(figsize=(16, 9))
    # Label each series so the legend distinguishes the two outlines.
    ax.hist(gap_list_mean, bins=100, histtype="step", label="Mean gap")
    ax.hist(gap_list_median, bins=100, histtype="step", label="Median gap")
    # Alternative view: density curves via
    # pd.Series(gap_list_mean).plot(kind='density') and the same for the median.
    ax.set(
        xlabel="% pay gap, +ve favours men",
        ylabel="Number of sim runs with this %",
        title=f"Distribution of gender pay gap over {sim_run_count} simulations",
    )
    ax.legend()


# %%
gap_list_mean, gap_list_median = gap_distribution(runs=10000)
# %%
# Quote the number of runs actually simulated instead of repeating the
# literal, so the title cannot drift from the data.
draw_histogram(gap_list_mean, gap_list_median, sim_run_count=len(gap_list_mean))

# %%
	# %%
	import textwrap
	import matplotlib.pyplot as plt
	import seaborn as sns
	import pandas as pd
	import numpy as np


	# %%
	def random_from_distribution(a, b, number_of_sigma=3, as_int=True):
	    """Draw one sample from a normal distribution fitted to the range a..b.

	    The distribution is centred on the midpoint of ``a`` and ``b``, and its
	    standard deviation is chosen so that ``a`` and ``b`` each lie
	    ``number_of_sigma`` standard deviations from that centre.  The sample
	    is not clipped, so values outside the range can be returned.

	    Args:
	        a: One end of the range.
	        b: The other end of the range.
	        number_of_sigma: Number of standard deviations between the
	            midpoint and either end of the range.
	        as_int: When true the sample is converted with ``int()``, which
	            truncates toward zero rather than rounding.

	    Returns:
	        The sample: an ``int`` when ``as_int`` is true, otherwise the
	        float produced by ``np.random.normal``.
	    """
	    # calculate mean and standard deviation
	    mu = (a + b) / 2
	    sigma = abs(mu - a) / number_of_sigma

	    # generate a random number from the normal distribution
	    random_number = np.random.normal(mu, sigma)
	    if as_int:
	        return int(random_number)
	    return random_number


	# %% Balancing strategies
	# Abbreviation and salary band (s_low, s_high) for every rank.  The bands
	# are the same in every strategy, so they are written once here; the order
	# of this table is the order in which ranks appear in each strategy.
	# "Prinicipal" (sic) is kept as spelled: the key is the rank label that
	# ends up in the generated data and on the plots.
	# NOTE(review): the Prinicipal band has s_low == s_high, so
	# random_from_distribution gets a zero standard deviation and every salary
	# in that rank comes out as exactly 80 - confirm that this is intended.
	_RANK_BANDS = {
	    "Prinicipal": ("P", 80, 80),
	    "Senior Practice Director": ("SPD", 150, 250),
	    "Practice Director": ("PD", 130, 190),
	    "Senior Associate": ("SA", 120, 140),
	    "Interior Designer": ("ID", 80, 120),
	    "Architect": ("Arch", 85, 120),
	    "Architecture Grad": ("Grad", 75, 90),
	    "Student": ("Student", 65, 80),
	    "Other Streams": ("Other", 70, 170),
	    "BIM Manager": ("BM", 110, 150),
	    "Technical": ("T", 100, 200),
	}

	# (male, female) head-count per rank for the "real_BVN_numbers" strategy.
	_REAL_HEADCOUNT = {
	    "Prinicipal": (11, 6),
	    "Senior Practice Director": (16, 7),
	    "Practice Director": (16, 13),
	    "Senior Associate": (21, 22),
	    "Interior Designer": (0, 14),
	    "Architect": (23, 16),
	    "Architecture Grad": (22, 17),
	    "Student": (2, 9),
	    "Other Streams": (21, 33),
	    "BIM Manager": (5, 1),
	    "Technical": (8, 4),
	}

	# (male, female) head-count per rank with equal numbers of each sex.
	_BALANCED_HEADCOUNT = {
	    "Prinicipal": (8, 8),
	    "Senior Practice Director": (11, 11),
	    "Practice Director": (15, 15),
	    "Senior Associate": (21, 21),
	    "Interior Designer": (7, 7),
	    "Architect": (20, 20),
	    "Architecture Grad": (19, 19),
	    "Student": (5, 5),
	    "Other Streams": (26, 26),
	    "BIM Manager": (3, 3),
	    "Technical": (6, 6),
	}


	def _build_strategy(headcount):
	    """Merge per-rank (male, female) head-counts with the shared salary bands.

	    Returns a new dict, with new inner dicts, on every call so that the
	    strategies never share mutable state.
	    """
	    strategy = {}
	    for rank, (abbr, s_low, s_high) in _RANK_BANDS.items():
	        males, females = headcount[rank]
	        strategy[rank] = {
	            "m": males,
	            "f": females,
	            "abbr": abbr,
	            "s_low": s_low,
	            "s_high": s_high,
	        }
	    return strategy


	# Each strategy maps rank -> {"m", "f", "abbr", "s_low", "s_high"}.
	# "male_students" and "fire_the_boys" are the real numbers with a single
	# rank changed.
	strategies = {
	    "real_BVN_numbers": _build_strategy(_REAL_HEADCOUNT),
	    "balanced_numbers": _build_strategy(_BALANCED_HEADCOUNT),
	    "male_students": _build_strategy({**_REAL_HEADCOUNT, "Student": (16, 0)}),
	    "fire_the_boys": _build_strategy(
	        {**_REAL_HEADCOUNT, "Senior Practice Director": (10, 13)}
	    ),
	}
	# The strategy used by the rest of the script.
	people_meta = strategies["balanced_numbers"]


	# %%
	def make_people_data():
	    """Expand the head-count summary in ``people_meta`` into one row per person.

	    For every rank, ``data["m"]`` male rows are generated first and then
	    ``data["f"]`` female rows.  Each row gets its own salary drawn with
	    ``random_from_distribution(data["s_low"], data["s_high"])``.

	    Returns:
	        A ``pandas.DataFrame`` built from those rows, with the fields
	        ``rank``, ``sex``, ``salary`` and ``abbr``.
	    """
	    people = []
	    for rank, data in people_meta.items():
	        for sex in ["m", "f"]:
	            for _ in range(data[sex]):
	                people.append(
	                    {
	                        "rank": rank,
	                        "sex": sex,
	                        "salary": random_from_distribution(
	                            data["s_low"], data["s_high"]
	                        ),
	                        "abbr": data["abbr"],
	                    }
	                )
	    p_df = pd.DataFrame(people)
	    return p_df


	people_df = make_people_data()


	# %%
	# Mean and median salary for every (rank, sex) pair.  The aggregations are
	# named by string so pandas uses its own group-wise implementations;
	# passing the NumPy callables instead triggers a FutureWarning on recent
	# pandas.
	mean_salary = people_df.groupby(["rank", "sex"]).agg(
	    Mean=("salary", "mean"), Median=("salary", "median")
	)
	# Move "sex" from the row index into the columns: one row per rank, and
	# two-level columns such as ("Mean", "m") and ("Median", "f").
	summary_salary_df = mean_salary.unstack()
	# Percentage by which the male figure exceeds the female figure within
	# each rank (positive means men are paid more).
	summary_salary_df["pc_diff_mean"] = (
	    (summary_salary_df[("Mean", "m")] - summary_salary_df[("Mean", "f")])
	    / summary_salary_df[("Mean", "f")]
	) * 100
	summary_salary_df["pc_diff_median"] = (
	    (summary_salary_df[("Median", "m")] - summary_salary_df[("Median", "f")])
	    / summary_salary_df[("Median", "f")]
	) * 100
	# %%
	summary_salary_df


	# %%
	def mean_and_median_delta(df):
	    """Return the overall male-versus-female pay gap as two percentages.

	    Args:
	        df: DataFrame with a ``sex`` column that contains both ``"m"`` and
	            ``"f"``, and a numeric ``salary`` column.

	    Returns:
	        ``(pc_delta_mean, pc_delta_median)``: how far the male mean (and
	        median) salary lies above the female one, as a percentage of the
	        female figure.  Positive values mean men are paid more.
	    """
	    m_f_df_mean = df.groupby("sex").salary.mean()
	    m_f_df_median = df.groupby("sex").salary.median()
	    pc_delta_mean = (
	        (m_f_df_mean.loc["m"] - m_f_df_mean.loc["f"]) / m_f_df_mean.loc["f"]
	    ) * 100
	    pc_delta_median = (
	        (m_f_df_median.loc["m"] - m_f_df_median.loc["f"]) / m_f_df_median.loc["f"]
	    ) * 100
	    return pc_delta_mean, pc_delta_median


	# Overall gap for the single cohort generated above, reported to two
	# decimals.  The lines are joined with explicit "\n" so the printed text
	# does not depend on how this statement is indented.
	pc_delta_mean, pc_delta_median = mean_and_median_delta(people_df)
	print(
	    "\n"
	    f"The mean salary gap for this cohort is   {round(pc_delta_mean, 2)}%.\n"
	    f"The median salary gap for this cohort is {round(pc_delta_median, 2)}%."
	)


	# %%
	def draw_v_plot(mean_or_median="mean"):
	    """Draw a split violin plot of salary by rank and sex, annotated per rank.

	    Reads the module-level ``people_df``, ``summary_salary_df``,
	    ``pc_delta_mean`` and ``pc_delta_median``.  The figure is displayed
	    with ``plt.show()``; nothing is returned.

	    Args:
	        mean_or_median: ``"mean"`` or ``"median"``.  Chooses which per-sex
	            figure is printed under each rank and which colour palette is
	            used.  Any other value prints a message and returns without
	            drawing.
	    """
	    if mean_or_median == "mean":
	        summary_column = "Mean"
	        palette = "summer"
	    elif mean_or_median == "median":
	        summary_column = "Median"
	        palette = "spring"
	    else:
	        print("we're only supporting [mean|median] at the moment, use one of those two")
	        return

	    # The theme only applies to axes created after it is set, so set it
	    # before the figure is built.
	    sns.set_theme(style="whitegrid")
	    _, ax = plt.subplots(figsize=(16, 9))
	    sns.violinplot(
	        ax=ax,
	        data=people_df,
	        x="rank",
	        y="salary",
	        hue="sex",
	        palette=palette,
	        split=True,
	    )
	    ax.set(
	        xlabel="",
	        ylabel="Salary ($)",
	        title=(
	            "Salary distribution violin plot, grouped by title and sex\n"
	            "Overall gender pay gap for this group is "
	            f"{round(pc_delta_mean, 2)}% (mean) and {round(pc_delta_median, 2)}% (median).\n"
	            f"Using {summary_column} as summary method"
	        ),
	        ylim=(0, 275),
	    )

	    wrap_point = 12
	    # Rank names in axis order, read once while the labels are still
	    # unwrapped; the position in this list is the x coordinate of the rank.
	    tick_names = [tick.get_text() for tick in ax.get_xticklabels()]
	    for name, d in summary_salary_df.iterrows():
	        mx = round(d[summary_column]["m"], 1)
	        fx = round(d[summary_column]["f"], 1)
	        # Selecting a gap column from the row yields a one-element Series;
	        # take its value by position.
	        delta_pc_mean = round(d["pc_diff_mean"].iloc[0], 1)
	        delta_pc_median = round(d["pc_diff_median"].iloc[0], 1)
	        rank_name = textwrap.fill(name, wrap_point)

	        annotation = "\n".join(
	            [
	                rank_name,
	                f"mX̄: ${mx}",
	                f"fX̄: ${fx}",
	                f"X̄∆: {delta_pc_mean}%",
	                f"x͂∆: {delta_pc_median}%",
	            ]
	        )
	        # y=5 places the text just above the bottom of the 0-275 salary axis.
	        ax.text(tick_names.index(name), 5, annotation, fontsize=10, ha="left")

	    ax.set_xticklabels([textwrap.fill(label, wrap_point) for label in tick_names])
	    sns.despine()
	    plt.tight_layout()
	    plt.show()


	# Render the violin plot once per summary method, one notebook cell each.
	draw_v_plot(mean_or_median="mean")
	# %%
	draw_v_plot(mean_or_median="median")
	# %%
	# TODO: filter out rank\|sex that have fewer than N members. E.g. don't show the salary of the one female bim manager
	# TODO: Move the summary text over to the left. I'm not sure if it can be aligned on the colon
	# TODO: align the colours to the brand colours
	# ✔ TODO: show overall sex pay gap
	# TODO: show mean and median on the same hist


	# %%
	def gap_distribution(runs=10000):
	    """Simulate the cohort ``runs`` times and collect each run's pay gap.

	    Every run builds a fresh data set with ``make_people_data()`` and
	    measures it with ``mean_and_median_delta()``.

	    Returns:
	        Two lists of length ``runs``, in run order: the mean-based gap
	        percentages and the median-based gap percentages.
	    """
	    gap_list_mean = []
	    gap_list_median = []
	    for _ in range(runs):
	        p_df = make_people_data()
	        pc_delta_mean, pc_delta_median = mean_and_median_delta(p_df)
	        gap_list_mean.append(pc_delta_mean)
	        gap_list_median.append(pc_delta_median)
	    return gap_list_mean, gap_list_median


	def draw_histogram(gap_list_mean, gap_list_median, sim_run_count=10000):
	    """Overlay step histograms of the simulated mean and median pay gaps.

	    Args:
	        gap_list_mean: Mean-based gap percentages, one per simulation run.
	        gap_list_median: Median-based gap percentages, one per simulation
	            run.
	        sim_run_count: Number of runs quoted in the title.  It is used for
	            the title text only and is not checked against the list
	            lengths.
	    """
	    _, ax = plt.subplots(figsize=(16, 9))
	    # Label each series so the legend distinguishes the two outlines.
	    ax.hist(gap_list_mean, bins=100, histtype="step", label="Mean gap")
	    ax.hist(gap_list_median, bins=100, histtype="step", label="Median gap")
	    # Alternative view: density curves via
	    # pd.Series(gap_list_mean).plot(kind='density') and the same for the
	    # median.
	    ax.set(
	        xlabel="% pay gap, +ve favours men",
	        ylabel="Number of sim runs with this %",
	        title=f"Distribution of gender pay gap over {sim_run_count} simulations",
	    )
	    ax.legend()


	# %%
	gap_list_mean, gap_list_median = gap_distribution(runs=10000)
	# %%
	# Quote the number of runs actually simulated instead of repeating the
	# literal, so the title cannot drift from the data.
	draw_histogram(gap_list_mean, gap_list_median, sim_run_count=len(gap_list_mean))

	# %%