Skip to content

Instantly share code, notes, and snippets.

@notionparallax
Created March 1, 2024 03:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save notionparallax/be5161be654a2558e06f79639b2e4cf7 to your computer and use it in GitHub Desktop.
Save notionparallax/be5161be654a2558e06f79639b2e4cf7 to your computer and use it in GitHub Desktop.
# %%
import textwrap
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# %%
def random_from_distribution(a, b, number_of_sigma=3, as_int=True):
    """Draw one sample from a normal distribution spanning ``a`` to ``b``.

    The distribution is centred on the midpoint of ``a`` and ``b``; its
    standard deviation is sized so that each bound sits ``number_of_sigma``
    standard deviations from that midpoint.  Samples are not clipped, so a
    draw can land outside ``[a, b]``.

    Args:
        a: One end of the range.
        b: The other end of the range.
        number_of_sigma: How many standard deviations separate the midpoint
            from either bound.
        as_int: When true, the sample is passed through ``int()`` (which
            truncates toward zero); otherwise the raw float is returned.

    Returns:
        The sampled value, as an ``int`` or a float depending on ``as_int``.
    """
    centre = (a + b) / 2
    spread = abs(centre - a) / number_of_sigma
    sample = np.random.normal(centre, spread)
    return int(sample) if as_int else sample
# %% Balancing strategies
# Every strategy below uses the same abbreviation and salary band for a given
# rank; only the male/female headcounts change.  The bands are therefore
# listed once, in the order the ranks should appear.
# Each row is (rank, abbr, s_low, s_high); s_low/s_high are the bounds handed
# to random_from_distribution when salaries are generated.
# NOTE(review): "Prinicipal" looks like a misspelling of "Principal" but is
# kept verbatim because it is a runtime key and ends up as a plot label.
# NOTE(review): its band is 80-80 (zero spread) - confirm that is intended.
_RANK_BANDS = (
    ("Prinicipal", "P", 80, 80),
    ("Senior Practice Director", "SPD", 150, 250),
    ("Practice Director", "PD", 130, 190),
    ("Senior Associate", "SA", 120, 140),
    ("Interior Designer", "ID", 80, 120),
    ("Architect", "Arch", 85, 120),
    ("Architecture Grad", "Grad", 75, 90),
    ("Student", "Student", 65, 80),
    ("Other Streams", "Other", 70, 170),
    ("BIM Manager", "BM", 110, 150),
    ("Technical", "T", 100, 200),
)

# Headcounts per rank as (male, female) for the baseline strategy.
_REAL_HEADCOUNTS = {
    "Prinicipal": (11, 6),
    "Senior Practice Director": (16, 7),
    "Practice Director": (16, 13),
    "Senior Associate": (21, 22),
    "Interior Designer": (0, 14),
    "Architect": (23, 16),
    "Architecture Grad": (22, 17),
    "Student": (2, 9),
    "Other Streams": (21, 33),
    "BIM Manager": (5, 1),
    "Technical": (8, 4),
}

# Headcounts for each balancing strategy, in the order the strategies are
# exposed.  The last two differ from the baseline in a single rank each.
_HEADCOUNTS = {
    "real_BVN_numbers": _REAL_HEADCOUNTS,
    "balanced_numbers": {
        "Prinicipal": (8, 8),
        "Senior Practice Director": (11, 11),
        "Practice Director": (15, 15),
        "Senior Associate": (21, 21),
        "Interior Designer": (7, 7),
        "Architect": (20, 20),
        "Architecture Grad": (19, 19),
        "Student": (5, 5),
        "Other Streams": (26, 26),
        "BIM Manager": (3, 3),
        "Technical": (6, 6),
    },
    "male_students": {**_REAL_HEADCOUNTS, "Student": (16, 0)},
    "fire_the_boys": {**_REAL_HEADCOUNTS, "Senior Practice Director": (10, 13)},
}

# strategy name -> rank -> {"m", "f", "abbr", "s_low", "s_high"}.
# A fresh dict is built for every (strategy, rank) pair so no entry is shared.
strategies = {
    strategy: {
        rank: {
            "m": counts[rank][0],
            "f": counts[rank][1],
            "abbr": abbr,
            "s_low": low,
            "s_high": high,
        }
        for rank, abbr, low, high in _RANK_BANDS
    }
    for strategy, counts in _HEADCOUNTS.items()
}

# The strategy used by the rest of the script.
people_meta = strategies["balanced_numbers"]
# %%
def make_people_data():
    """Expand the per-rank summary in ``people_meta`` into one row per person.

    For every rank, and for each sex ("m" then "f"), one record is produced
    per head counted in the summary.  Each record gets its own salary drawn
    by ``random_from_distribution`` from that rank's ``s_low``/``s_high``
    band, so repeated calls give different salaries.

    Returns:
        A DataFrame built from the records, whose keys are ``rank``,
        ``sex``, ``salary`` and ``abbr``.
    """
    records = [
        {
            "rank": rank,
            "sex": sex,
            "salary": random_from_distribution(band["s_low"], band["s_high"]),
            "abbr": band["abbr"],
        }
        for rank, band in people_meta.items()
        for sex in ("m", "f")
        for _ in range(band[sex])
    ]
    return pd.DataFrame(records)
# One simulated cohort; the summary and plots below all read from it.
people_df = make_people_data()
# %%
# Mean and median salary for each sex within each rank.
# The aggregations are named by string rather than passed as np.mean /
# np.median: recent pandas warns about NumPy callables in agg() and will
# change how they are applied, whereas the string names are stable.
mean_salary = people_df.groupby(["rank", "sex"]).agg(
    Mean=("salary", "mean"), Median=("salary", "median")
)
# Pivot sex out of the row index so each rank is a single row with
# (statistic, sex) column pairs.
summary_salary_df = mean_salary.unstack()
# Percentage by which the male figure differs from the female figure, per
# rank (positive means the male figure is higher).
summary_salary_df["pc_diff_mean"] = (
    (summary_salary_df["Mean"]["m"] - summary_salary_df["Mean"]["f"])
    / summary_salary_df["Mean"]["f"]
) * 100
summary_salary_df["pc_diff_median"] = (
    (summary_salary_df["Median"]["m"] - summary_salary_df["Median"]["f"])
    / summary_salary_df["Median"]["f"]
) * 100
# %%
summary_salary_df
# %%
def mean_and_median_delta(df):
    """Return the overall male-vs-female pay gap as percentages.

    Args:
        df: DataFrame with a ``sex`` column containing both "m" and "f"
            rows, and a numeric ``salary`` column.

    Returns:
        A ``(mean_gap, median_gap)`` tuple.  Each value is
        ``(male - female) / female * 100`` for the respective statistic, so a
        positive number means the male figure is higher.
    """
    salary_by_sex = df.groupby("sex").salary

    def percent_gap(per_sex):
        # Gap expressed relative to the female figure.
        male, female = per_sex.loc["m"], per_sex.loc["f"]
        return ((male - female) / female) * 100

    return percent_gap(salary_by_sex.mean()), percent_gap(salary_by_sex.median())
# Overall gap figures for the cohort; draw_v_plot also reads these globals
# for its title.
pc_delta_mean, pc_delta_median = mean_and_median_delta(people_df)
# The leading empty entry reproduces the blank line printed before the text.
print(
    "\n".join(
        [
            "",
            f"The mean salary gap for this cohort is {round(pc_delta_mean, 2)}%.",
            f"The median salary gap for this cohort is {round(pc_delta_median, 2)}%.",
        ]
    )
)
# %%
def draw_v_plot(mean_or_median="mean"):
    """Draw a split violin plot of salary by rank and sex, annotated per rank.

    Reads the module-level ``people_df`` (plotted data), ``summary_salary_df``
    (per-rank annotations) and ``pc_delta_mean`` / ``pc_delta_median``
    (overall figures quoted in the title).  The figure is shown with
    ``plt.show()``; nothing is returned.

    Args:
        mean_or_median: ``"mean"`` or ``"median"``.  Selects which summary
            column supplies the per-sex figures in each annotation and which
            colour palette is used.  Any other value prints a hint and
            returns without drawing.
    """
    if mean_or_median == "mean":
        summary_column = "Mean"
        palette = "summer"
    elif mean_or_median == "median":
        summary_column = "Median"
        palette = "spring"
    else:
        print("we're only supporting [mean|median] at the moment, use one of those two")
        return

    # Set the theme before creating the figure: it only influences axes
    # created afterwards, so setting it later leaves the first plot unstyled.
    sns.set_theme(style="whitegrid")
    _fig, ax = plt.subplots(figsize=(16, 9))
    sns.violinplot(
        ax=ax,
        data=people_df,
        x="rank",
        y="salary",
        hue="sex",
        palette=palette,
        split=True,
    )
    ax.set(
        xlabel="",
        ylabel="Salary ($)",
        title=(
            "Salary distribution violin plot, grouped by title and sex\n"
            "Overall gender pay gap for this group is "
            f"{round(pc_delta_mean, 2)}% (mean) and {round(pc_delta_median, 2)}% (median).\n"
            f"Using {summary_column} as summary method"
        ),
        ylim=(0, 275),
    )

    wrap_point = 12
    # Tick labels are still the raw rank names here (they are wrapped only
    # after the loop), so a label's list index is the rank's x position.
    tick_names = [tick.get_text() for tick in ax.get_xticklabels()]
    for name, d in summary_salary_df.iterrows():
        mx = round(d[summary_column]["m"], 1)
        fx = round(d[summary_column]["f"], 1)
        # The pc_diff_* entries are one-element Series; take the element by
        # position explicitly instead of relying on integer-key fallback.
        delta_pc_mean = round(d["pc_diff_mean"].iloc[0], 1)
        delta_pc_median = round(d["pc_diff_median"].iloc[0], 1)
        rank_name = textwrap.fill(name, wrap_point)
        annotation = "\n".join(
            [
                f"{rank_name}",
                f"mX̄: ${mx}",
                f"fX̄: ${fx}",
                f"X̄∆: {delta_pc_mean}%",
                f"x͂∆: {delta_pc_median}%",
            ]
        )
        ax.text(
            tick_names.index(name),
            5,
            annotation,
            fontsize=10,
            ha="left",
        )

    # Wrap long rank names on the axis itself so neighbouring labels do not
    # collide.
    ax.set_xticklabels(
        [textwrap.fill(t.get_text(), wrap_point) for t in ax.get_xticklabels()]
    )
    sns.despine()
    plt.tight_layout()
    plt.show()
# Render the annotated violin plot once per summary statistic, one per cell.
draw_v_plot("mean")
# %%
draw_v_plot("median")
# %%
# TODO: filter out rank|sex groups that have fewer than N members, e.g. don't show the salary of the one female BIM Manager
# TODO: Move the summary text over to the left. I'm not sure if it can be aligned on the colon
# TODO: align the colours to the brand colours
# ✔ TODO: show overall sex pay gap
# TODO: show mean and median on the same hist
# %%
def gap_distribution(runs=10000):
    """Simulate the cohort ``runs`` times and collect the pay gap of each run.

    Every run builds a fresh cohort with ``make_people_data`` and measures it
    with ``mean_and_median_delta``.

    Args:
        runs: Number of independent simulations to perform.

    Returns:
        Two lists of length ``runs``: the mean-based gap percentages and the
        median-based gap percentages, in run order.
    """
    gaps = [mean_and_median_delta(make_people_data()) for _ in range(runs)]
    mean_gaps = [mean_gap for mean_gap, _ in gaps]
    median_gaps = [median_gap for _, median_gap in gaps]
    return mean_gaps, median_gaps
def draw_histogram(gap_list_mean, gap_list_median, sim_run_count=10000):
    """Overlay step histograms of the simulated mean and median pay gaps.

    Args:
        gap_list_mean: Mean-based gap percentages, one per simulation run.
        gap_list_median: Median-based gap percentages, one per simulation run.
        sim_run_count: Number of runs quoted in the plot title; it is only
            used for the title text and is not checked against the lists.
    """
    _fig, ax = plt.subplots(figsize=(16, 9))
    # Label each series so the legend can tell the two outlines apart.
    ax.hist(gap_list_mean, bins=100, histtype="step", label="Mean pay gap")
    ax.hist(gap_list_median, bins=100, histtype="step", label="Median pay gap")
    # Alternative view that was tried: pd.Series(...).plot(kind="density").
    ax.set(
        xlabel="% pay gap, +ve favours men",
        ylabel="Number of sim runs with this %",
        title=f"Distribution of gender pay gap over {sim_run_count} simulations",
    )
    ax.legend()
# %%
# Run the Monte Carlo simulation (slow: every run rebuilds the whole cohort).
gap_list_mean, gap_list_median = gap_distribution(runs=10000)
# %%
# One gap value is recorded per run, so the list length is the run count.
draw_histogram(gap_list_mean, gap_list_median, sim_run_count=len(gap_list_mean))
# %%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment