Skip to content

Instantly share code, notes, and snippets.

@sidravi1
Last active May 21, 2020 10:35
Show Gist options
  • Save sidravi1/86877a58bf6d534872311e08200177dc to your computer and use it in GitHub Desktop.
Save sidravi1/86877a58bf6d534872311e08200177dc to your computer and use it in GitHub Desktop.
Expected utility vs. prediction probabilities
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
def get_data(random_state=None):
    """Load the breast-cancer dataset and split it into train and test parts.

    The dataset stands in for a loan book: the feature column
    ``"worst concave points"`` is renamed to ``"loan_amount"`` so the rest of
    the script can read it as the size of each loan.

    Args:
        random_state: Optional seed forwarded to ``train_test_split``. The
            default ``None`` keeps the original behaviour of a fresh random
            split on every call.

    Returns:
        The ``[X_train, X_test, y_train, y_test]`` sequence produced by
        ``train_test_split``. The ``X`` parts are DataFrames carrying the
        renamed ``"loan_amount"`` column.
    """
    cancer = load_breast_cancer()
    features = pd.DataFrame(cancer["data"], columns=cancer["feature_names"])
    features = features.rename(columns={"worst concave points": "loan_amount"})
    return train_test_split(features, cancer["target"], random_state=random_state)
def get_probability_of_default(X_train, X_test, y_train, y_test):
    """Fit a logistic regression on the training split and score the test rows.

    Args:
        X_train: Training features.
        X_test: Features of the rows to score.
        y_train: Training labels.
        y_test: Not used here; accepted so that the four-way split can be
            passed straight through with ``*``.

    Returns:
        The second column of ``predict_proba`` for ``X_test``, which the rest
        of the script treats as each row's probability of default.
    """
    model = LogisticRegression(max_iter=10000)
    model.fit(X_train, y_train)
    class_probabilities = model.predict_proba(X_test)
    return class_probabilities[:, 1]
def get_utlity(n_items, label, default_prob, loan_amount, interest=0.1):
    """Compare two ways of choosing ``n_items`` loans and report the payoff.

    Every loan gets an expected utility: lose the whole amount with
    probability ``default_prob``, otherwise earn ``interest * loan_amount``.
    The realised ("actual") utility applies the same formula with the true
    ``label`` in place of the probability.

    Two selections are then scored by the sum of their actual utility:

    * ``weighted_value``   - the ``n_items`` loans with the highest expected
      utility;
    * ``unweighted_value`` - the ``n_items`` loans with the lowest predicted
      default probability.

    Args:
        n_items: Number of loans to select. ``0`` selects nothing.
        label: Per-loan outcome, 1 for default and 0 otherwise.
        default_prob: Per-loan predicted probability of default.
        loan_amount: Per-loan amount.
        interest: Fraction of the amount earned when a loan does not default.

    Returns:
        A tuple ``(pred_df, values)``. ``pred_df`` has one row per loan and
        the columns ``label``, ``utility``, ``prediction``, ``loan_amount``
        and ``actual_utility``. ``values`` is a dict with the keys
        ``"weighted_value"`` and ``"unweighted_value"``.
    """
    # Work on plain float arrays so that inputs are always combined by
    # position, whatever index a pandas Series argument happens to carry.
    label = np.asarray(label, dtype=float)
    default_prob = np.asarray(default_prob, dtype=float)
    loan_amount = np.asarray(loan_amount, dtype=float)

    utility = (
        default_prob * (-loan_amount) + (1 - default_prob) * interest * loan_amount
    )
    actual_utility = label * (-loan_amount) + (1 - label) * interest * loan_amount

    pred_df = pd.DataFrame(
        {
            "label": label,
            "utility": utility,
            "prediction": default_prob,
            "loan_amount": loan_amount,
            "actual_utility": actual_utility,
        }
    )

    # tail()/head() rather than slicing: ``[-n_items:]`` with n_items == 0 is
    # ``[0:]`` and would select every row instead of none.
    by_utility = pred_df.sort_values("utility")
    weighted_value = by_utility["actual_utility"].tail(n_items).sum()

    by_prediction = pred_df.sort_values("prediction")
    value = by_prediction["actual_utility"].head(n_items).sum()

    return pred_df, {"weighted_value": weighted_value, "unweighted_value": value}
def run_simulation(n_sims, noise_alpha=0.5, interest=0.1):
    """Repeat the selection comparison under randomly perturbed predictions.

    The data is loaded and the model is fitted once. In each simulation the
    test-set predictions are averaged with a fresh
    ``Beta(noise_alpha, noise_alpha)`` draw, and ``get_utlity`` is evaluated
    for every selection size from 1 up to the number of test rows.

    Args:
        n_sims: Number of simulations to run.
        noise_alpha: Both shape parameters of the Beta noise distribution.
        interest: Interest rate forwarded to ``get_utlity``.

    Returns:
        A DataFrame with one row per (simulation, selection size) and the
        columns ``weighted_value``, ``unweighted_value``, ``n_selected`` and
        ``sim``. It has no rows when ``n_sims`` is 0.
    """
    datasets = get_data()
    X_test, y_test = datasets[1], datasets[3]
    y_pred = get_probability_of_default(*datasets)

    # Loop invariants: the loan column and the list of selection sizes.
    loan_amount = X_test["loan_amount"]
    n_selected = np.arange(1, y_test.shape[0] + 1)

    all_utilities = []
    for sim in range(n_sims):
        noise = np.random.beta(noise_alpha, noise_alpha, size=y_pred.shape[0])
        y_pred_noised = (y_pred + noise) / 2
        df = pd.DataFrame(
            [
                get_utlity(n_items, y_test, y_pred_noised, loan_amount, interest)[1]
                for n_items in n_selected
            ]
        )
        df["n_selected"] = n_selected
        df["sim"] = sim
        all_utilities.append(df)

    if not all_utilities:
        # pd.concat raises on an empty list; hand back an empty frame with
        # the usual columns instead.
        return pd.DataFrame(
            columns=["weighted_value", "unweighted_value", "n_selected", "sim"]
        )
    return pd.concat(all_utilities, ignore_index=True)
if __name__ == "__main__":
    # Simulation settings; they are also shown in the plot title.
    n_sim = 100
    noise_alpha = 0.5
    interest = 0.3

    # Pass the named settings (not repeated literals) so the title below
    # always describes the run that was actually simulated.
    all_utilities_df = run_simulation(
        n_sim, noise_alpha=noise_alpha, interest=interest
    )

    plt.figure(figsize=(7, 5))

    # One faint pair of curves per simulation.
    for _, df in all_utilities_df.groupby("sim"):
        plt.plot(
            df["n_selected"],
            df["weighted_value"],
            color="dodgerblue",
            alpha=0.2,
            lw=0.5,
        )
        plt.plot(
            df["n_selected"], df["unweighted_value"], color="pink", alpha=0.3, lw=0.5
        )

    # Bold curves: the mean over simulations at each selection size.
    mean_df = all_utilities_df.groupby(["n_selected"]).mean().reset_index()
    plt.plot(
        mean_df["n_selected"],
        mean_df["weighted_value"],
        color="darkblue",
        lw=2,
        label="using utility",
    )
    plt.plot(
        mean_df["n_selected"],
        mean_df["unweighted_value"],
        color="firebrick",
        lw=2,
        label="using predictions",
    )

    plt.grid(ls=":")
    plt.xlabel("number selected")
    plt.ylabel("Utility gained")
    plt.title(
        "noise_alpha:{} | interest:{}".format(noise_alpha, interest),
        loc="left",
        fontdict={"size": 14},
    )
    plt.legend()
    sns.despine(left=True, bottom=True)
    plt.savefig("sim_output.png")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment