Last active
May 21, 2020 10:35
-
-
Save sidravi1/86877a58bf6d534872311e08200177dc to your computer and use it in GitHub Desktop.
Expected utility vs. prediction probs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from sklearn.datasets import load_breast_cancer | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.model_selection import train_test_split | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
def get_data(random_state=None):
    """Load the breast-cancer dataset and split it into train/test parts.

    The scikit-learn breast-cancer data is used as a stand-in for a loan
    book: the binary ``target`` plays the role of the default label and the
    ``"worst concave points"`` feature is renamed to ``"loan_amount"`` so it
    can be used as the size of each loan.

    Parameters
    ----------
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default (``None``) keeps the
        split random on every call; pass an int for a reproducible split.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` exactly as returned by
        ``train_test_split``; the ``X`` parts are DataFrames that contain the
        ``"loan_amount"`` column.
    """
    cancer_dict = load_breast_cancer()
    features = pd.DataFrame(
        cancer_dict["data"], columns=cancer_dict["feature_names"]
    )
    # Re-label one feature so that downstream code can treat it as a loan size.
    features = features.rename(columns={"worst concave points": "loan_amount"})
    target = cancer_dict["target"]
    return train_test_split(features, target, random_state=random_state)
def get_probability_of_default(X_train, X_test, y_train, y_test):
    """Fit a logistic regression and score the test set.

    Parameters
    ----------
    X_train, y_train
        Features and labels used to fit the classifier.
    X_test
        Features to score.
    y_test
        Not used; accepted so the four-element result of a train/test split
        can be unpacked straight into this function.

    Returns
    -------
    numpy.ndarray
        Column 1 of ``predict_proba(X_test)``, i.e. the predicted probability
        of the second class known to the classifier (label ``1`` for a 0/1
        target), one value per test row.
    """
    model = LogisticRegression(max_iter=10000)
    model.fit(X_train, y_train)
    class_probabilities = model.predict_proba(X_test)
    return class_probabilities[:, 1]
def get_utlity(n_items, label, default_prob, loan_amount, interest=0.1):
    """Compare two ways of picking ``n_items`` loans by their realised utility.

    For every loan the expected utility is
    ``p * (-amount) + (1 - p) * interest * amount`` where ``p`` is
    ``default_prob``; the realised ("actual") utility uses ``label`` in place
    of ``p``. Two selections are then scored by the sum of realised utility:

    * "weighted": the ``n_items`` loans with the highest expected utility;
    * "unweighted": the ``n_items`` loans with the lowest ``default_prob``.

    Parameters
    ----------
    n_items : int
        Number of loans to select. ``0`` selects nothing (both values are 0);
        values larger than the number of loans select all of them.
    label : array-like
        Observed outcome per loan (1 = default, 0 = repaid).
    default_prob : array-like
        Predicted probability of default per loan.
    loan_amount : array-like
        Size of each loan; values are matched to the other inputs by position.
    interest : float, optional
        Return earned on a loan that is repaid, as a fraction of its amount.

    Returns
    -------
    tuple
        ``(pred_df, values)`` where ``pred_df`` has the columns ``label``,
        ``utility``, ``prediction``, ``loan_amount`` and ``actual_utility``
        (one row per loan) and ``values`` is a dict with the keys
        ``"weighted_value"`` and ``"unweighted_value"``.

    Raises
    ------
    ValueError
        If ``n_items`` is negative.
    """
    if n_items < 0:
        raise ValueError("n_items must be non-negative, got {}".format(n_items))

    utility = (
        default_prob * (-loan_amount) + (1 - default_prob) * interest * loan_amount
    )
    actual_utility = label * (-loan_amount) + (1 - label) * interest * loan_amount
    pred_df = pd.DataFrame(
        data=np.vstack([label, utility, default_prob, loan_amount, actual_utility]).T,
        columns=["label", "utility", "prediction", "loan_amount", "actual_utility"],
    )

    # Highest expected utility: take the tail of the ascending sort. An
    # explicit start index is used because a ``[-n_items:]`` slice would
    # return *every* row when n_items == 0 (since -0 == 0).
    by_utility = pred_df.sort_values("utility")
    first_kept = max(len(by_utility) - n_items, 0)
    top_weighted = by_utility.iloc[first_kept:]
    weighted_value = top_weighted["actual_utility"].sum()

    # Lowest predicted chance of default: take the head of the ascending sort.
    top = pred_df.sort_values("prediction").iloc[:n_items]
    value = top["actual_utility"].sum()

    return pred_df, {"weighted_value": weighted_value, "unweighted_value": value}
def run_simulation(n_sims, noise_alpha=0.5, interest=0.1):
    """Run repeated noisy-prediction simulations of both selection strategies.

    The data is split and the classifier fitted once. Each simulation then
    blends the model's predictions 50/50 with fresh
    ``Beta(noise_alpha, noise_alpha)`` noise and, for every possible number
    of selected loans (1 up to the size of the test set), records the
    realised value of each strategy as computed by ``get_utlity``.

    Parameters
    ----------
    n_sims : int
        Number of simulations to run.
    noise_alpha : float, optional
        Both shape parameters of the Beta noise distribution.
    interest : float, optional
        Interest rate forwarded to ``get_utlity``.

    Returns
    -------
    pandas.DataFrame
        One row per (simulation, number selected) with the columns
        ``weighted_value``, ``unweighted_value``, ``n_selected`` and ``sim``.
    """
    splits = get_data()  # [X_train, X_test, y_train, y_test]
    base_prob = get_probability_of_default(*splits)

    # Values that do not change between simulations.
    test_labels = splits[3]
    test_loans = splits[1]["loan_amount"]
    selection_sizes = np.arange(1, test_labels.shape[0] + 1)

    per_sim_frames = []
    for sim in range(n_sims):
        beta_noise = np.random.beta(
            noise_alpha, noise_alpha, size=base_prob.shape[0]
        )
        # Average of prediction and noise keeps the result inside [0, 1].
        noisy_prob = (base_prob + beta_noise) / 2

        rows = []
        for n_items in selection_sizes:
            _, values = get_utlity(
                n_items, test_labels, noisy_prob, test_loans, interest
            )
            rows.append(values)

        sim_df = pd.DataFrame(rows)
        sim_df["n_selected"] = selection_sizes
        sim_df["sim"] = sim
        per_sim_frames.append(sim_df)

    return pd.concat(per_sim_frames, ignore_index=True)
if __name__ == "__main__":
    # Simulation settings. These same variables drive both the simulation and
    # the plot title, so the title always reflects what was actually run.
    n_sim = 100
    noise_alpha = 0.5
    interest = 0.3
    all_utilities_df = run_simulation(
        n_sim, noise_alpha=noise_alpha, interest=interest
    )

    plt.figure(figsize=(7, 5))

    # One faint line per simulation and strategy.
    for _, sim_df in all_utilities_df.groupby("sim"):
        plt.plot(
            sim_df["n_selected"],
            sim_df["weighted_value"],
            color="dodgerblue",
            alpha=0.2,
            lw=0.5,
        )
        plt.plot(
            sim_df["n_selected"],
            sim_df["unweighted_value"],
            color="pink",
            alpha=0.3,
            lw=0.5,
        )

    # Bold lines: mean across simulations for each number of selected loans.
    mean_df = (
        all_utilities_df.groupby("n_selected")[["weighted_value", "unweighted_value"]]
        .mean()
        .reset_index()
    )
    plt.plot(
        mean_df["n_selected"],
        mean_df["weighted_value"],
        color="darkblue",
        lw=2,
        label="using utility",
    )
    plt.plot(
        mean_df["n_selected"],
        mean_df["unweighted_value"],
        color="firebrick",
        lw=2,
        label="using predictions",
    )

    plt.grid(ls=":")
    plt.xlabel("number selected")
    plt.ylabel("Utility gained")
    plt.title(
        "noise_alpha:{} | interest:{}".format(noise_alpha, interest),
        loc="left",
        fontdict={"size": 14},
    )
    plt.legend()
    sns.despine(left=True, bottom=True)
    plt.savefig("sim_output.png")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment