# Source: GitHub Gist by @micahmelling (last active September 2, 2021 01:30).
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
from random import shuffle
from copy import deepcopy
from modeling.evaluate import calculate_probability_lift
# Seed NumPy's global random number generator with a fixed value. This affects NumPy only;
# the standard-library `random` module keeps its own, separately seeded state.
np.random.seed(10)

# None removes pandas' row and column display caps and lets it auto-detect the output width.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)
def run_base_case(model, y_test, x_test):
    """
    Score the model on the unmodified test features to produce the baseline row.

    Args:
        model: Fitted classifier exposing ``predict_proba``; its output is labelled with
            exactly two columns, ``0_prob`` and ``1_prob``.
        y_test: True labels, passed as-is to ``log_loss`` and ``calculate_probability_lift``.
        x_test: Feature data handed to ``model.predict_proba``.

    Returns:
        A one-row DataFrame with columns ``scenario`` (always ``'base_case'``),
        ``log_loss`` and ``proba_lift``.
    """
    class_probabilities = model.predict_proba(x_test)
    predictions = pd.DataFrame(class_probabilities, columns=['0_prob', '1_prob'])
    positive_proba = predictions['1_prob']

    baseline_loss = log_loss(y_test, positive_proba)
    # The empty string is forwarded as the helper's third argument; its meaning is defined
    # in modeling.evaluate and is not visible from this file.
    baseline_lift = calculate_probability_lift(y_test, positive_proba, '')

    summary = {
        'scenario': ['base_case'],
        'log_loss': [baseline_loss],
        'proba_lift': [baseline_lift],
    }
    return pd.DataFrame(summary)
def run_univariate_drift_scenario(model, y_test, x_test, column, factor):
    """
    Multiply one feature column by ``factor`` on every row and re-score the model.

    The caller's ``x_test`` is left untouched; the shift is applied to a deep copy.

    Args:
        model: Fitted classifier exposing ``predict_proba``; its output is labelled with
            exactly two columns, ``0_prob`` and ``1_prob``.
        y_test: True labels, passed as-is to ``log_loss`` and ``calculate_probability_lift``.
        x_test: Feature DataFrame containing ``column``.
        column: Name of the column to scale.
        factor: Multiplier applied to every value in ``column``.

    Returns:
        A one-row DataFrame with columns ``scenario`` (``'<column>_<factor>_shift'``),
        ``log_loss`` and ``proba_lift``.
    """
    drifted_features = deepcopy(x_test)
    drifted_features[column] = drifted_features[column] * factor

    predictions = pd.DataFrame(
        model.predict_proba(drifted_features),
        columns=['0_prob', '1_prob'],
    ).reset_index(drop=True)
    positive_proba = predictions['1_prob']

    scenario_label = f'{column}_{factor}_shift'
    return pd.DataFrame({
        'scenario': [scenario_label],
        'log_loss': [log_loss(y_test, positive_proba)],
        # The empty string is forwarded as the helper's third argument (meaning defined
        # in modeling.evaluate).
        'proba_lift': [calculate_probability_lift(y_test, positive_proba, '')],
    })
def run_multivariate_drift_scenario(model, y_test, x_test, column_list, change_obs, factor, scenario_name):
    """
    Scale several feature columns by ``factor`` on a random subset of rows and re-score the model.

    The same randomly chosen rows are shifted in every column of ``column_list``. The caller's
    ``x_test`` is left untouched; the shift is applied to a deep copy.

    Args:
        model: Fitted classifier exposing ``predict_proba``; its output is labelled with
            exactly two columns, ``0_prob`` and ``1_prob``.
        y_test: True labels, passed as-is to ``log_loss`` and ``calculate_probability_lift``.
        x_test: Feature DataFrame containing every column in ``column_list``. Index labels
            are assumed to be unique (``main`` resets the index before calling); with
            duplicate labels every row sharing a chosen label is shifted.
        column_list: Names of the columns to scale.
        change_obs: Number of rows to shift. It is used as a slice bound, so a value larger
            than the number of rows simply selects every row.
        factor: Multiplier applied to the selected rows. Columns are rewritten as a whole,
            so an integer column is upcast when ``factor`` is a float.
        scenario_name: Label stored in the ``scenario`` column of the result.

    Returns:
        A one-row DataFrame with columns ``scenario``, ``log_loss`` and ``proba_lift``.
    """
    local_x_test = deepcopy(x_test)

    # Draw the affected rows from NumPy's global RNG so that the module-level
    # np.random.seed(...) call makes the selection repeatable; random.shuffle reads the
    # standard-library generator, which that seed does not touch.
    shuffled_labels = np.random.permutation(local_x_test.index.to_numpy())
    drifted_rows = local_x_test.index.isin(shuffled_labels[:change_obs])

    for column in column_list:
        # Rewrite the whole column rather than using df[column][index] = value: chained
        # assignment is a per-cell Python loop and, under pandas copy-on-write, writes to
        # a temporary object so the frame is never actually modified.
        scaled = local_x_test[column] * factor
        local_x_test[column] = local_x_test[column].mask(drifted_rows, scaled)

    preds_df = pd.DataFrame(model.predict_proba(local_x_test), columns=['0_prob', '1_prob'])
    ll = log_loss(y_test, preds_df['1_prob'])
    # The empty string is forwarded as the helper's third argument (meaning defined in
    # modeling.evaluate).
    proba_lift = calculate_probability_lift(y_test, preds_df['1_prob'], '')
    return pd.DataFrame({
        'scenario': [scenario_name],
        'log_loss': [ll],
        'proba_lift': [proba_lift],
    })
def main(model_dir='modeling/random_forest_202107252202562870520500'):
    """
    Run the base case and every drift scenario, returning one comparison table.

    Args:
        model_dir: Directory holding ``data/x_test.pkl``, ``data/y_test.pkl`` and
            ``model/model.pkl``. Defaults to the model run this script was written for.

    Returns:
        A DataFrame with one row per scenario (base case, four univariate shifts of
        ``profile_score``, four multivariate shifts of ``profile_score`` and
        ``average_stars``) and columns ``scenario``, ``log_loss`` and ``proba_lift``.
        Rows are labelled 0..n-1.
    """
    # NOTE: joblib.load unpickles its input, which can execute arbitrary code. Only point
    # model_dir at artifacts from a trusted source.
    x_test = joblib.load(f'{model_dir}/data/x_test.pkl')
    y_test = joblib.load(f'{model_dir}/data/y_test.pkl')
    model = joblib.load(f'{model_dir}/model/model.pkl')

    # Give features and labels a matching 0..n-1 index.
    x_test = x_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    univariate_factors = (1.01, 1.03, 1.05, 1.10)
    drift_columns = ['profile_score', 'average_stars']
    # (rows to change, multiplier, scenario name)
    multivariate_scenarios = [
        (10, 1.01, 'small_multivariate'),
        (500, 1.05, 'moderate_multivariate'),
        (1_500, 1.10, 'strong_multivariate'),
        (4_000, 2.0, 'very_strong_multivariate'),
    ]

    results = [run_base_case(model, y_test, x_test)]
    results.extend(
        run_univariate_drift_scenario(model, y_test, x_test, 'profile_score', factor)
        for factor in univariate_factors
    )
    results.extend(
        run_multivariate_drift_scenario(model, y_test, x_test, drift_columns, change_obs, factor, name)
        for change_obs, factor, name in multivariate_scenarios
    )

    # Concatenate once instead of growing a frame row by row. ignore_index gives each
    # scenario a unique row label; every per-scenario frame is otherwise labelled 0.
    return pd.concat(results, axis=0, ignore_index=True)
# Script entry point: build the scenario comparison table and print it.
if __name__ == '__main__':
    comparison_df = main()
    print(comparison_df)
# (End of gist; the GitHub sign-in/comment footer is not part of the script.)