/data_drift_comparison.py Secret
Last active
September 2, 2021 01:30
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import joblib | |
import pandas as pd | |
import numpy as np | |
from sklearn.metrics import log_loss | |
from random import shuffle | |
from copy import deepcopy | |
from modeling.evaluate import calculate_probability_lift | |
# Fix the seed of NumPy's global random number generator.
np.random.seed(10)

# Set every pandas display limit below to None so the comparison table is
# printed without pandas' default row/column/width limits.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)
def run_base_case(model, y_test, x_test):
    """Score the model on the unmodified test set.

    Args:
        model: Fitted estimator exposing ``predict_proba``. Its output is
            labelled with exactly two columns, so a two-class output is
            required.
        y_test: True labels aligned with the rows of ``x_test``.
        x_test: Feature matrix handed to ``model.predict_proba`` as-is.

    Returns:
        A one-row DataFrame with the columns ``scenario`` (always
        ``'base_case'``), ``log_loss`` and ``proba_lift``.
    """
    class_probabilities = pd.DataFrame(
        model.predict_proba(x_test),
        columns=['0_prob', '1_prob'],
    )
    positive_proba = class_probabilities['1_prob']

    # NOTE(review): the third argument of calculate_probability_lift is passed
    # as an empty string; its meaning is defined in modeling.evaluate.
    summary = {
        'scenario': ['base_case'],
        'log_loss': [log_loss(y_test, positive_proba)],
        'proba_lift': [calculate_probability_lift(y_test, positive_proba, '')],
    }
    return pd.DataFrame(summary)
def run_univariate_drift_scenario(model, y_test, x_test, column, factor):
    """Score the model after scaling one feature column by a constant.

    Args:
        model: Fitted estimator exposing ``predict_proba`` with a two-column
            output.
        y_test: True labels aligned with the rows of ``x_test``.
        x_test: Feature DataFrame; it is copied, so the caller's frame is
            left unchanged.
        column: Name of the column whose values are multiplied by ``factor``.
        factor: Multiplier applied to every value in ``column``.

    Returns:
        A one-row DataFrame with the columns ``scenario`` (named
        ``'<column>_<factor>_shift'``), ``log_loss`` and ``proba_lift``.
    """
    # Work on a private copy so the drift does not leak into later scenarios.
    drifted_features = deepcopy(x_test)
    scaled_values = drifted_features[column] * factor
    drifted_features[column] = scaled_values

    class_probabilities = pd.DataFrame(
        model.predict_proba(drifted_features),
        columns=['0_prob', '1_prob'],
    ).reset_index(drop=True)
    positive_proba = class_probabilities['1_prob']

    # NOTE(review): the third argument of calculate_probability_lift is passed
    # as an empty string; its meaning is defined in modeling.evaluate.
    summary = {
        'scenario': [f'{column}_{factor}_shift'],
        'log_loss': [log_loss(y_test, positive_proba)],
        'proba_lift': [calculate_probability_lift(y_test, positive_proba, '')],
    }
    return pd.DataFrame(summary)
def run_multivariate_drift_scenario(model, y_test, x_test, column_list, change_obs, factor, scenario_name):
    """Score the model after scaling several columns on a random subset of rows.

    Args:
        model: Fitted estimator exposing ``predict_proba`` with a two-column
            output.
        y_test: True labels aligned with the rows of ``x_test``.
        x_test: Feature DataFrame; it is copied, so the caller's frame is
            left unchanged.
        column_list: Names of the columns to scale.
        change_obs: Number of randomly chosen rows to modify. Values larger
            than the number of rows simply select every row.
        factor: Multiplier applied to the selected cells.
        scenario_name: Label written to the ``scenario`` column of the result.

    Returns:
        A one-row DataFrame with the columns ``scenario``, ``log_loss`` and
        ``proba_lift``.
    """
    drifted_features = x_test.copy()

    # Draw the rows from NumPy's global RNG so that the module-level
    # np.random.seed call makes the selection reproducible; the stdlib
    # ``random`` module is not affected by that seed.
    shuffled_labels = np.random.permutation(drifted_features.index.to_numpy())
    selected_rows = drifted_features.index.isin(shuffled_labels[:change_obs])

    for column in column_list:
        current_values = drifted_features[column]
        # Assign the whole column at once instead of ``frame[col][row] = ...``:
        # chained assignment may write to a temporary (and never writes through
        # under pandas Copy-on-Write), which would leave the data undrifted.
        drifted_features[column] = current_values.where(
            ~selected_rows, current_values * factor
        )

    class_probabilities = pd.DataFrame(
        model.predict_proba(drifted_features),
        columns=['0_prob', '1_prob'],
    ).reset_index(drop=True)
    positive_proba = class_probabilities['1_prob']

    # NOTE(review): the third argument of calculate_probability_lift is passed
    # as an empty string; its meaning is defined in modeling.evaluate.
    return pd.DataFrame({
        'scenario': [scenario_name],
        'log_loss': [log_loss(y_test, positive_proba)],
        'proba_lift': [calculate_probability_lift(y_test, positive_proba, '')],
    })
def main():
    """Run the base case and every drift scenario and collect the results.

    Loads the persisted test features, test labels and model of one training
    run with ``joblib``, resets the index of both data objects, then
    evaluates:

    * the unmodified test set,
    * ``profile_score`` scaled by 1.01, 1.03, 1.05 and 1.10,
    * ``profile_score`` and ``average_stars`` scaled together on 10, 500,
      1,500 and 4,000 randomly chosen rows with increasing factors.

    Returns:
        A DataFrame with one row per scenario, in the order listed above, and
        the columns ``scenario``, ``log_loss`` and ``proba_lift``. Every row
        keeps the index label ``0`` of the one-row frame it came from.
    """
    # Single source of truth for the run directory used by all three artefacts.
    run_dir = 'modeling/random_forest_202107252202562870520500'
    x_test = joblib.load(f'{run_dir}/data/x_test.pkl').reset_index(drop=True)
    y_test = joblib.load(f'{run_dir}/data/y_test.pkl').reset_index(drop=True)
    model = joblib.load(f'{run_dir}/model/model.pkl')

    univariate_factors = (1.01, 1.03, 1.05, 1.10)
    multivariate_columns = ['profile_score', 'average_stars']
    # (rows to change, factor, scenario name)
    multivariate_settings = (
        (10, 1.01, 'small_multivariate'),
        (500, 1.05, 'moderate_multivariate'),
        (1_500, 1.10, 'strong_multivariate'),
        (4_000, 2.0, 'very_strong_multivariate'),
    )

    scenario_frames = [run_base_case(model, y_test, x_test)]
    for factor in univariate_factors:
        scenario_frames.append(
            run_univariate_drift_scenario(model, y_test, x_test, 'profile_score', factor)
        )
    for change_obs, factor, scenario_name in multivariate_settings:
        scenario_frames.append(
            run_multivariate_drift_scenario(
                model, y_test, x_test,
                multivariate_columns, change_obs, factor, scenario_name,
            )
        )

    # Concatenate once at the end rather than growing a frame from an empty
    # DataFrame, which re-copies all earlier rows on every step.
    return pd.concat(scenario_frames, axis=0)
if __name__ == "__main__":
    # Only run the (file-loading) comparison when executed as a script, and
    # print the resulting scenario table to stdout.
    comparison_df = main()
    print(comparison_df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment