Created
July 28, 2019 22:04
-
-
Save alejio/65c2d6de970b1d959e47cd3fc95e9bd0 to your computer and use it in GitHub Desktop.
Calculate lift
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _exit_unless_valid(df):
    """Terminate with a descriptive message if `df` cannot be reported on.

    Mirrors the original behaviour of calling ``sys.exit(msg)``: a
    ``SystemExit`` carrying the message is raised on the first failed check.
    """
    # Either column being absent is fatal (not only both at once).
    if 'actual' not in df.columns or 'score' not in df.columns:
        raise SystemExit(
            'Input file should contain two columns named as "actual" and '
            '"score"; instead got {}'.format(str(df.columns)))

    # Exactly two distinct labels are required, and they must be 0 and 1.
    if (len(df['actual'].unique()) != 2
            or df['actual'].min() != 0
            or df['actual'].max() != 1):
        raise SystemExit(
            'This function is only used for binary classifiers with 0/1 '
            'values; instead got {}'.format(sorted(df['actual'].unique())))

    if df['score'].min() < 0 or df['score'].max() > 1:
        raise SystemExit(
            'Scores should only have values [0,1]; '
            'instead we have [{},{}]'.format(df['score'].min(),
                                             df['score'].max()))


def _auc(actual, score):
    """Return the ROC AUC of `score` against the 0/1 labels in `actual`.

    Uses scikit-learn when it is installed; otherwise falls back to the
    equivalent rank-sum (Mann-Whitney U) formula with average ranks for ties.
    """
    try:
        from sklearn.metrics import roc_auc_score
    except ImportError:
        positives = actual == 1
        n_pos = int(positives.sum())
        n_neg = len(actual) - n_pos
        rank_sum = score.rank(method='average')[positives].sum()
        return float((rank_sum - n_pos * (n_pos + 1) / 2.0) / (n_pos * n_neg))
    return roc_auc_score(actual, score)


def lift_report(filename, septr='|', jitter_on=False):
    """Build a decile lift/gains report for a scored binary classifier.

    Parameters
    ----------
    filename : path or buffer accepted by ``pandas.read_csv``
        Delimited file with an ``actual`` column (0/1 labels) and a ``score``
        column (values within [0, 1]).
    septr : str
        Field separator passed to ``pandas.read_csv``.
    jitter_on : bool
        When true, a tiny uniform perturbation (scaled by the score standard
        deviation) is added to the scores before binning, which helps when
        tied scores make the decile edges non-unique.

    Returns
    -------
    tuple ``(df, df_grp, df_grp_basic, scores)``
        ``df`` is the input data with an added ``decile`` column;
        ``df_grp`` has one row per decile, highest-scoring decile first;
        ``df_grp_basic`` is the ``Decile Min``/``Decile Max``/``Lift``/``Gains``
        subset of ``df_grp``; ``scores`` is a dict with ``'AUC score'``,
        ``'Gini score'`` and ``'K-S Score'``.

    Raises
    ------
    SystemExit
        If the required columns are missing, the labels are not exactly
        {0, 1}, or a score falls outside [0, 1].
    """
    import numpy as np
    import pandas as pd

    df = pd.read_csv(filename, sep=septr)
    _exit_unless_valid(df)

    # Assign each row to a score decile, optionally after jittering.
    if jitter_on:
        noise_reduction = 1000000
        spread = df['score'].std() / noise_reduction
        # Uniform noise centred on zero with total width `spread`.
        noise = np.random.random(len(df)) * spread - spread / 2
        df['decile'] = pd.qcut(df['score'] + noise, 10)
    else:
        df['decile'] = pd.qcut(df['score'], 10)

    # observed=False keeps one row per bin regardless of pandas version.
    by_decile = df.groupby('decile', observed=False)
    trues = by_decile['actual'].sum()
    totals = by_decile.size()

    df_grp = pd.DataFrame({'Decile Min': by_decile['score'].min()})
    df_grp['Decile Max'] = by_decile['score'].max()
    df_grp['Trues'] = trues
    df_grp['Falses'] = totals - trues
    df_grp['Total'] = totals

    # Highest-scoring decile first, ahead of the cumulative operations.
    df_grp = df_grp.sort_index(ascending=False).reset_index(drop=True)

    # Odds (#non-Trues / #Trues) and share of Trues inside each decile.
    df_grp['Odds'] = df_grp['Falses'] / df_grp['Trues']
    df_grp['Trues Proportion'] = df_grp['Trues'] / df_grp['Total']
    df_grp['Cumulative Trues Proportion'] = (df_grp['Trues'].cumsum()
                                             / df_grp['Total'].cumsum())
    # 'Lift' holds each decile's share of all Trues; 'Gains' accumulates it.
    df_grp['Lift'] = df_grp['Trues'] / df_grp['Trues'].sum()
    df_grp['Gains'] = df_grp['Lift'].cumsum()
    # Cumulative share of Falses, needed for the K-S separation.
    df_grp['Losses'] = (df_grp['Falses'] / df_grp['Falses'].sum()).cumsum()
    df_grp['K-S'] = (df_grp['Gains'] - df_grp['Losses']).abs()

    # Presentation rounding (K-S keeps four decimals, the rest two).
    df_grp = df_grp.round({
        'Decile Min': 2,
        'Decile Max': 2,
        'Odds': 2,
        'Trues Proportion': 2,
        'Cumulative Trues Proportion': 2,
        'Lift': 2,
        'Gains': 2,
        'Losses': 2,
        'K-S': 4,
    })
    df_grp_basic = df_grp[['Decile Min', 'Decile Max', 'Lift', 'Gains']]

    # Summary statistics: AUC, Gini (= 2*AUC - 1) and maximum K-S separation.
    auc_score = _auc(df['actual'], df['score'])
    gini_score = 2 * auc_score - 1
    ks_score = df_grp['K-S'].max()
    scores = {'AUC score': auc_score, 'Gini score': gini_score,
              'K-S Score': ks_score}
    return df, df_grp, df_grp_basic, scores
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment