Skip to content

Instantly share code, notes, and snippets.

@alejio
Created July 28, 2019 22:04
Show Gist options
  • Save alejio/65c2d6de970b1d959e47cd3fc95e9bd0 to your computer and use it in GitHub Desktop.
Save alejio/65c2d6de970b1d959e47cd3fc95e9bd0 to your computer and use it in GitHub Desktop.
Calculate lift
def lift_report(filename, septr='|', jitter_on=False):
    """Build a decile lift/gains report for a binary classifier's scores.

    Reads a delimited file containing columns ``actual`` (0/1 labels) and
    ``score`` (probabilities in [0, 1]), bins the scores into deciles, and
    computes per-decile counts, odds, lift, gains and K-S separation, plus
    overall AUC / Gini / K-S summary statistics.

    Parameters
    ----------
    filename : str
        Path to the delimited input file.
    septr : str
        Field separator used by the file (default ``'|'``).
    jitter_on : bool
        If True, add tiny zero-centred noise to the scores before binning;
        useful when many tied scores make ``pd.qcut`` fail on duplicate
        bin edges.

    Returns
    -------
    tuple
        ``(df, df_grp, df_grp_basic, scores)`` — the raw data (with a
        ``decile`` column added), the full decile report, a trimmed report
        (min/max/lift/gains), and a dict of summary scores.

    Exits via ``sys.exit(msg)`` (raising ``SystemExit``) when the input
    fails validation, matching the original behaviour.
    """
    # Imports are kept function-local, matching the gist's original style.
    import sys
    import numpy as np
    import pandas as pd
    from scipy.stats import rankdata

    df = pd.read_csv(filename, sep=septr)

    # --- Input validation -------------------------------------------------
    # Bug fix: the original used `&`, so the error only fired when BOTH
    # columns were missing; either one missing must be an error.
    if ('actual' not in df.columns) or ('score' not in df.columns):
        sys.exit('Input file should contain two columns named as "actual" '
                 'and "score"; instead got {}'.format(str(df.columns)))
    # Bug fix: labels must be exactly {0, 1}. The original
    # `(min!=0)&(max!=1)` let sets such as {0, 2} slip through because
    # min == 0 made the conjunction false.
    if set(df.actual.unique()) != {0, 1}:
        sys.exit('This function is only used for binary classifiers with '
                 '0/1 values; instead got {}'.format(sorted(df.actual.unique())))
    if (df.score.min() < 0) or (df.score.max() > 1):
        sys.exit('Scores should only have values [0,1]; '
                 'instead we have [{},{}]'.format(df.score.min(), df.score.max()))

    # --- Decile binning ---------------------------------------------------
    def jitter(a_series, noise_reduction=1000000):
        # Tiny uniform noise, centred on zero and scaled by the series'
        # std, to break ties so qcut can form 10 distinct bin edges.
        return (np.random.random(len(a_series)) * a_series.std()
                / noise_reduction) - (a_series.std() / (2 * noise_reduction))

    if jitter_on:
        df['decile'] = pd.qcut(df.score + jitter(df.score), 10)
    else:
        df['decile'] = pd.qcut(df.score, 10)

    # Within each score decile, collect min/max score and counts of
    # positives (Trues), negatives (Falses) and totals. Labels are
    # validated as {0,1} above, so sum == count of positives.
    grouped = df.groupby('decile', observed=False)
    trues = grouped.actual.sum()
    total = grouped.actual.count()
    df_grp = pd.DataFrame({
        'Decile Min': grouped.score.min(),
        'Decile Max': grouped.score.max(),
        'Trues': trues,
        'Falses': total - trues,
        'Total': total,
    }).reset_index(drop=True)

    # Highest-scoring decile first, so cumulative columns read top-down.
    # Bug fix: the original called sort_index(level=[...]) with COLUMN
    # names, which pandas rejects on a RangeIndex; sort_values is the
    # intended operation.
    df_grp = (df_grp.sort_values(['Decile Min', 'Decile Max'],
                                 ascending=False)).reset_index(drop=True)

    # Odds (#Falses / #Trues); may be inf when a decile has no Trues,
    # matching the original's behaviour.
    df_grp['Odds'] = df_grp.Falses / df_grp.Trues
    # Proportion of Trues within each decile, and its running average.
    df_grp['Trues Proportion'] = df_grp.Trues / df_grp.Total
    df_grp['Cumulative Trues Proportion'] = (df_grp.Trues.cumsum()
                                             / df_grp.Total.cumsum())
    # Lift: share of all Trues captured by each decile; Gains: cumulative.
    df_grp['Lift'] = df_grp.Trues / df_grp.Trues.sum()
    df_grp['Gains'] = df_grp['Lift'].cumsum()
    # Losses: cumulative share of Falses, needed for the K-S statistic.
    df_grp['Losses'] = (df_grp.Falses / df_grp.Falses.sum()).cumsum()
    # K-S: separation between cumulative Trues and cumulative Falses.
    df_grp['K-S'] = (df_grp.Gains - df_grp.Losses).abs()

    # Format results nicely (K-S keeps extra precision, as before).
    two_dp = ['Decile Min', 'Decile Max', 'Odds', 'Trues Proportion',
              'Cumulative Trues Proportion', 'Lift', 'Gains', 'Losses']
    df_grp[two_dp] = df_grp[two_dp].round(2)
    df_grp['K-S'] = df_grp['K-S'].round(4)
    df_grp_basic = df_grp[['Decile Min', 'Decile Max', 'Lift', 'Gains']]

    # --- Summary statistics ----------------------------------------------
    # AUC via the Mann-Whitney rank-sum identity, which is exactly
    # equivalent to sklearn.metrics.roc_auc_score for binary labels
    # (ties handled by average ranks) and drops the sklearn dependency.
    ranks = rankdata(df.score.to_numpy())
    n_pos = int(df.actual.sum())
    n_neg = len(df) - n_pos
    auc_score = (ranks[df.actual.to_numpy() == 1].sum()
                 - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)
    # Gini is a linear rescaling of AUC.
    gini_score = 2 * auc_score - 1
    # Maximum K-S separation across deciles.
    ks_score = df_grp['K-S'].max()
    scores = {'AUC score': auc_score, 'Gini score': gini_score,
              'K-S Score': ks_score}
    return df, df_grp, df_grp_basic, scores
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment