# **Model Evaluation Measures**
# module-level imports so np/pd are visible inside the class methods
import numpy as np
import pandas as pd

class SP_Dev:
    def __init__(self, prob, resp):
        self.prob = prob
        self.resp = resp
    def Hosmer_Lemeshow(self, g):
        # Hosmer-Lemeshow: HL = sum over g groups of
        # (O1 - E1)^2 / E1 + (O0 - E0)^2 / E0, compared against chi-square with g-2 df
        from scipy.stats import chi2
        df = pd.DataFrame({'prob': self.prob, 'resp': self.resp})
        print("|======= Response Distribution =======|")
        print(df['resp'].value_counts())
        df.sort_values('prob', ascending=False, inplace=True)
        df['score_decile'] = pd.qcut(df['prob'], g)
        obsevents_pos = df['resp'].groupby(df.score_decile).sum()
        obsevents_neg = df['resp'].groupby(df.score_decile).count() - obsevents_pos
        expevents_pos = df['prob'].groupby(df.score_decile).sum()
        expevents_neg = df['prob'].groupby(df.score_decile).count() - expevents_pos
        cal_HL_statistics = (((obsevents_pos - expevents_pos) ** 2 / expevents_pos)
                             + ((obsevents_neg - expevents_neg) ** 2 / expevents_neg)).sum()
        print("|------------------------------------|")
        print(" Hosmer-Lemeshow Statistic Value")
        print(cal_HL_statistics)
        print("|------------------------------------|")
        print(" P Value")
        p_val = 1 - chi2.cdf(cal_HL_statistics, g - 2)  # g-2 degrees of freedom, not a hardcoded 8
        print(p_val)
        print("|------------------------------------|")
    def Decile(self):
        df1 = pd.DataFrame({'prob': self.prob, 'resp': self.resp})
        df1.sort_values('prob', ascending=False, inplace=True)
        df1['score_decile'] = pd.qcut(df1['prob'], 10)
        # qcut returns Interval objects; read the bin edges directly instead of
        # parsing their string representation
        df1['min_prob'] = df1['score_decile'].apply(lambda iv: iv.left).astype(float)
        df1['max_prob'] = df1['score_decile'].apply(lambda iv: iv.right).astype(float)
        p = np.sort(np.unique(df1.max_prob))
        # DEC=1 is the highest-score decile, DEC=10 the lowest
        df1['DEC'] = 1
        for i in range(2, 11):
            df1['DEC'] = np.where(df1.prob <= p[10 - i], i, df1['DEC'])
        return df1
    def KS(self):
        d = self.Decile()
        ks = pd.pivot_table(d, index='DEC', values='resp', aggfunc=[len, sum])
        ks.columns = ['N', 'response']
        ks['decile'] = ks.index
        ks.reset_index(drop=True, inplace=True)
        ks['non_resp'] = ks.N - ks.response
        ks['resp_rate'] = ks.response / ks.N
        ks['non_resp_rate'] = ks.non_resp / ks.N
        ks['resp_per'] = ks.response / ks.response.sum() * 100
        ks['non_resp_per'] = ks.non_resp / ks.non_resp.sum() * 100
        ks['cum_resp_per'] = np.cumsum(ks.resp_per)
        ks['cum_non_resp_per'] = np.cumsum(ks.non_resp_per)  # cumulative %, not a rate
        # KS per decile = gap between cumulative response and non-response %
        ks['KS_value'] = round((ks.cum_resp_per - ks.cum_non_resp_per) / 100, 2)
        ks['Lift'] = (ks.cum_resp_per / 100) / (np.cumsum(ks.N) / ks.N.sum())
        return ks
    def Concordance(self):
        # compare every (event, non-event) pair of predicted probabilities
        df2 = pd.DataFrame({'prob': self.prob, 'resp': self.resp})
        Event = df2.loc[df2.resp == 1]
        Non_Event = df2.loc[df2.resp == 0]
        Pairs = 0
        Conc = 0
        Disc = 0
        Ties = 0
        for i in Event.prob:
            for j in Non_Event.prob:
                Pairs += 1
                if i > j:
                    Conc += 1
                elif i < j:
                    Disc += 1
                else:
                    Ties += 1
        print("-----------------------------------------------------------")
        print(" Total Pairs :", Pairs)
        print(" Percentage of Concordance :", round(Conc / Pairs * 100, 2), "%")
        print(" Percentage of Discordance :", round(Disc / Pairs * 100, 2), "%")
        print(" Percentage of Ties :", round(Ties / Pairs * 100, 2), "%")
        print("-----------------------------------------------------------")
def ScoreBand(prob, train_resp, score):
    # apply the training-data decile cutoffs to a new score vector
    k = SP_Dev(prob, train_resp)
    new = k.Decile()
    p = np.sort(np.unique(new.max_prob))
    df3 = pd.DataFrame({'p_prob': score})
    df3['DEC'] = 1
    for i in range(2, 11):
        df3['DEC'] = np.where(df3.p_prob <= p[10 - i], i, df3['DEC'])
    return df3
class SP_Val:
    def __init__(self, prob, train_resp, score, test_resp):
        self.prob = prob
        self.train_resp = train_resp
        self.score = score
        self.test_resp = test_resp
    def KS(self):
        d = ScoreBand(self.prob, self.train_resp, self.score)
        resp = self.test_resp.reset_index(drop=True)  # returns a copy; avoids mutating the caller's series
        d['resp'] = resp
        ks = pd.pivot_table(d, index='DEC', values='resp', aggfunc=[len, sum])
        ks.columns = ['N', 'response']
        ks['decile'] = ks.index
        ks.reset_index(drop=True, inplace=True)
        ks['non_resp'] = ks.N - ks.response
        ks['resp_rate'] = ks.response / ks.N
        ks['non_resp_rate'] = ks.non_resp / ks.N
        ks['resp_per'] = ks.response / ks.response.sum() * 100
        ks['non_resp_per'] = ks.non_resp / ks.non_resp.sum() * 100
        ks['cum_resp_per'] = np.cumsum(ks.resp_per)
        ks['cum_non_resp_per'] = np.cumsum(ks.non_resp_per)
        ks['KS_value'] = round((ks.cum_resp_per - ks.cum_non_resp_per) / 100, 2)
        ks['Lift'] = (ks.cum_resp_per / 100) / (np.cumsum(ks.N) / ks.N.sum())
        return ks
    def PSI(self):
        # PSI = sum over deciles of (Actual% - Expected%) * ln(Actual% / Expected%)
        l = SP_Dev(self.prob, self.train_resp)
        Dev = l.Decile()
        E = round(Dev.DEC.value_counts() / len(Dev.DEC) * 100, 2)
        Sc = ScoreBand(self.prob, self.train_resp, self.score)  # was the bare name train_resp (NameError)
        A = round(Sc.DEC.value_counts() / len(Sc.DEC) * 100, 2)
        A1 = pd.DataFrame({'A': A, 'DEC': A.index})
        E1 = pd.DataFrame({'E': E, 'DEC': E.index})
        P = A1.merge(E1, on='DEC', how='inner')
        sub = P.A - P.E
        ln = np.log(P.A / P.E)
        P['PSI_val'] = sub * ln
        P.sort_values('DEC', ascending=True, inplace=True)
        t = sum(P.PSI_val) / 100
        print("-----------------------------------------------------------")
        print("PSI Value :", t)
        if t < 0.1:
            print("Green: No action required")
        elif t < 0.25:
            print("Orange: Check other scorecard monitoring metrics")
        else:
            print("Red: Need to redevelop the model")
        print("-----------------------------------------------------------")
        return P
# **Bank Marketing Data Set**
# http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
# The data relates to direct marketing campaigns (phone calls) of a Portuguese banking institution.
# The classification goal is to predict whether the client will subscribe to a term deposit (variable y).
bank=pd.read_csv("bank_full.csv",sep=";")
type(bank)
bank.shape # dimensions of the dataset
bank.shape[0] # rows
bank.shape[1] # columns
bank.columns # column names
bank.dtypes
bank['y'].value_counts()
bank['y']=np.where(bank['y']=='yes',1,0)
bank['y'].value_counts()
cat_vars=bank.select_dtypes(['object']).columns
cat_vars
# create dummy variables for the categorical columns
for col in cat_vars:
    dummy = pd.get_dummies(bank[col], drop_first=True, prefix=col)
    bank = pd.concat([bank, dummy], axis=1)
    del bank[col]
    print(col)
del dummy
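# Sanity check (an addition, not in the original gist): after the loop,
# no object-typed columns should remain.
assert bank.select_dtypes(['object']).empty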
from sklearn.model_selection import train_test_split
bk_train,bk_test=train_test_split(bank,test_size=0.25,random_state=1)
print(bk_train.shape)
print(bk_test.shape)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
logr=LogisticRegression(class_weight='balanced')
x_train=bk_train.drop('y',axis=1)
y_train=bk_train['y']
x_test=bk_test.drop('y',axis=1)
y_test=bk_test['y']
logr.fit(x_train,y_train)
train_score=logr.predict_proba(x_train)[:,1]
train_resp=y_train
test_resp=y_test
test_score=logr.predict_proba(x_test)[:,1]
roc_auc_score(y_test,test_score)
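# The AUC above relates to the pairwise counts in Concordance() as
# AUC = (concordant + 0.5 * tied) / total pairs. A quick cross-check using the
# vectorized helper sketched earlier (an addition, not in the original gist):
conc_pct, disc_pct, ties_pct = concordance_fast(test_score, y_test)
print("AUC from pairs:", (conc_pct + 0.5 * ties_pct) / 100)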
k1=SP_Dev(train_score,train_resp)
k2=SP_Val(train_score,train_resp,test_score,y_test)
ks=k1.KS()
ks
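# The headline KS statistic (added for convenience) is the maximum gap between
# the cumulative response and non-response percentages across deciles:
print("KS statistic:", ks.KS_value.max())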
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ks.decile, ks.Lift, color='red', linewidth=3)
#ax.bar(ks.decile,ks.Lift,color='blue')
ax.set(title='Lift Chart', ylabel='Lift', xlabel='Decile')
plt.show()
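# The remaining measures can be exercised the same way (a usage sketch; these
# calls are not in the original gist):
k1.Hosmer_Lemeshow(10)   # calibration: chi-square statistic and p-value
# the pairwise loop in k1.Concordance() is slow at this sample size; the
# vectorized sketch gives the same percentages much faster
print(concordance_fast(train_score, train_resp))
val_ks = k2.KS()         # out-of-sample KS / lift table
psi_table = k2.PSI()     # population stability index with traffic-light bands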