Last active
March 1, 2022 02:30
-
-
Save accessnash/faa3d44b768246ccf2ad3a83d5cd58b7 to your computer and use it in GitHub Desktop.
Building a PD model using loans data : Part I - data cleaning from original data set, choosing variables for building PD model and estimation ; Part II: Out-of-sample PD model validation, creating score cards from PD model and setting cut-offs for approval/rejection
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Mon Dec 10 19:05:13 2020 | |
@author: User | |
""" | |
#Part I: Data cleaning, choosing variables & PD Model estimation | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
sns.set() | |
loan_data_backup = pd.read_csv('loan_data_2007_2014.csv') | |
loan_data = loan_data_backup.copy() | |
pd.options.display.max_columns = None | |
loan_data.columns.values | |
loan_data.info() | |
#Pre-processing the 'emp_length' variable | |
loan_data['emp_length'].unique() | |
loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '') | |
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0)) | |
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0)) | |
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '') | |
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '') | |
type(loan_data['emp_length_int'][0]) | |
loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int']) | |
#Pre-processing the 'term' variable | |
loan_data['term'].describe() | |
loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', '')) | |
#Pre-processing the 'earliest_cr_line' variable | |
loan_data['earliest_cr_line'].describe() | |
loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y') | |
type(loan_data['earliest_cr_line_date'][0]) | |
loan_data['months_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date'])/np.timedelta64(1, 'M') )) | |
loan_data['months_since_earliest_cr_line'].describe() | |
loan_data.loc[:, ['earliest_cr_line', 'earliest_cr_line_date', 'months_since_earliest_cr_line']][loan_data['months_since_earliest_cr_line'] < 0] | |
loan_data['months_since_earliest_cr_line'][loan_data['months_since_earliest_cr_line'] < 0] = loan_data['months_since_earliest_cr_line'].max() # this step is needed to remove the negative values | |
min(loan_data['months_since_earliest_cr_line']) | |
#Pre-processing the 'issue_d' variable | |
loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y') | |
loan_data['months_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date'])/np.timedelta64(1, 'M') )) | |
loan_data['months_since_issue_d'].describe() | |
# Pre-processing discrete variables | |
loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'), | |
pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'), | |
pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'), | |
pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'), | |
pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'), | |
pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'), | |
pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'), | |
pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':') ] | |
loan_data_dummies = pd.concat(loan_data_dummies, axis = 1) | |
type(loan_data_dummies) | |
loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1) | |
# Check for missing values & clean | |
loan_data.isnull() | |
pd.options.display.max_rows = None | |
loan_data.isnull().sum() | |
loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace = True) | |
loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace = True) | |
loan_data['months_since_earliest_cr_line'].fillna(0, inplace = True) | |
loan_data['acc_now_delinq'].fillna(0, inplace = True) | |
loan_data['total_acc'].fillna(0, inplace = True) | |
loan_data['pub_rec'].fillna(0, inplace = True) | |
loan_data['open_acc'].fillna(0, inplace = True) | |
loan_data['inq_last_6mths'].fillna(0, inplace = True) | |
loan_data['delinq_2yrs'].fillna(0, inplace = True) | |
loan_data['emp_length_int'].fillna(0, inplace = True) | |
# Good/Bad definition for dependent variable | |
loan_data['loan_status'].unique() | |
loan_data['loan_status'].value_counts() | |
loan_data['loan_status'].value_counts() / loan_data['loan_status'].count() | |
loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default', | |
'Does not meet the credit policy. Status:Charged Off', | |
'Late (31-120 days)']), 0, 1) | |
#Data preparation | |
from sklearn.model_selection import train_test_split | |
loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'], test_size = 0.2, random_state = 42) | |
#df_inputs_prepr = loan_data_inputs_train | |
#df_targets_prepr = loan_data_targets_train | |
df_inputs_prepr = loan_data_inputs_test | |
df_targets_prepr = loan_data_targets_test | |
def woe_discrete(df, discrete_variable_name, good_bad_variable_df): | |
df = pd.concat([df[discrete_variable_name], good_bad_variable_df], axis = 1) | |
df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(), | |
df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1) | |
df = df.iloc[:, [0,1,3]] | |
df.columns = [df.columns.values[0], 'n_obs', 'prop_good'] | |
df['prop_n_obs'] = df['n_obs']/df['n_obs'].sum() | |
df['n_good'] = df['prop_good'] * df['n_obs'] | |
df['n_bad'] = (1 - df['prop_good']) * df['n_obs'] | |
df['prop_n_good'] = df['n_good']/ df['n_good'].sum() | |
df['prop_n_bad'] = df['n_bad']/ df['n_bad'].sum() | |
df['WOE'] = np.log(df['prop_n_good']/df['prop_n_bad']) | |
df = df.sort_values(['WOE']) | |
df = df.reset_index(drop = True) | |
df['diff_prop_good'] = df['prop_good'].diff().abs() | |
df['diff_WOE'] = df['WOE'].diff().abs() | |
df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WOE'] | |
df['IV'] = df['IV'].sum() | |
return df | |
df_temp = woe_discrete(df_inputs_prepr, 'grade', df_targets_prepr) | |
df_temp | |
def plot_by_woe(df_WoE, rotation_of_x_axis_labels = 0): | |
x = np.array(df_WoE.iloc[:, 0].apply(str)) | |
y = df_WoE['WOE'] | |
plt.figure(figsize = (18, 6)) | |
plt.plot(x, y, marker = 'o', linestyle = '--', color = 'k') | |
plt.xlabel(df_WoE.columns[0]) | |
plt.ylabel('Weight of Evidence') | |
plt.title(str('Weight of Evidence by ' + df_WoE.columns[0])) | |
plt.xticks(rotation = rotation_of_x_axis_labels) | |
plot_by_woe(df_temp) | |
df_temp = woe_discrete(df_inputs_prepr, 'home_ownership', df_targets_prepr) | |
df_temp | |
plot_by_woe(df_temp) | |
df_inputs_prepr['home_ownership:RENT_OTHER_NONE_ANY'] = sum([df_inputs_prepr['home_ownership:RENT'], df_inputs_prepr['home_ownership:OTHER'], | |
df_inputs_prepr['home_ownership:NONE'], df_inputs_prepr['home_ownership:ANY']]) | |
df_temp = woe_discrete(df_inputs_prepr, 'addr_state', df_targets_prepr) | |
df_temp | |
plot_by_woe(df_temp) | |
if ['addr_state:ND'] in df_inputs_prepr.columns.values: | |
pass | |
else: | |
df_inputs_prepr['addr_state:ND'] = 0 | |
plot_by_woe(df_temp.iloc[2: -2, :]) #removing Nebraska and Iowa as they've very few observations & probably the reason for the low WoE. Also Maine & Idaho which have no info | |
plot_by_woe(df_temp.iloc[6: -6, :]) | |
df_inputs_prepr['addr_state:ND_NE_IA_NV_FL_HI_AL'] = sum([df_inputs_prepr['addr_state:ND'], df_inputs_prepr['addr_state:NE'], df_inputs_prepr['addr_state:IA'], | |
df_inputs_prepr['addr_state:NV'], df_inputs_prepr['addr_state:FL'], df_inputs_prepr['addr_state:HI'], | |
df_inputs_prepr['addr_state:AL']]) | |
df_inputs_prepr['addr_state:NM_VA'] = sum([df_inputs_prepr['addr_state:NM'], df_inputs_prepr['addr_state:VA']]) | |
df_inputs_prepr['addr_state:OK_TN_MO_LA_MD_NC'] = sum([df_inputs_prepr['addr_state:OK'], df_inputs_prepr['addr_state:TN'], df_inputs_prepr['addr_state:MO'], | |
df_inputs_prepr['addr_state:LA'], df_inputs_prepr['addr_state:MD'], df_inputs_prepr['addr_state:NC']]) | |
df_inputs_prepr['addr_state:UT_KY_AZ_NJ'] = sum([df_inputs_prepr['addr_state:UT'], df_inputs_prepr['addr_state:KY'], | |
df_inputs_prepr['addr_state:AZ'], df_inputs_prepr['addr_state:NJ']]) | |
df_inputs_prepr['addr_state:AR_MI_PA_OH_MN'] = sum([df_inputs_prepr['addr_state:AR'], df_inputs_prepr['addr_state:MI'], df_inputs_prepr['addr_state:PA'], | |
df_inputs_prepr['addr_state:OH'], df_inputs_prepr['addr_state:MN']]) | |
df_inputs_prepr['addr_state:RI_MA_DE_SD_IN'] = sum([df_inputs_prepr['addr_state:RI'], df_inputs_prepr['addr_state:MA'], df_inputs_prepr['addr_state:DE'], | |
df_inputs_prepr['addr_state:SD'], df_inputs_prepr['addr_state:IN']]) | |
df_inputs_prepr['addr_state:GA_WA_OR'] = sum([df_inputs_prepr['addr_state:GA'], df_inputs_prepr['addr_state:WA'], df_inputs_prepr['addr_state:OR']]) | |
df_inputs_prepr['addr_state:WI_MT'] = sum([df_inputs_prepr['addr_state:WI'], df_inputs_prepr['addr_state:MT']]) | |
df_inputs_prepr['addr_state:IL_CT'] = sum([df_inputs_prepr['addr_state:IL'], df_inputs_prepr['addr_state:CT']]) | |
df_inputs_prepr['addr_state:KS_SC_CO_VT_AK_MS'] = sum([df_inputs_prepr['addr_state:KS'], df_inputs_prepr['addr_state:SC'], df_inputs_prepr['addr_state:CO'], | |
df_inputs_prepr['addr_state:VT'], df_inputs_prepr['addr_state:AK'], df_inputs_prepr['addr_state:MS']]) | |
df_inputs_prepr['addr_state:WV_NH_WY_DC_ME_ID'] = sum([df_inputs_prepr['addr_state:WV'], df_inputs_prepr['addr_state:NH'], df_inputs_prepr['addr_state:WY'], | |
df_inputs_prepr['addr_state:DC'], df_inputs_prepr['addr_state:ME'], df_inputs_prepr['addr_state:ID']]) | |
df_temp = woe_discrete(df_inputs_prepr, 'verification_status', df_targets_prepr) | |
df_temp | |
plot_by_woe(df_temp) | |
df_temp = woe_discrete(df_inputs_prepr, 'purpose', df_targets_prepr) | |
df_temp | |
plot_by_woe(df_temp) | |
df_inputs_prepr['purpose:educ__sm_b__wedd__ren_en__mov__house'] = sum([df_inputs_prepr['purpose:educational'], df_inputs_prepr['purpose:small_business'], | |
df_inputs_prepr['purpose:wedding'], df_inputs_prepr['purpose:renewable_energy'], | |
df_inputs_prepr['purpose:moving'], df_inputs_prepr['purpose:house']]) | |
df_inputs_prepr['purpose:oth__med__vacation'] = sum([df_inputs_prepr['purpose:other'], df_inputs_prepr['purpose:medical'], | |
df_inputs_prepr['purpose:vacation']]) | |
df_inputs_prepr['purpose:major_purch__car__home_impr'] = sum([df_inputs_prepr['purpose:major_purchase'], df_inputs_prepr['purpose:car'], | |
df_inputs_prepr['purpose:home_improvement']]) | |
df_temp = woe_discrete(df_inputs_prepr, 'initial_list_status', df_targets_prepr) | |
df_temp | |
plot_by_woe(df_temp) | |
#Preprocessing continuous variables | |
def woe_ordered_continuous(df, continuous_variable_name, good_bad_variable_df): | |
df = pd.concat([df[continuous_variable_name], good_bad_variable_df], axis = 1) | |
df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(), | |
df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1) | |
df = df.iloc[:, [0,1,3]] | |
df.columns = [df.columns.values[0], 'n_obs', 'prop_good'] | |
df['prop_n_obs'] = df['n_obs']/df['n_obs'].sum() | |
df['n_good'] = df['prop_good'] * df['n_obs'] | |
df['n_bad'] = (1 - df['prop_good']) * df['n_obs'] | |
df['prop_n_good'] = df['n_good']/ df['n_good'].sum() | |
df['prop_n_bad'] = df['n_bad']/ df['n_bad'].sum() | |
df['WOE'] = np.log(df['prop_n_good']/df['prop_n_bad']) | |
df['diff_prop_good'] = df['prop_good'].diff().abs() | |
df['diff_WOE'] = df['WOE'].diff().abs() | |
df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WOE'] | |
df['IV'] = df['IV'].sum() | |
return df | |
df_inputs_prepr['term_int'].unique() | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'term_int', df_targets_prepr) | |
plot_by_woe(df_temp) | |
df_inputs_prepr['term:36'] = np.where((df_inputs_prepr['term_int']==36), 1, 0) | |
df_inputs_prepr['term:60'] = np.where((df_inputs_prepr['term_int']==60), 1, 0) | |
df_inputs_prepr['emp_length_int'].unique() | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'emp_length_int', df_targets_prepr) | |
plot_by_woe(df_temp) | |
df_inputs_prepr['emp_length:0'] = np.where(df_inputs_prepr['emp_length_int'].isin([0]), 1, 0) | |
df_inputs_prepr['emp_length:1'] = np.where(df_inputs_prepr['emp_length_int'].isin([1]), 1, 0) | |
df_inputs_prepr['emp_length:2-4'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(2,5)), 1, 0) | |
df_inputs_prepr['emp_length:5-6'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(5, 7)), 1, 0) | |
df_inputs_prepr['emp_length:7-9'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(7, 10)), 1, 0) | |
df_inputs_prepr['emp_length:10'] = np.where(df_inputs_prepr['emp_length_int'].isin([10]), 1, 0) | |
df_inputs_prepr['months_since_issue_d'].unique() | |
df_inputs_prepr['months_since_issue_d_factor'] = pd.cut(df_inputs_prepr['months_since_issue_d'], 50) | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'months_since_issue_d_factor', df_targets_prepr) | |
plot_by_woe(df_temp, 90) | |
plot_by_woe(df_temp.iloc[3: , :], 90) | |
df_inputs_prepr['months_since_issue_d:<38'] = np.where(df_inputs_prepr['months_since_issue_d'].isin(range(38)), 1, 0) | |
df_inputs_prepr['months_since_issue_d:38-39'] = np.where(df_inputs_prepr['months_since_issue_d'].isin(range(38, 40)), 1, 0) | |
df_inputs_prepr['months_since_issue_d:40-41'] = np.where(df_inputs_prepr['months_since_issue_d'].isin(range(40, 42)), 1, 0) | |
df_inputs_prepr['months_since_issue_d:42-48'] = np.where(df_inputs_prepr['months_since_issue_d'].isin(range(42, 49)), 1, 0) | |
df_inputs_prepr['months_since_issue_d:49-52'] = np.where(df_inputs_prepr['months_since_issue_d'].isin(range(49, 53)), 1, 0) | |
df_inputs_prepr['months_since_issue_d:53-64'] = np.where(df_inputs_prepr['months_since_issue_d'].isin(range(53, 65)), 1, 0) | |
df_inputs_prepr['months_since_issue_d:65-84'] = np.where(df_inputs_prepr['months_since_issue_d'].isin(range(65, 85)), 1, 0) | |
df_inputs_prepr['months_since_issue_d:>84'] = np.where(df_inputs_prepr['months_since_issue_d'].isin(range(85, int(df_inputs_prepr['months_since_issue_d'].max()))), 1, 0) | |
df_inputs_prepr['int_rate_factor'] = pd.cut(df_inputs_prepr['int_rate'], 50) | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'int_rate_factor', df_targets_prepr) | |
plot_by_woe(df_temp, 90) | |
df_inputs_prepr['int_rate:<9.548'] = np.where((df_inputs_prepr['int_rate'] <= 9.548), 1, 0) | |
df_inputs_prepr['int_rate:9.548-12.025'] = np.where((df_inputs_prepr['int_rate'] > 9.548) & (df_inputs_prepr['int_rate'] <= 12.025), 1, 0) | |
df_inputs_prepr['int_rate:12.025-15.74'] = np.where((df_inputs_prepr['int_rate'] > 12.025) & (df_inputs_prepr['int_rate'] <= 15.74), 1, 0) | |
df_inputs_prepr['int_rate:15.74-20.281'] = np.where((df_inputs_prepr['int_rate'] > 15.74) & (df_inputs_prepr['int_rate'] <= 20.281), 1, 0) | |
df_inputs_prepr['int_rate:>20.281'] = np.where((df_inputs_prepr['int_rate'] > 20.281), 1, 0) | |
df_inputs_prepr['funded_amnt_factor'] = pd.cut(df_inputs_prepr['funded_amnt'], 50) | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'funded_amnt_factor', df_targets_prepr) | |
plot_by_woe(df_temp, 90) | |
df_inputs_prepr['months_since_earliest_cr_line'].unique() | |
df_inputs_prepr['months_since_earliest_cr_line_factor'] = pd.cut(df_inputs_prepr['months_since_earliest_cr_line'], 50) | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'months_since_earliest_cr_line_factor', df_targets_prepr) | |
plot_by_woe(df_temp, 90) | |
df_inputs_prepr['months_since_earliest_cr_line:<140'] = np.where(df_inputs_prepr['months_since_earliest_cr_line'].isin(range(140)), 1, 0) | |
df_inputs_prepr['months_since_earliest_cr_line:141-164'] = np.where(df_inputs_prepr['months_since_earliest_cr_line'].isin(range(140, 165)), 1, 0) | |
df_inputs_prepr['months_since_earliest_cr_line:165-247'] = np.where(df_inputs_prepr['months_since_earliest_cr_line'].isin(range(165, 248)), 1, 0) | |
df_inputs_prepr['months_since_earliest_cr_line:248-270'] = np.where(df_inputs_prepr['months_since_earliest_cr_line'].isin(range(248, 271)), 1, 0) | |
df_inputs_prepr['months_since_earliest_cr_line:271-352'] = np.where(df_inputs_prepr['months_since_earliest_cr_line'].isin(range(271, 353)), 1, 0) | |
df_inputs_prepr['months_since_earliest_cr_line:>352'] = np.where(df_inputs_prepr['months_since_earliest_cr_line'].isin(range(353, int(df_inputs_prepr['months_since_earliest_cr_line'].max()))), 1, 0) | |
df_inputs_prepr['installment'].unique() | |
df_inputs_prepr['installment_factor'] = pd.cut(df_inputs_prepr['installment'], 50) | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'installment_factor', df_targets_prepr) | |
plot_by_woe(df_temp, 90) | |
df_inputs_prepr['delinq_2yrs'].unique() | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'delinq_2yrs', df_targets_prepr) | |
plot_by_woe(df_temp, 90) | |
df_inputs_prepr['delinq_2yrs:0'] = np.where((df_inputs_prepr['delinq_2yrs'] == 0), 1, 0) | |
df_inputs_prepr['delinq_2yrs:1-3'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 1) & (df_inputs_prepr['delinq_2yrs'] <= 3), 1, 0) | |
df_inputs_prepr['delinq_2yrs:>=4'] = np.where((df_inputs_prepr['delinq_2yrs'] > 9), 1, 0) | |
df_inputs_prepr['inq_last_6mths'].unique() | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'inq_last_6mths', df_targets_prepr) | |
plot_by_woe(df_temp, 90) | |
df_inputs_prepr['inq_last_6mths:0'] = np.where((df_inputs_prepr['inq_last_6mths'] == 0), 1, 0) | |
df_inputs_prepr['inq_last_6mths:1-2'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 1) & (df_inputs_prepr['inq_last_6mths'] <= 2), 1, 0) | |
df_inputs_prepr['inq_last_6mths:3-6'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 3) & (df_inputs_prepr['inq_last_6mths'] <= 6), 1, 0) | |
df_inputs_prepr['inq_last_6mths:>=6'] = np.where((df_inputs_prepr['inq_last_6mths'] > 6), 1, 0) | |
df_inputs_prepr['open_acc'].unique() | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'open_acc', df_targets_prepr) | |
plot_by_woe(df_temp, 90) | |
plot_by_woe(df_temp.iloc[:40, :], 90) | |
df_inputs_prepr['open_acc:0'] = np.where((df_inputs_prepr['open_acc'] == 0), 1, 0) | |
df_inputs_prepr['open_acc:1-3'] = np.where((df_inputs_prepr['open_acc'] >= 1) & (df_inputs_prepr['open_acc'] <= 3), 1, 0) | |
df_inputs_prepr['open_acc:4-12'] = np.where((df_inputs_prepr['open_acc'] >= 4) & (df_inputs_prepr['open_acc'] <= 12), 1, 0) | |
df_inputs_prepr['open_acc:13-17'] = np.where((df_inputs_prepr['open_acc'] >= 13) & (df_inputs_prepr['open_acc'] <= 17), 1, 0) | |
df_inputs_prepr['open_acc:18-22'] = np.where((df_inputs_prepr['open_acc'] >= 18) & (df_inputs_prepr['open_acc'] <= 22), 1, 0) | |
df_inputs_prepr['open_acc:23-25'] = np.where((df_inputs_prepr['open_acc'] >= 23) & (df_inputs_prepr['open_acc'] <= 25), 1, 0) | |
df_inputs_prepr['open_acc:26-30'] = np.where((df_inputs_prepr['open_acc'] >= 26) & (df_inputs_prepr['open_acc'] <= 30), 1, 0) | |
df_inputs_prepr['open_acc:>=31'] = np.where((df_inputs_prepr['open_acc'] >= 31), 1, 0) | |
df_inputs_prepr['pub_rec'].unique() | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'pub_rec', df_targets_prepr) | |
plot_by_woe(df_temp, 90) | |
df_inputs_prepr['pub_rec:0-2'] = np.where((df_inputs_prepr['pub_rec'] >= 0) & (df_inputs_prepr['pub_rec'] <= 2), 1, 0) | |
df_inputs_prepr['pub_rec:3-4'] = np.where((df_inputs_prepr['pub_rec'] >= 3) & (df_inputs_prepr['pub_rec'] <= 4), 1, 0) | |
df_inputs_prepr['pub_rec:>=5'] = np.where((df_inputs_prepr['pub_rec'] >= 5), 1, 0) | |
df_inputs_prepr['total_acc'].unique() | |
df_inputs_prepr['total_acc_factor'] = pd.cut(df_inputs_prepr['total_acc'], 50) | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'total_acc_factor', df_targets_prepr) | |
plot_by_woe(df_temp, 90) | |
df_inputs_prepr['total_acc:<=27'] = np.where((df_inputs_prepr['total_acc'] <= 27), 1, 0) | |
df_inputs_prepr['total_acc:28-51'] = np.where((df_inputs_prepr['total_acc'] >= 28) & (df_inputs_prepr['total_acc'] <= 51), 1, 0) | |
df_inputs_prepr['total_acc:>=52'] = np.where((df_inputs_prepr['total_acc'] >= 52), 1, 0) | |
df_inputs_prepr['acc_now_delinq'].unique() | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'acc_now_delinq', df_targets_prepr) | |
plot_by_woe(df_temp, 90) | |
df_inputs_prepr['acc_now_delinq:0'] = np.where((df_inputs_prepr['acc_now_delinq'] == 0), 1, 0) | |
df_inputs_prepr['acc_now_delinq:>=1'] = np.where((df_inputs_prepr['acc_now_delinq'] >= 1), 1, 0) | |
df_inputs_prepr['annual_inc'].unique() | |
df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 100) | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_targets_prepr) | |
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['annual_inc'] <=140000, :] | |
df_inputs_prepr_temp['annual_inc_factor'] = pd.cut(df_inputs_prepr_temp['annual_inc'], 50) | |
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'annual_inc_factor', df_targets_prepr[df_inputs_prepr_temp.index]) | |
plot_by_woe(df_temp, 90) | |
df_inputs_prepr['annual_inc:<20K'] = np.where((df_inputs_prepr['annual_inc'] <= 20000), 1, 0) | |
df_inputs_prepr['annual_inc:20K-30K'] = np.where((df_inputs_prepr['annual_inc'] > 20000) & (df_inputs_prepr['annual_inc'] <= 30000), 1, 0) | |
df_inputs_prepr['annual_inc:30K-40K'] = np.where((df_inputs_prepr['annual_inc'] > 30000) & (df_inputs_prepr['annual_inc'] <= 40000), 1, 0) | |
df_inputs_prepr['annual_inc:40K-50K'] = np.where((df_inputs_prepr['annual_inc'] > 40000) & (df_inputs_prepr['annual_inc'] <= 50000), 1, 0) | |
df_inputs_prepr['annual_inc:50K-60K'] = np.where((df_inputs_prepr['annual_inc'] > 50000) & (df_inputs_prepr['annual_inc'] <= 60000), 1, 0) | |
df_inputs_prepr['annual_inc:60K-70K'] = np.where((df_inputs_prepr['annual_inc'] > 60000) & (df_inputs_prepr['annual_inc'] <= 70000), 1, 0) | |
df_inputs_prepr['annual_inc:70K-80K'] = np.where((df_inputs_prepr['annual_inc'] > 70000) & (df_inputs_prepr['annual_inc'] <= 80000), 1, 0) | |
df_inputs_prepr['annual_inc:80K-90K'] = np.where((df_inputs_prepr['annual_inc'] > 80000) & (df_inputs_prepr['annual_inc'] <= 90000), 1, 0) | |
df_inputs_prepr['annual_inc:90K-100K'] = np.where((df_inputs_prepr['annual_inc'] > 90000) & (df_inputs_prepr['annual_inc'] <= 100000), 1, 0) | |
df_inputs_prepr['annual_inc:100K-120K'] = np.where((df_inputs_prepr['annual_inc'] > 100000) & (df_inputs_prepr['annual_inc'] <= 120000), 1, 0) | |
df_inputs_prepr['annual_inc:120K-140K'] = np.where((df_inputs_prepr['annual_inc'] > 120000) & (df_inputs_prepr['annual_inc'] <= 140000), 1, 0) | |
df_inputs_prepr['annual_inc:>140K'] = np.where((df_inputs_prepr['annual_inc'] > 140000), 1, 0) | |
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_delinq'])] | |
df_inputs_prepr_temp['mths_since_last_delinq_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_delinq'], 50) | |
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_delinq_factor', df_targets_prepr[df_inputs_prepr_temp.index]) | |
plot_by_woe(df_temp, 90) | |
df_inputs_prepr['mths_since_last_delinq:Missing'] = np.where((df_inputs_prepr['mths_since_last_delinq'].isnull()), 1, 0) | |
df_inputs_prepr['mths_since_last_delinq:0-3'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 0) & (df_inputs_prepr['mths_since_last_delinq'] < 4), 1, 0) | |
df_inputs_prepr['mths_since_last_delinq:4-30'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 4) & (df_inputs_prepr['mths_since_last_delinq'] < 31), 1, 0) | |
df_inputs_prepr['mths_since_last_delinq:31-56'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 31) & (df_inputs_prepr['mths_since_last_delinq'] < 57), 1, 0) | |
df_inputs_prepr['mths_since_last_delinq:>=57'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 57), 1, 0) | |
df_inputs_prepr['dti'].unique() | |
df_inputs_prepr['dti_factor'] = pd.cut(df_inputs_prepr['dti'], 100) | |
df_temp = woe_ordered_continuous(df_inputs_prepr, 'dti_factor', df_targets_prepr) | |
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['dti'] <= 35, :] | |
df_inputs_prepr_temp['dti_factor'] = pd.cut(df_inputs_prepr_temp['dti'], 50) | |
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'dti_factor', df_targets_prepr[df_inputs_prepr_temp.index]) | |
plot_by_woe(df_temp, 90) | |
df_inputs_prepr['dti:<=1.4'] = np.where((df_inputs_prepr['dti'] <= 1.4), 1, 0) | |
df_inputs_prepr['dti:1.4-3.5'] = np.where((df_inputs_prepr['dti'] > 1.4) & (df_inputs_prepr['dti'] <= 3.5), 1, 0) | |
df_inputs_prepr['dti:3.5-7.7'] = np.where((df_inputs_prepr['dti'] > 3.5) & (df_inputs_prepr['dti'] <= 7.7), 1, 0) | |
df_inputs_prepr['dti:7.7-10.5'] = np.where((df_inputs_prepr['dti'] > 7.7) & (df_inputs_prepr['dti'] <= 10.5), 1, 0) | |
df_inputs_prepr['dti:10.5-16.1'] = np.where((df_inputs_prepr['dti'] > 10.5) & (df_inputs_prepr['dti'] <= 16.1), 1, 0) | |
df_inputs_prepr['dti:16.1-20.3'] = np.where((df_inputs_prepr['dti'] > 16.1) & (df_inputs_prepr['dti'] <= 20.3), 1, 0) | |
df_inputs_prepr['dti:20.3-21.7'] = np.where((df_inputs_prepr['dti'] > 20.3) & (df_inputs_prepr['dti'] <= 21.7), 1, 0) | |
df_inputs_prepr['dti:21.7-22.4'] = np.where((df_inputs_prepr['dti'] > 21.7) & (df_inputs_prepr['dti'] <= 22.4), 1, 0) | |
df_inputs_prepr['dti:22.4-35'] = np.where((df_inputs_prepr['dti'] > 22.4) & (df_inputs_prepr['dti'] <= 35), 1, 0) | |
df_inputs_prepr['dti:>35'] = np.where((df_inputs_prepr['dti'] > 35), 1, 0) | |
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_record'])] | |
df_inputs_prepr_temp['mths_since_last_record_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_record'], 50) | |
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_record_factor', df_targets_prepr[df_inputs_prepr_temp.index]) | |
plot_by_woe(df_temp, 90) | |
df_inputs_prepr['mths_since_last_record:Missing'] = np.where((df_inputs_prepr['mths_since_last_record'].isnull()), 1, 0) | |
df_inputs_prepr['mths_since_last_record:0-2'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 0) & (df_inputs_prepr['mths_since_last_record'] <= 2), 1, 0) | |
df_inputs_prepr['mths_since_last_record:3-20'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 3) & (df_inputs_prepr['mths_since_last_record'] <= 20), 1, 0) | |
df_inputs_prepr['mths_since_last_record:21-31'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 21) & (df_inputs_prepr['mths_since_last_record'] <= 31), 1, 0) | |
df_inputs_prepr['mths_since_last_record:32-80'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 32) & (df_inputs_prepr['mths_since_last_record'] <= 80), 1, 0) | |
df_inputs_prepr['mths_since_last_record:81-86'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 81) & (df_inputs_prepr['mths_since_last_record'] <= 86), 1, 0) | |
df_inputs_prepr['mths_since_last_record:>=86'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 86), 1, 0) | |
#Preprocessing the test dataset | |
#loan_data_inputs_train = df_inputs_prepr | |
loan_data_inputs_test = df_inputs_prepr | |
loan_data_inputs_train.to_csv('loan_data_inputs_train.csv') | |
loan_data_targets_train.to_csv('loan_data_targets_train.csv') | |
loan_data_inputs_test.to_csv('loan_data_inputs_test.csv') | |
loan_data_targets_test.to_csv('loan_data_targets_test.csv') | |
#Estimating the PD model using logistic regression | |
loan_data_inputs_train = pd.read_csv('loan_data_inputs_train.csv', index_col = 0) | |
loan_data_targets_train = pd.read_csv('loan_data_targets_train.csv', index_col = 0, header = None) | |
loan_data_inputs_test = pd.read_csv('loan_data_inputs_test.csv', index_col = 0) | |
loan_data_targets_test = pd.read_csv('loan_data_targets_test.csv', index_col = 0, header = None) | |
#Selecting the list of variables and reference categories | |
inputs_train_with_ref_cat = pd.DataFrame(loan_data_inputs_train,columns= ['grade:A', | |
'grade:B', | |
'grade:C', | |
'grade:D', | |
'grade:E', | |
'grade:F', | |
'grade:G', | |
'home_ownership:RENT_OTHER_NONE_ANY', | |
'home_ownership:OWN', | |
'home_ownership:MORTGAGE', | |
'addr_state:ND_NE_IA_NV_FL_HI_AL', | |
'addr_state:NM_VA', | |
'addr_state:NY', | |
'addr_state:OK_TN_MO_LA_MD_NC', | |
'addr_state:CA', | |
'addr_state:UT_KY_AZ_NJ', | |
'addr_state:AR_MI_PA_OH_MN', | |
'addr_state:RI_MA_DE_SD_IN', | |
'addr_state:GA_WA_OR', | |
'addr_state:WI_MT', | |
'addr_state:TX', | |
'addr_state:IL_CT', | |
'addr_state:KS_SC_CO_VT_AK_MS', | |
'addr_state:WV_NH_WY_DC_ME_ID', | |
'verification_status:Not Verified', | |
'verification_status:Source Verified', | |
'verification_status:Verified', | |
'purpose:educ__sm_b__wedd__ren_en__mov__house', | |
'purpose:credit_card', | |
'purpose:debt_consolidation', | |
'purpose:oth__med__vacation', | |
'purpose:major_purch__car__home_impr', | |
'initial_list_status:f', | |
'initial_list_status:w', | |
'term:36', | |
'term:60', | |
'emp_length:0', | |
'emp_length:1', | |
'emp_length:2-4', | |
'emp_length:5-6', | |
'emp_length:7-9', | |
'emp_length:10', | |
'months_since_issue_d:<38', | |
'months_since_issue_d:38-39', | |
'months_since_issue_d:40-41', | |
'months_since_issue_d:42-48', | |
'months_since_issue_d:49-52', | |
'months_since_issue_d:53-64', | |
'months_since_issue_d:65-84', | |
'months_since_issue_d:>84', | |
'int_rate:<9.548', | |
'int_rate:9.548-12.025', | |
'int_rate:12.025-15.74', | |
'int_rate:15.74-20.281', | |
'int_rate:>20.281', | |
'months_since_earliest_cr_line:<140', | |
'months_since_earliest_cr_line:141-164', | |
'months_since_earliest_cr_line:165-247', | |
'months_since_earliest_cr_line:248-270', | |
'months_since_earliest_cr_line:271-352', | |
'months_since_earliest_cr_line:>352', | |
'delinq_2yrs:0', | |
'delinq_2yrs:1-3', | |
'delinq_2yrs:>=4', | |
'inq_last_6mths:0', | |
'inq_last_6mths:1-2', | |
'inq_last_6mths:3-6', | |
'inq_last_6mths:>6', | |
'open_acc:0', | |
'open_acc:1-3', | |
'open_acc:4-12', | |
'open_acc:13-17', | |
'open_acc:18-22', | |
'open_acc:23-25', | |
'open_acc:26-30', | |
'open_acc:>=31', | |
'pub_rec:0-2', | |
'pub_rec:3-4', | |
'pub_rec:>=5', | |
'total_acc:<=27', | |
'total_acc:28-51', | |
'total_acc:>=52', | |
'acc_now_delinq:0', | |
'acc_now_delinq:>=1', | |
'annual_inc:<20K', | |
'annual_inc:20K-30K', | |
'annual_inc:30K-40K', | |
'annual_inc:40K-50K', | |
'annual_inc:50K-60K', | |
'annual_inc:60K-70K', | |
'annual_inc:70K-80K', | |
'annual_inc:80K-90K', | |
'annual_inc:90K-100K', | |
'annual_inc:100K-120K', | |
'annual_inc:120K-140K', | |
'annual_inc:>140K', | |
'dti:<=1.4', | |
'dti:1.4-3.5', | |
'dti:3.5-7.7', | |
'dti:7.7-10.5', | |
'dti:10.5-16.1', | |
'dti:16.1-20.3', | |
'dti:20.3-21.7', | |
'dti:21.7-22.4', | |
'dti:22.4-35', | |
'dti:>35', | |
'mths_since_last_delinq:Missing', | |
'mths_since_last_delinq:0-3', | |
'mths_since_last_delinq:4-30', | |
'mths_since_last_delinq:31-56', | |
'mths_since_last_delinq:>=57', | |
'mths_since_last_record:Missing', | |
'mths_since_last_record:0-2', | |
'mths_since_last_record:3-20', | |
'mths_since_last_record:21-31', | |
'mths_since_last_record:32-80', | |
'mths_since_last_record:81-86', | |
'mths_since_last_record:>=86']) | |
ref_categories = ['grade:G', | |
'home_ownership:RENT_OTHER_NONE_ANY', | |
'addr_state:ND_NE_IA_NV_FL_HI_AL', | |
'verification_status:Verified', | |
'purpose:educ__sm_b__wedd__ren_en__mov__house', | |
'initial_list_status:f', | |
'term:60', | |
'emp_length:0', | |
'months_since_issue_d:>84', | |
'int_rate:>20.281', | |
'months_since_earliest_cr_line:<140', | |
'delinq_2yrs:>=4', | |
'inq_last_6mths:>6', | |
'open_acc:0', | |
'pub_rec:0-2', | |
'total_acc:<=27', | |
'acc_now_delinq:0', | |
'annual_inc:<20K', | |
'dti:>35', | |
'mths_since_last_delinq:0-3', | |
'mths_since_last_record:0-2'] | |
inputs_train_with_ref_cat.to_csv('inputs_train_with_ref_cat.csv') | |
inputs_train = inputs_train_with_ref_cat.drop(ref_categories, axis = 1) | |
inputs_train.to_csv('inputs_train.csv') | |
# Fitting the model | |
from sklearn.linear_model import LogisticRegression | |
from sklearn import metrics | |
reg = LogisticRegression() | |
reg.fit(inputs_train, loan_data_targets_train) | |
reg.intercept_ | |
reg.coef_ | |
feature_name = inputs_train.columns.values | |
summary_table = pd.DataFrame(columns = ['Feature name'], data = feature_name) | |
summary_table['Coefficients'] = np.transpose(reg.coef_) | |
summary_table.index = summary_table.index + 1 | |
summary_table.loc[0] = ['Intercept', reg.intercept_[0]] | |
summary_table = summary_table.sort_index() | |
summary_table | |
from sklearn import linear_model | |
import scipy.stats as stat | |
class LogisticRegression_with_p_values: | |
def __init__(self, *args, **kwargs): | |
self.model = linear_model.LogisticRegression(*args, **kwargs) | |
def fit(self, X, y): | |
self.model.fit(X, y) | |
denom = (2.0 *(1.0 + np.cosh(self.model.decision_function(X)))) | |
denom = np.tile(denom, (X.shape[1],1)).T | |
F_ij = np.dot((X / denom).T, X) | |
Cramer_Rao = np.linalg.inv(F_ij) | |
sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao)) | |
z_scores = self.model.coef_[0] / sigma_estimates | |
p_values = [stat.norm.sf(abs(x)) * 2 for x in z_scores] | |
self.coef_ = self.model.coef_ | |
self.intercept_ = self.model.intercept_ | |
self.p_values = p_values | |
reg = LogisticRegression_with_p_values() | |
reg.fit(inputs_train, loan_data_targets_train) | |
feature_name = inputs_train.columns.values | |
summary_table = pd.DataFrame(columns = ['Feature name'], data = feature_name) | |
summary_table['Coefficients'] = np.transpose(reg.coef_) | |
summary_table.index = summary_table.index + 1 | |
summary_table.loc[0] = ['Intercept', reg.intercept_[0]] | |
summary_table = summary_table.sort_index() | |
p_values = reg.p_values | |
p_values = np.append(np.nan, np.array(p_values)) | |
summary_table['p_values'] = p_values | |
summary_table | |
# After checking the p-values, we drop variables - delinq_2yrs, open_acc, pub_rec, total_acc | |
inputs_train_with_ref_cat = pd.DataFrame(loan_data_inputs_train,columns= ['grade:A', | |
'grade:B', | |
'grade:C', | |
'grade:D', | |
'grade:E', | |
'grade:F', | |
'grade:G', | |
'home_ownership:RENT_OTHER_NONE_ANY', | |
'home_ownership:OWN', | |
'home_ownership:MORTGAGE', | |
'addr_state:ND_NE_IA_NV_FL_HI_AL', | |
'addr_state:NM_VA', | |
'addr_state:NY', | |
'addr_state:OK_TN_MO_LA_MD_NC', | |
'addr_state:CA', | |
'addr_state:UT_KY_AZ_NJ', | |
'addr_state:AR_MI_PA_OH_MN', | |
'addr_state:RI_MA_DE_SD_IN', | |
'addr_state:GA_WA_OR', | |
'addr_state:WI_MT', | |
'addr_state:TX', | |
'addr_state:IL_CT', | |
'addr_state:KS_SC_CO_VT_AK_MS', | |
'addr_state:WV_NH_WY_DC_ME_ID', | |
'verification_status:Not Verified', | |
'verification_status:Source Verified', | |
'verification_status:Verified', | |
'purpose:educ__sm_b__wedd__ren_en__mov__house', | |
'purpose:credit_card', | |
'purpose:debt_consolidation', | |
'purpose:oth__med__vacation', | |
'purpose:major_purch__car__home_impr', | |
'initial_list_status:f', | |
'initial_list_status:w', | |
'term:36', | |
'term:60', | |
'emp_length:0', | |
'emp_length:1', | |
'emp_length:2-4', | |
'emp_length:5-6', | |
'emp_length:7-9', | |
'emp_length:10', | |
'months_since_issue_d:<38', | |
'months_since_issue_d:38-39', | |
'months_since_issue_d:40-41', | |
'months_since_issue_d:42-48', | |
'months_since_issue_d:49-52', | |
'months_since_issue_d:53-64', | |
'months_since_issue_d:65-84', | |
'months_since_issue_d:>84', | |
'int_rate:<9.548', | |
'int_rate:9.548-12.025', | |
'int_rate:12.025-15.74', | |
'int_rate:15.74-20.281', | |
'int_rate:>20.281', | |
'months_since_earliest_cr_line:<140', | |
'months_since_earliest_cr_line:141-164', | |
'months_since_earliest_cr_line:165-247', | |
'months_since_earliest_cr_line:248-270', | |
'months_since_earliest_cr_line:271-352', | |
'months_since_earliest_cr_line:>352', | |
'inq_last_6mths:0', | |
'inq_last_6mths:1-2', | |
'inq_last_6mths:3-6', | |
'inq_last_6mths:>6', | |
'acc_now_delinq:0', | |
'acc_now_delinq:>=1', | |
'annual_inc:<20K', | |
'annual_inc:20K-30K', | |
'annual_inc:30K-40K', | |
'annual_inc:40K-50K', | |
'annual_inc:50K-60K', | |
'annual_inc:60K-70K', | |
'annual_inc:70K-80K', | |
'annual_inc:80K-90K', | |
'annual_inc:90K-100K', | |
'annual_inc:100K-120K', | |
'annual_inc:120K-140K', | |
'annual_inc:>140K', | |
'dti:<=1.4', | |
'dti:1.4-3.5', | |
'dti:3.5-7.7', | |
'dti:7.7-10.5', | |
'dti:10.5-16.1', | |
'dti:16.1-20.3', | |
'dti:20.3-21.7', | |
'dti:21.7-22.4', | |
'dti:22.4-35', | |
'dti:>35', | |
'mths_since_last_delinq:Missing', | |
'mths_since_last_delinq:0-3', | |
'mths_since_last_delinq:4-30', | |
'mths_since_last_delinq:31-56', | |
'mths_since_last_delinq:>=57', | |
'mths_since_last_record:Missing', | |
'mths_since_last_record:0-2', | |
'mths_since_last_record:3-20', | |
'mths_since_last_record:21-31', | |
'mths_since_last_record:32-80', | |
'mths_since_last_record:81-86', | |
'mths_since_last_record:>=86']) | |
ref_categories = ['grade:G', | |
'home_ownership:RENT_OTHER_NONE_ANY', | |
'addr_state:ND_NE_IA_NV_FL_HI_AL', | |
'verification_status:Verified', | |
'purpose:educ__sm_b__wedd__ren_en__mov__house', | |
'initial_list_status:f', | |
'term:60', | |
'emp_length:0', | |
'months_since_issue_d:>84', | |
'int_rate:>20.281', | |
'months_since_earliest_cr_line:<140', | |
'inq_last_6mths:>6', | |
'acc_now_delinq:0', | |
'annual_inc:<20K', | |
'dti:>35', | |
'mths_since_last_delinq:0-3', | |
'mths_since_last_record:0-2'] | |
inputs_train = inputs_train_with_ref_cat.drop(ref_categories, axis = 1) | |
# Fitting the model again with new list of variables | |
reg2 = LogisticRegression_with_p_values() | |
reg2.fit(inputs_train, loan_data_targets_train) | |
feature_name = inputs_train.columns.values | |
summary_table = pd.DataFrame(columns = ['Feature name'], data = feature_name) | |
summary_table['Coefficients'] = np.transpose(reg2.coef_) | |
summary_table.index = summary_table.index + 1 | |
summary_table.loc[0] = ['Intercept', reg2.intercept_[0]] | |
summary_table = summary_table.sort_index() | |
p_values = reg2.p_values | |
p_values = np.append(np.nan, np.array(p_values)) | |
summary_table['p_values'] = p_values | |
summary_table | |
# Part II: Out-of-sample PD model validation | |
inputs_test_with_ref_cat = pd.DataFrame(loan_data_inputs_test, columns= ['grade:A', | |
'grade:B', | |
'grade:C', | |
'grade:D', | |
'grade:E', | |
'grade:F', | |
'grade:G', | |
'home_ownership:RENT_OTHER_NONE_ANY', | |
'home_ownership:OWN', | |
'home_ownership:MORTGAGE', | |
'addr_state:ND_NE_IA_NV_FL_HI_AL', | |
'addr_state:NM_VA', | |
'addr_state:NY', | |
'addr_state:OK_TN_MO_LA_MD_NC', | |
'addr_state:CA', | |
'addr_state:UT_KY_AZ_NJ', | |
'addr_state:AR_MI_PA_OH_MN', | |
'addr_state:RI_MA_DE_SD_IN', | |
'addr_state:GA_WA_OR', | |
'addr_state:WI_MT', | |
'addr_state:TX', | |
'addr_state:IL_CT', | |
'addr_state:KS_SC_CO_VT_AK_MS', | |
'addr_state:WV_NH_WY_DC_ME_ID', | |
'verification_status:Not Verified', | |
'verification_status:Source Verified', | |
'verification_status:Verified', | |
'purpose:educ__sm_b__wedd__ren_en__mov__house', | |
'purpose:credit_card', | |
'purpose:debt_consolidation', | |
'purpose:oth__med__vacation', | |
'purpose:major_purch__car__home_impr', | |
'initial_list_status:f', | |
'initial_list_status:w', | |
'term:36', | |
'term:60', | |
'emp_length:0', | |
'emp_length:1', | |
'emp_length:2-4', | |
'emp_length:5-6', | |
'emp_length:7-9', | |
'emp_length:10', | |
'months_since_issue_d:<38', | |
'months_since_issue_d:38-39', | |
'months_since_issue_d:40-41', | |
'months_since_issue_d:42-48', | |
'months_since_issue_d:49-52', | |
'months_since_issue_d:53-64', | |
'months_since_issue_d:65-84', | |
'months_since_issue_d:>84', | |
'int_rate:<9.548', | |
'int_rate:9.548-12.025', | |
'int_rate:12.025-15.74', | |
'int_rate:15.74-20.281', | |
'int_rate:>20.281', | |
'months_since_earliest_cr_line:<140', | |
'months_since_earliest_cr_line:141-164', | |
'months_since_earliest_cr_line:165-247', | |
'months_since_earliest_cr_line:248-270', | |
'months_since_earliest_cr_line:271-352', | |
'months_since_earliest_cr_line:>352', | |
'inq_last_6mths:0', | |
'inq_last_6mths:1-2', | |
'inq_last_6mths:3-6', | |
'inq_last_6mths:>6', | |
'acc_now_delinq:0', | |
'acc_now_delinq:>=1', | |
'annual_inc:<20K', | |
'annual_inc:20K-30K', | |
'annual_inc:30K-40K', | |
'annual_inc:40K-50K', | |
'annual_inc:50K-60K', | |
'annual_inc:60K-70K', | |
'annual_inc:70K-80K', | |
'annual_inc:80K-90K', | |
'annual_inc:90K-100K', | |
'annual_inc:100K-120K', | |
'annual_inc:120K-140K', | |
'annual_inc:>140K', | |
'dti:<=1.4', | |
'dti:1.4-3.5', | |
'dti:3.5-7.7', | |
'dti:7.7-10.5', | |
'dti:10.5-16.1', | |
'dti:16.1-20.3', | |
'dti:20.3-21.7', | |
'dti:21.7-22.4', | |
'dti:22.4-35', | |
'dti:>35', | |
'mths_since_last_delinq:Missing', | |
'mths_since_last_delinq:0-3', | |
'mths_since_last_delinq:4-30', | |
'mths_since_last_delinq:31-56', | |
'mths_since_last_delinq:>=57', | |
'mths_since_last_record:Missing', | |
'mths_since_last_record:0-2', | |
'mths_since_last_record:3-20', | |
'mths_since_last_record:21-31', | |
'mths_since_last_record:32-80', | |
'mths_since_last_record:81-86', | |
'mths_since_last_record:>=86']) | |
inputs_test = inputs_test_with_ref_cat.drop(ref_categories, axis = 1) | |
y_hat_test = reg2.model.predict(inputs_test) | |
y_hat_test_proba = reg2.model.predict_proba(inputs_test) | |
y_hat_test_proba = y_hat_test_proba[:][:, 1] # contains only the probability of no default, i.e. for good borrowers | |
loan_data_targets_test_temp = loan_data_targets_test | |
loan_data_targets_test_temp.reset_index(drop = True, inplace = True) | |
df_actual_predicted_probs = pd.concat([loan_data_targets_test_temp, pd.DataFrame(y_hat_test_proba)], axis = 1) | |
df_actual_predicted_probs.columns = ['loan_data_targets_test', 'y_hat_test_proba'] | |
df_actual_predicted_probs.index = loan_data_inputs_test.index | |
df_actual_predicted_probs.head() | |
#Evaluation of model performance | |
tr = 0.5 # a 0.5 threshold will give a high accuracy, however a lot of false positives. A very conservative threshold will reduce false positives, but wil reduce accuracy as well | |
df_actual_predicted_probs['y_hat_test'] = np.where(df_actual_predicted_probs['y_hat_test_proba'] > tr, 1, 0) | |
pd.crosstab(df_actual_predicted_probs['loan_data_targets_test'], df_actual_predicted_probs['y_hat_test'], | |
rownames = ['Actual'], colnames = ['Predicted']) | |
pd.crosstab(df_actual_predicted_probs['loan_data_targets_test'], df_actual_predicted_probs['y_hat_test'], | |
rownames = ['Actual'], colnames = ['Predicted'])/df_actual_predicted_probs.shape[0] | |
overall_accuracy = (pd.crosstab(df_actual_predicted_probs['loan_data_targets_test'], df_actual_predicted_probs['y_hat_test'], | |
rownames = ['Actual'], colnames = ['Predicted'])/df_actual_predicted_probs.shape[0]).iloc[0, 0] + (pd.crosstab(df_actual_predicted_probs['loan_data_targets_test'], df_actual_predicted_probs['y_hat_test'], | |
rownames = ['Actual'], colnames = ['Predicted'])/df_actual_predicted_probs.shape[0]).iloc[1, 1] | |
# ROC curve | |
from sklearn.metrics import roc_curve, roc_auc_score | |
roc_curve(df_actual_predicted_probs['loan_data_targets_test'], df_actual_predicted_probs['y_hat_test_proba']) | |
fpr, tpr, thresholds = roc_curve(df_actual_predicted_probs['loan_data_targets_test'], df_actual_predicted_probs['y_hat_test_proba']) | |
plt.plot(fpr, tpr) | |
plt.plot(fpr, fpr, linestyle = '--', color = 'k') | |
plt.xlabel('False Positive Rate') | |
plt.ylabel('True Positive Rate') | |
plt.title('ROC Curve') | |
AUROC = roc_auc_score(df_actual_predicted_probs['loan_data_targets_test'], df_actual_predicted_probs['y_hat_test_proba']) | |
# Gini and Kolmogorov-Smirnov coefficients | |
df_actual_predicted_probs = df_actual_predicted_probs.sort_values('y_hat_test_proba') | |
df_actual_predicted_probs | |
df_actual_predicted_probs = df_actual_predicted_probs.reset_index() | |
df_actual_predicted_probs['Cumulative N Population'] = df_actual_predicted_probs.index + 1 | |
df_actual_predicted_probs['Cumulative N Good'] = df_actual_predicted_probs['loan_data_targets_test'].cumsum() | |
df_actual_predicted_probs['Cumulative N Bad'] = df_actual_predicted_probs['Cumulative N Population'] - df_actual_predicted_probs['loan_data_targets_test'].cumsum() | |
df_actual_predicted_probs['Cumulative Perc Population'] = df_actual_predicted_probs['Cumulative N Population'] / (df_actual_predicted_probs.shape[0]) | |
df_actual_predicted_probs['Cumulative Perc Good'] = df_actual_predicted_probs['Cumulative N Good'] / (df_actual_predicted_probs['loan_data_targets_test'].sum()) | |
df_actual_predicted_probs['Cumulative Perc Bad'] = df_actual_predicted_probs['Cumulative N Bad'] / (df_actual_predicted_probs.shape[0] - df_actual_predicted_probs['loan_data_targets_test'].sum()) | |
plt.plot(df_actual_predicted_probs['Cumulative Perc Population'], df_actual_predicted_probs['Cumulative Perc Bad']) | |
plt.plot(df_actual_predicted_probs['Cumulative Perc Population'], df_actual_predicted_probs['Cumulative Perc Population'], linestyle ='--', color ='k') | |
plt.xlabel('Cumulative % Population') | |
plt.ylabel('Cumulative % Bad') | |
plt.title('Gini') | |
Gini = AUROC *2 - 1 | |
plt.plot(df_actual_predicted_probs['y_hat_test_proba'], df_actual_predicted_probs['Cumulative Perc Bad'], color = 'r') | |
plt.plot(df_actual_predicted_probs['y_hat_test_proba'], df_actual_predicted_probs['Cumulative Perc Good'], color = 'b') | |
plt.xlabel('Estimated Probability for being Good') | |
plt.ylabel('Cumulative %') | |
plt.title('Kolmogorov-Smirnov') | |
KS = max( df_actual_predicted_probs['Cumulative Perc Bad'] - df_actual_predicted_probs['Cumulative Perc Good']) | |
# Creating a scorecard from the PD model | |
df_ref_categories = pd.DataFrame(ref_categories, columns = ['Feature name']) | |
df_ref_categories['Categories'] = 0 | |
df_ref_categories['p_values'] = np.nan | |
df_scorecard = pd.concat([summary_table, df_ref_categories]) | |
df_scorecard = df_scorecard.reset_index() | |
df_scorecard['Original feature name'] = df_scorecard['Feature name'].str.split(':').str[0] | |
min_score = 300 | |
max_score = 850 | |
min_sum_coef = df_scorecard.groupby('Original feature name')['Coefficients'].min().sum() | |
max_sum_coef = df_scorecard.groupby('Original feature name')['Coefficients'].max().sum() | |
df_scorecard['Score-Calculation'] = df_scorecard['Coefficients'] * (max_score - min_score)/(max_sum_coef - min_sum_coef) | |
df_scorecard['Score-Calculation'][0] = ((df_scorecard['Coefficients'][0] - min_sum_coef) / (max_sum_coef - min_sum_coef)) * (max_score - min_score) + min_score | |
df_scorecard['Score-Preliminary'] = df_scorecard['Score-Calculation'].round() | |
min_sum_score_prel = df_scorecard.groupby('Original feature name')['Score-Preliminary'].min().sum() | |
max_sum_score_prel = df_scorecard.groupby('Original feature name')['Score-Preliminary'].max().sum() | |
df_scorecard['Difference'] = df_scorecard['Score-Preliminary'] - df_scorecard['Score-Calculation'] | |
df_scorecard['Score-Final'] = df_scorecard['Score-Preliminary'] | |
df_scorecard['Score-Final'][1] = 113 # the index with the largest rounding was set off by -1 to make sure max score is 850 | |
max_sum_score_prel = df_scorecard.groupby('Original feature name')['Score-Final'].max().sum() | |
#Calculating the credit score | |
inputs_test_with_ref_cat_w_intercept = inputs_test_with_ref_cat | |
inputs_test_with_ref_cat_w_intercept.insert(0, 'Intercept', 1) | |
inputs_test_with_ref_cat_w_intercept = inputs_test_with_ref_cat_w_intercept[df_scorecard['Feature name'].values] | |
scorecard_scores = df_scorecard['Score-Final'] | |
inputs_test_with_ref_cat_w_intercept.shape | |
scorecard_scores.shape | |
scorecard_scores = scorecard_scores.values.reshape(102,1) | |
y_scores = inputs_test_with_ref_cat_w_intercept.dot(scorecard_scores) | |
# From Credit score to Probability of Default | |
sum_coef_from_score = ((y_scores - min_score)/(max_score - min_score)) * (max_sum_coef - min_sum_coef) + min_sum_coef | |
y_hat_proba_from_score = np.exp(sum_coef_from_score) / (np.exp(sum_coef_from_score) + 1) | |
#Setting cut-offs calculating approval/rejection rate | |
df_cutoffs = pd.concat([pd.DataFrame(thresholds), pd.DataFrame(fpr), pd.DataFrame(tpr)], axis = 1) | |
df_cutoffs.columns = ['thresholds', 'fpr', 'tpr'] | |
df_cutoffs['thresholds'][0] = 1 - 1/np.power(10, 16) | |
df_cutoffs['Score'] = ((np.log(df_cutoffs['thresholds']/(1 - df_cutoffs['thresholds'])) - min_sum_coef) * ((max_score - min_score) / (max_sum_coef - min_sum_coef)) + min_score).round() | |
df_cutoffs['Score'][0] = max_score | |
def n_approved(p): | |
return np.where(df_actual_predicted_probs['y_hat_test_proba'] >= p, 1, 0).sum() | |
df_cutoffs['N Approved'] = df_cutoffs['thresholds'].apply(n_approved) | |
df_cutoffs['N Rejected'] = df_actual_predicted_probs['y_hat_test_proba'].shape[0] - df_cutoffs['N Approved'] | |
df_cutoffs['Approval Rate'] = df_cutoffs['N Approved'] / df_actual_predicted_probs['y_hat_test_proba'].shape[0] | |
df_cutoffs['Rejection Rate'] = 1 - df_cutoffs['Approval Rate'] | |
inputs_train_with_ref_cat.to_csv('inputs_train_with_ref_cat.csv') | |
df_scorecard.to_csv('df_scorecard.csv') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
loan_data['emp_length'].unique()
hi i am trying to do this model in my school python project still when i try the text above it show error with emp_length could you help me with that