Lending Club loan Default
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
%matplotlib inline | |
import seaborn as sns | |
import datetime | |
from sklearn.preprocessing import LabelEncoder | |
from sklearn.metrics import confusion_matrix | |
from sklearn.metrics import classification_report | |
from sklearn.metrics import accuracy_score | |
from sklearn.model_selection import train_test_split | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.naive_bayes import GaussianNB | |
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report | |
import pickle | |
pd.set_option('display.max_columns', 300) | |
pd.set_option('display.max_rows', 500) | |
# read approved loan data from the corresponding compressed CSV
accepted = pd.read_csv('accepted_2007_to_2018Q4.csv.gz', compression='gzip', low_memory=False)
a_lc = accepted.copy() | |
# read rejected loan data from the corresponding compressed CSV
r_lc = pd.read_csv("rejected_2007_to_2018Q4.csv.gz", low_memory=False)
#check the approved loan data | |
a_lc.head() | |
# Check the size | |
a_lc.shape | |
# Remove columns that are not needed for this analysis:
a_lc.drop(['all_util', 'bc_util','acc_open_past_24mths','url','revol_bal_joint','sec_app_fico_range_low','sec_app_fico_range_high', | |
'sec_app_earliest_cr_line','sec_app_inq_last_6mths','sec_app_mort_acc','sec_app_open_acc','sec_app_revol_util', | |
'sec_app_open_act_il','sec_app_num_rev_accts','sec_app_chargeoff_within_12_mths','sec_app_collections_12_mths_ex_med', | |
'sec_app_mths_since_last_major_derog','hardship_type','hardship_reason','hardship_status','deferral_term',
'hardship_amount','hardship_start_date','hardship_end_date','payment_plan_start_date','hardship_length', | |
'hardship_dpd','hardship_loan_status','orig_projected_additional_accrued_interest','hardship_payoff_balance_amount', | |
'hardship_last_payment_amount','debt_settlement_flag_date','settlement_status','settlement_date','settlement_amount', | |
'settlement_percentage','settlement_term','funded_amnt','funded_amnt_inv','pymnt_plan','hardship_flag',
'total_pymnt','total_pymnt_inv','total_rec_int','total_rec_late_fee','total_rec_prncp','out_prncp', | |
'out_prncp_inv','recoveries','collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'last_credit_pull_d', | |
'last_fico_range_high', 'last_fico_range_low', 'collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq', | |
'total_rev_hi_lim', 'bc_open_to_buy', 'chargeoff_within_12_mths', 'delinq_amnt',
'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op','mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mths_since_recent_bc', 'mths_since_recent_inq', | |
'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', | |
'num_op_rev_tl','num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', | |
'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'pct_tl_nvr_dlq','tax_liens', 'tot_hi_cred_lim', | |
'total_bc_limit', 'total_il_high_credit_limit','disbursement_method', | |
'debt_settlement_flag','verification_status_joint','dti_joint','annual_inc_joint', | |
'mths_since_recent_bc_dlq','mths_since_recent_revol_delinq', | |
'next_pymnt_d','mths_since_last_delinq','il_util','mths_since_rcnt_il', | |
'open_acc_6m','total_cu_tl','inq_last_12m','open_il_24m','open_act_il','total_bal_il','open_rv_12m', | |
'open_rv_24m','open_il_12m','emp_title','max_bal_bc','desc','member_id', | |
'num_rev_accts', 'initial_list_status', 'application_type', 'percent_bc_gt_75'], axis=1,inplace=True) | |
# Check missing values count and percent | |
missing= a_lc.isnull().sum().sort_values(ascending=False) | |
percent= (a_lc.isnull().sum()/a_lc.isnull().count()).sort_values(ascending=False)*100 | |
missing_data= pd.concat([missing, percent],axis=1, keys=["Total", "Percent"]) | |
missing_data.head(100) | |
# Removing the 33 rows where home ownership data was missing | |
a_lc = a_lc[~a_lc.home_ownership.isna()]
#drop significantly missing values | |
a_lc.drop(['mths_since_last_record','mths_since_last_major_derog'], axis=1, inplace=True) | |
# Check the records where the 2-year delinquency count is missing
a_lc.loc[a_lc.delinq_2yrs.isnull() == True]
# Check how many records have the total account count missing
a_lc.loc[a_lc.total_acc.isnull() == True]
# Missing value imputation | |
# Filling the mode in place of the missing values - categorical features (plus a few numeric columns treated the same way)
mode_cols = ['inq_fi','emp_length','mort_acc','title','pub_rec_bankruptcies','zip_code','int_rate','open_acc','verification_status',
             'term','grade','sub_grade','home_ownership','loan_status','purpose','addr_state','issue_d','earliest_cr_line']
for i in mode_cols:
    a_lc[i] = a_lc[i].fillna(a_lc[i].mode()[0])
# Missing value imputation | |
# Filling the median in place of the missing values - numerical features
num = ['avg_cur_bal','tot_cur_bal','total_bal_ex_mort','dti','revol_util','total_acc','annual_inc','inq_last_6mths','delinq_2yrs',
       'open_acc','pub_rec','tot_coll_amt','loan_amnt','fico_range_low','fico_range_high']
for i in num:
    a_lc[i] = a_lc[i].fillna(a_lc[i].median())
# Re-Check missing values count and percent - No more missing data | |
missing= a_lc.isnull().sum().sort_values(ascending=False) | |
percent= (a_lc.isnull().sum()/a_lc.isnull().count()).sort_values(ascending=False)*100 | |
missing_data= pd.concat([missing, percent],axis=1, keys=["Total", "Percent"]) | |
missing_data | |
# Loan counts group by Loan Status | |
ls = a_lc.groupby('loan_status').count()['loan_amnt'] | |
ls | |
# Loan Status | |
plt.figure(figsize = (8,4)) | |
g = sns.countplot(x="loan_status",data=a_lc, | |
palette="rocket") | |
g.set_xticklabels(g.get_xticklabels(),rotation=75) | |
g.set_title("Loan Status", fontsize=20) | |
g.set_xlabel("Loan Status", fontsize=15) | |
g.set_ylabel("Loan Amount", fontsize=20) | |
# check the current loans | |
current = a_lc.loc[a_lc.loan_status == 'Current'] | |
# Excluding the current loans from dataset | |
a_lc = a_lc[a_lc.loan_status != 'Current'] | |
a_lc.shape | |
# Creating loan_category as an intermediate feature for visualization purposes
a_lc["loan_category"] = np.where((a_lc.loan_status == 'Fully Paid') | | |
(a_lc.loan_status == 'Does not meet the credit policy. Status:Fully Paid'), 'Fully Paid', 'Charged Off') | |
# Excluding Grace period and late payment statuses as those are ambiguous to predict | |
a_lc = a_lc.loc[a_lc['loan_status'].isin(['Fully Paid','Charged Off', | |
'Does not meet the credit policy. Status:Fully Paid', | |
'Does not meet the credit policy. Status:Charged Off', | |
'Default'])] | |
# Sorting the data set by Issue Date | |
a_lc = a_lc.sort_values(by='issue_d') | |
a_lc | |
# Cleaning the data to keep the numeric values | |
a_lc['home_ownership'] = a_lc['home_ownership'].replace(['NONE', 'ANY'], 'OTHER') | |
a_lc['term'] = a_lc['term'].replace({'months':''}, regex = True) | |
a_lc['emp_length'] = a_lc['emp_length'].replace('< 1 year', '0 years') | |
a_lc['emp_length'] = a_lc['emp_length'].replace('10+ years', '10 years') | |
a_lc['emp_length'] = a_lc['emp_length'].replace({'year':'', 's':'', r'\+':''}, regex = True)
# Checking loan category counts | |
a_lc['loan_category'].value_counts() | |
# Create array for using the function for different plots | |
numeric_var = ['loan_amnt', 'int_rate', 'installment', 'delinq_2yrs','annual_inc','dti', | |
'fico_range_low', 'fico_range_high', 'open_acc', 'total_acc', 'pub_rec', 'revol_bal', 'revol_util', | |
'tot_coll_amt', 'tot_cur_bal', 'inq_last_6mths', 'inq_fi', 'avg_cur_bal', 'mort_acc', | |
'pub_rec_bankruptcies', 'total_bal_ex_mort'] | |
ordinal_var = ['term', 'grade', 'sub_grade', 'emp_length', 'verification_status'] | |
nominal_var = ['home_ownership','purpose', 'title', 'zip_code', 'addr_state'] | |
other = ['id', 'issue_d','loan_category', 'loan_status', 'earliest_cr_line'] | |
#function for numeric features plot | |
def numeric_plot(v): | |
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 10)) | |
sns.distplot(a_lc.loc[a_lc[v].notnull(), v], kde=False, ax=ax1) | |
ax1.set_title(f'Loan Status vs {v}') | |
ax1.set_xlabel(v) | |
ax1.set_ylabel('Count') | |
sns.boxplot(x='loan_category', y=v, data=a_lc, ax=ax2) | |
ax2.set_ylabel('') | |
ax2.set_title(v + ' by Loan Status') | |
ax2.set_xlabel(v) | |
plt.tight_layout() | |
#function for categorical features plot | |
def categorical_plot(v): | |
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4)) | |
sns.countplot(a_lc[v], palette = 'rocket', ax=ax1) | |
ax1.set_title(f'Loan Status vs {v}') | |
ax1.set_xlabel(v) | |
ax1.set_ylabel('Count') | |
good_rates = a_lc.groupby(v)['loan_category'].value_counts(normalize=True).loc[:,'Fully Paid'] | |
sns.barplot(x=good_rates.index, y=good_rates.values, palette = 'rocket', ax=ax2) | |
ax2.set_ylabel('Fraction of Good Loans') | |
ax2.set_title('Fully Paid Rate by ' + v) | |
ax2.set_xlabel(v) | |
ax2.set_ylim(0,1) | |
plt.tight_layout() | |
# order=sorted(a_lc[v].unique()) | |
# Show the plots | |
for v in numeric_var: | |
numeric_plot(v) | |
#Show the categorical plots | |
for v in ordinal_var: | |
categorical_plot(v) | |
#Categorical plot for home ownership,purpose, title, zip code and address state | |
# nominal_var = ['home_ownership','purpose', 'title', 'zip_code', 'addr_state'] | |
categorical_plot('home_ownership') | |
categorical_plot('purpose') | |
# categorical_plot('title') | |
categorical_plot('addr_state') | |
plt.figure(figsize=(20, 20)) | |
plt.subplot(3, 1, 1) | |
purpose = sns.countplot(x='purpose', data=a_lc, hue='loan_category') | |
purpose.set_xticklabels(purpose.get_xticklabels(), rotation=45); | |
plt.subplot(3, 1, 2) | |
sns.countplot(x='home_ownership', data=a_lc, hue='loan_category') | |
plt.subplot(3, 1, 3) | |
state = sns.countplot(x='addr_state', data=a_lc, hue='loan_category') | |
state.set_xticklabels(state.get_xticklabels(), rotation=45); | |
# sns.countplot(x='title', data=a_lc, hue='loan_category') | |
# Date time extraction | |
import datetime as dt | |
def make_dateval(s): | |
return dt.datetime.strptime(s[-4:]+s[:3]+'01', '%Y%b%d') | |
a_lc.issue_d = a_lc.issue_d.apply(make_dateval) | |
a_lc.earliest_cr_line = a_lc.earliest_cr_line.apply(make_dateval) | |
a_lc['year'] = a_lc.issue_d.apply(lambda x: x.year) | |
a_lc['cred_year'] = a_lc.earliest_cr_line.apply(lambda x: x.year) | |
a_lc['cred_length'] = a_lc['year'] - a_lc['cred_year'] | |
# Calculate yearly count | |
yearly_count = a_lc.groupby('year')['loan_category'].count().reset_index() | |
yearly_count = yearly_count.rename(columns={'loan_category': 'counts'}) | |
yearly_count['ratio'] = yearly_count['counts'] / len(a_lc) | |
yearly_count | |
# Plot for number of loans per year | |
fig, (ax1,ax2) = plt.subplots(1, 2, figsize=(12, 6)) | |
sns.countplot(x='year', data=a_lc, ax=ax1) | |
ax1.set_xlabel('Year', fontsize=14) | |
ax1.set_ylabel('Count', fontsize=14) | |
sns.barplot(x='year', y = 'ratio', data=yearly_count, ax=ax2) | |
ax2.set_xlabel('Year', fontsize=14) | |
ax2.set_ylabel('Ratio', fontsize=14) | |
# Plot for Issue year vs Loan Amount | |
# making issue date as date time | |
a_lc['issue_d'] = pd.to_datetime(a_lc['issue_d']) | |
a_lc = a_lc[pd.notnull(a_lc['issue_d'])] | |
a_lc['issue_year'] = a_lc['issue_d'].dt.year | |
yamnt = pd.DataFrame({'amount' : a_lc.groupby(['issue_year', 'term']).sum()['loan_amnt']}).reset_index() | |
plt.figure(figsize=(12,6)) | |
sns.barplot(x="issue_year", y="amount", | |
hue="term", palette="rocket", linewidth=2.5, | |
data=yamnt) | |
# plot of DTI vs Year | |
ydti = pd.DataFrame({'dti' : a_lc.groupby(['year']).mean()['dti']}).reset_index() | |
plt.figure(figsize=(12,6)) | |
sns.lineplot(x="year", y="dti", | |
palette="rocket", linewidth=2.5, | |
data=ydti) | |
# Frequency distribution of Interest Rate | |
plt.figure(figsize=(12,6)) | |
g = sns.histplot(a_lc, x="int_rate",hue='grade', | |
multiple="stack", | |
palette="rocket", | |
edgecolor=".3",linewidth=.5 | |
) | |
g.set_xlabel("Interest Rate", fontsize=12) | |
g.set_ylabel("Frequency", fontsize=12) | |
g.set_title("Interest Rate Distribuition", fontsize=10) | |
plt.figure(figsize=(12,6)) | |
sns.barplot(x="emp_length", y="grade", | |
hue="term", palette="rocket", linewidth=2.5, | |
data=a_lc) | |
# Creating target feature loan_class to check Good and Bad Loan. Good is 0 and bad is 1 | |
a_lc["loan_class"] = np.where((a_lc.loan_category == 'Fully Paid'), 0, 1) | |
# Check counts of loan_class. Imbalanced data set | |
a_lc.loan_class.value_counts() | |
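# A rough sketch (not part of the original notebook): the class imbalance ratio computed here is the kind of
# value that scale_pos_weight is meant to offset in the boosted models further down (which hard-code 10).
counts = a_lc.loan_class.value_counts()
imbalance_ratio = counts[0] / counts[1]
imbalance_ratio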
# Total Loan Amount Cost | |
cost = a_lc.groupby('loan_class').loan_amnt.sum().sort_values(ascending=False) | |
# Showing the amounts in $bn | |
print("Fully paid loans: $"+str(round(cost[0]/1e9,1))+"bn") | |
print("Charged off loans: $"+str(round(cost[1]/1e9,1))+"bn") | |
# What is this as a percentage of the charge off rate, i.e. the cost of risk (CoR)? | |
default_cost = cost[1]/a_lc['loan_amnt'].sum() | |
print("cost of default is "+str(round(default_cost*100,1))+"%") | |
# Remove these columns based on data analysis | |
a_lc.drop(['grade','issue_d', 'loan_status', 'zip_code', 'earliest_cr_line', 'cred_year', 'title'], axis=1, inplace=True) | |
# Evaluating the loan counts for different states
state_count = a_lc.groupby(['addr_state']).count().reset_index()
state_count = state_count[['addr_state','loan_class']]
state_count = state_count.rename(columns={'loan_class': 'total'})
unique_count = a_lc.groupby(['addr_state','loan_class']).count().reset_index()
unique_count = unique_count[['addr_state','loan_class', 'id']]
m = pd.merge(unique_count, state_count, on="addr_state", how="left")
m['ratio'] = m['id'] / m['total']   # share of each loan class within the state
plt.figure(figsize=(20,6))
sns.barplot(x="addr_state", y="ratio",
            hue="loan_class", palette="rocket", linewidth=2.5,
            data=m) #drop state
#Encoding | |
print([column for column in a_lc.columns if a_lc[column].dtype == 'object']) | |
a_lc.drop(['id','addr_state'], axis =1, inplace=True) | |
sub_grades = sorted(a_lc.sub_grade.unique()) | |
grades = dict((i,j) for i,j in enumerate(sub_grades, start=1)) | |
grades = {grades[k]:k for k in grades} | |
a_lc_wo = a_lc.copy() | |
a_lc['sub_grade'] = a_lc['sub_grade'].map(grades) | |
verification_status = {'Not Verified':0, 'Source Verified':1, 'Verified':2} | |
a_lc['verification_status'] = a_lc['verification_status'].map(verification_status) | |
# Dummy variable created for the following features | |
categorical = ['home_ownership', 'purpose'] | |
a_lc =pd.get_dummies(a_lc, columns=categorical, drop_first=True) | |
print([column for column in a_lc.columns if a_lc[column].dtype == 'object']) | |
a_lc[['term', 'emp_length']] = a_lc[['term', 'emp_length']].apply(pd.to_numeric) | |
#Feature Engineering | |
a_lc['term_amnt'] = a_lc['loan_amnt']*a_lc['term'] #in order to calculate weighted average. longer term => takes longer to pay | |
weighted_average = a_lc['term_amnt'].sum()/a_lc['loan_amnt'].sum() | |
print(f'Average term: {round(a_lc.term.mean(),2)} months') | |
print(f'Weighted average term: {str(round(weighted_average,2))} months')
a_lc['int_amnt'] = a_lc['int_rate']*a_lc['loan_amnt'] | |
cumulative_avg_int_rate = a_lc['int_amnt'].sum()/a_lc['loan_amnt'].sum() | |
print(f'Average interest rate: {str(round(a_lc.int_rate.mean(),2))}%') | |
print(f'Weighted av. rate: {str(round(cumulative_avg_int_rate,2))}%') | |
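# Worked example of the weighting (made-up numbers, not from the data): two loans of $10k/36 months and
# $30k/60 months give (10000*36 + 30000*60) / 40000 = 54 months, versus a simple mean term of 48 months.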
a_lc.drop('loan_category', axis=1,inplace=True) | |
a_lc['loan_class'].value_counts() | |
# Countplot of Good Loans and Bad Loans | |
g= sns.countplot(a_lc["loan_class"], palette="rocket") | |
g.set_xticklabels(g.get_xticklabels(),rotation=0) | |
g.set_title("Good Loan and Bad Loan", fontsize=20) | |
g.set_xlabel("Loan Class", fontsize=15) | |
g.set_ylabel("Frequency", fontsize=15) | |
#Create pickle for cleaned data | |
loans = a_lc.copy() | |
import pickle | |
with open('cleaned_data.pkl', 'wb') as pickle_file: | |
pickle.dump(loans, pickle_file) | |
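# Optional sketch (assumes the same working directory): reload the cleaned data later without re-running the steps above.
with open('cleaned_data.pkl', 'rb') as pickle_file:
    loans = pickle.load(pickle_file)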
#split loans into train test groups based on year. Define X_train/test, Y_train/test | |
loans_train = loans[loans['year'] < 2018] | |
loans_test = loans[loans['year'] == 2018] | |
X_train = loans_train.loc[:, loans_train.columns != "loan_class"] | |
Y_train = loans_train["loan_class"] | |
X_test = loans_test.loc[:, loans_test.columns != "loan_class"] | |
Y_test = loans_test["loan_class"] | |
X_train = X_train.drop('year', axis=1)
X_test = X_test.drop('year', axis=1)
X_train.shape, X_test.shape
with open('loans_test.pkl', 'wb') as pickle_file: | |
pickle.dump(loans_test, pickle_file) | |
# Performance Metrics Calculation | |
# Assuming cost of funding is 3% | |
CoF =3 | |
#gives back necessary financial metrics of loan data for comparison | |
def return_stats(df, title): | |
""" | |
Function to return all of the financial returns data for comparing the models | |
    returns: weighted term, weighted interest, charge-off rate and amount, total CoR, and the total book ROI in %
""" | |
weighted_term = df['term_amnt'].sum() / df['loan_amnt'].sum() | |
weighted_interest = df['int_amnt'].sum() / df['loan_amnt'].sum() | |
charge_off_rate = df['loan_class'].value_counts(normalize=True) | |
charge_off_amount = df.groupby('loan_class').loan_amnt.sum() #dollar amount of each loan_class | |
total_CoR = charge_off_amount[1] / df['loan_amnt'].sum() #dollar amount of default loans | |
loans_approved = df.loan_amnt.sum() #total dollar amount of loans | |
loans_charged_off = charge_off_amount[1] #total dollar amount of default loans | |
# And therefore, the ROI: | |
loans_returned = loans_approved - loans_charged_off #total loan amount paid back | |
    interest_earned = loans_returned * (weighted_interest) / 100 * (weighted_term / 12) #simple interest earned over the weighted term
interest_paid = loans_approved * CoF / 100 * (weighted_term / 12) #CoF (cost of funds) | |
profit = interest_earned - interest_paid - loans_charged_off | |
ROI = profit / loans_approved | |
print(title) | |
    print(f'Weighted average term: {str(round(weighted_term,2))} months')
print(f'Weighted av. rate: {str(round(weighted_interest,2))}%') | |
print(f'Total cost of risk due to default: {str(round(total_CoR * 100, 2))}%') | |
print(f'Total loan book is: $ {str(round(loans_approved / 1e6))} M') | |
print(f'Interest earned is: $ {str(round(interest_earned / 1e6, 0))} M') | |
print(f'Interest paid is: $ {str(round(interest_paid / 1e6, 0))} M') | |
print(f'Profit is: $ {str(round(profit / 1e6, 0))} M') | |
print(f'Total book ROI: {str(round(ROI * 100, 2))} %') | |
return weighted_term, weighted_interest, charge_off_rate, charge_off_amount, total_CoR, ROI | |
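# Quick worked example of the ROI arithmetic above with made-up round numbers (not taken from the data):
# $100M approved at a 15% weighted rate over a 36-month weighted term, 10% charged off, CoF = 3%:
# interest_earned = 90e6 * 0.15 * 3 = $40.5M, interest_paid = 100e6 * 0.03 * 3 = $9.0M
# profit = 40.5M - 9.0M - 10.0M = $21.5M  ->  ROI ~ 21.5%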
# High level Performance metrics of all Loans | |
weighted_term, weighted_interest, charge_off_rate, charge_off_amount, total_CoR, ROI = return_stats(loans, 'Total book') | |
# High level Performance metrics of train data | |
weighted_term, weighted_interest, charge_off_rate, charge_off_amount, total_CoR, ROI = return_stats(loans_train, 'Training set loans')
# High level Performance metrics of test data | |
weighted_term_test, weighted_interest_test, charge_off_rate_test, charge_off_amount_test, total_CoR_test, ROI_test = return_stats(loans_test, 'Test set loans') | |
def model_performance(y_pred, column, total_CoR_test, df, CoF): | |
""" | |
Compares model prediction metrics against the metrics of the test set loans as well as CoR | |
y_pred - the model prediction of loan class | |
Returns interest rate, cost of risk and ROI of the book in % | |
""" | |
# Adding new column with the predicted charge off rate | |
df[column] = y_pred | |
y_test_paid = df.loc[df[column].isin([0])] | |
performance = y_test_paid.groupby('loan_class').loan_amnt.sum() | |
# CoR is the proportion that was charged off: | |
loans_approved = performance.sum() | |
loans_charged_off = performance[1] | |
CoR = loans_charged_off / loans_approved | |
saving = df['loan_amnt'].sum() * (total_CoR_test - CoR) | |
weighted_interest = y_test_paid['int_amnt'].sum() / y_test_paid['loan_amnt'].sum() | |
weighted_term = y_test_paid['term_amnt'].sum() / y_test_paid['loan_amnt'].sum() | |
# ROI | |
loans_returned = loans_approved - loans_charged_off | |
interest_earned = loans_returned * weighted_interest / 100 * (weighted_term / 12) | |
interest_paid = loans_approved * CoF / 100 * (weighted_term / 12) | |
profit = interest_earned - interest_paid - loans_charged_off | |
ROI = profit / loans_approved | |
print('Total loans approved = $' + str(round(loans_approved / 1e9, 3)) + 'bn') | |
print('Total loans charged off = $' + str(round(loans_charged_off / 1e9, 3)) + 'bn') | |
print('Modelled cost of risk is: ' + str(round(CoR * 100, 2)) + "%") | |
print("Total saving = $" + str(round(saving / 1e6, 1)) + "m\n") | |
print('Weighted average interest rate: ' + str(round(weighted_interest, 2)) + "%") | |
print('Weighted average term: ' + str(round(weighted_term, 2)) + " months\n") | |
print('Interest earned: $' + str(round(interest_earned / 1e6, 0)) + "m") | |
print('Interest paid: $' + str(round(interest_paid / 1e6, 0)) + "m") | |
print('Profit is: $' + str(round(profit / 1e6, 0)) + "m") | |
print('Return on investment is: ' + str(round(ROI * 100, 2)) + "%") | |
return weighted_interest, CoR, ROI | |
# Machine Learning - No tuning | |
# scaling the dataset | |
from sklearn.preprocessing import StandardScaler | |
sc = StandardScaler() | |
X_train = sc.fit_transform(X_train) | |
X_test=sc.transform(X_test) | |
# Applying Logistic Regression without class weight/sampling | |
logistic= LogisticRegression() | |
temp=logistic.fit(X_train, Y_train) | |
Y_pred= logistic.predict(X_test) | |
# Summary of the prediction | |
print(classification_report(Y_test, Y_pred)) | |
#print(confusion_matrix(Y_test, Y_pred)) | |
conf_matrix = confusion_matrix(Y_test, Y_pred) | |
# Accuracy | |
print("Accuracy of the model is: ", accuracy_score(Y_pred,Y_test)) | |
ax = sns.heatmap(conf_matrix, cmap='viridis_r', annot=True, fmt='d', square=True) | |
ax.set_title('Logistic regression performance') | |
ax.set_xlabel('Predicted') | |
ax.set_ylabel('True') | |
fpr, tpr, thresholds = roc_curve(Y_test, Y_pred) | |
roc_auc = auc(fpr,tpr) | |
# Plot ROC | |
plt.title('Receiver Operating Characteristic') | |
plt.plot(fpr, tpr, 'b',label='AUC = %0.3f'% roc_auc) | |
plt.legend(loc='lower right') | |
plt.plot([0,1],[0,1],'r--') | |
plt.xlim([-0.1,1.0]) | |
plt.ylim([-0.1,1.01]) | |
plt.ylabel('True Positive Rate') | |
plt.xlabel('False Positive Rate') | |
plt.show() | |
# Apply performance metrics function | |
logistic_int_rate, logistic_CoR, logistic_ROI = model_performance(Y_pred,'logistic model (no tuning)', total_CoR_test, loans_test, CoF) | |
# Apply XGBoosting model on train data | |
import xgboost as xgb | |
xg = xgb.XGBClassifier() | |
n_estimators = 100 | |
xg.set_params(max_depth=4, n_estimators=n_estimators) | |
train_r2 = [] | |
steps = range(100,1000,10) | |
xg.fit(X_train,Y_train) | |
Y_pred = xg.predict(X_test) | |
# Summary of the prediction | |
print(classification_report(Y_test, Y_pred)) | |
#print(confusion_matrix(Y_test, Y_pred)) | |
conf_matrix = confusion_matrix(Y_test, Y_pred) | |
# Accuracy | |
print("Accuracy of the model is: ", accuracy_score(Y_pred,Y_test)) | |
ax = sns.heatmap(conf_matrix, cmap='viridis_r', annot=True, fmt='d', square=True) | |
ax.set_title('XGBoost performance') | |
ax.set_xlabel('Predicted') | |
ax.set_ylabel('True') | |
with open('y_pred_xg.pkl', 'wb') as pickle_file: | |
pickle.dump(Y_pred, pickle_file) | |
# ROC-AUC for XGBoost | |
fpr, tpr, thresholds = roc_curve(Y_test, Y_pred) | |
roc_auc = auc(fpr,tpr) | |
# Plot ROC | |
plt.title('Receiver Operating Characteristic') | |
plt.plot(fpr, tpr, 'b',label='AUC = %0.3f'% roc_auc) | |
plt.legend(loc='lower right') | |
plt.plot([0,1],[0,1],'r--') | |
plt.xlim([-0.1,1.0]) | |
plt.ylim([-0.1,1.01]) | |
plt.ylabel('True Positive Rate') | |
plt.xlabel('False Positive Rate') | |
plt.show() | |
# Apply performance metrics function on XGBoost | |
xg_int_rate, xg_CoR, xg_ROI = model_performance(Y_pred,'XGBoost (no tuning)', total_CoR_test, loans_test, CoF) | |
#Apply Catboost algorithm | |
from catboost import CatBoostClassifier | |
clf = CatBoostClassifier( | |
learning_rate=0.1, | |
#loss_function='CrossEntropy' | |
) | |
clf.fit(X_train,Y_train) | |
Y_pred = clf.predict(X_test) | |
# Summary of the prediction | |
print(classification_report(Y_test, Y_pred)) | |
#print(confusion_matrix(Y_test, Y_pred)) | |
conf_matrix = confusion_matrix(Y_test, Y_pred) | |
# Accuracy | |
print("Accuracy of the model is: ", accuracy_score(Y_pred,Y_test)) | |
ax = sns.heatmap(conf_matrix, cmap='viridis_r', annot=True, fmt='d', square=True) | |
ax.set_title('Catboost performance') | |
ax.set_xlabel('Predicted') | |
ax.set_ylabel('True') | |
# ROC-AUC for CatBoost | |
fpr, tpr, thresholds = roc_curve(Y_test, Y_pred) | |
roc_auc = auc(fpr,tpr) | |
# Plot ROC | |
plt.title('Receiver Operating Characteristic') | |
plt.plot(fpr, tpr, 'b',label='AUC = %0.3f'% roc_auc) | |
plt.legend(loc='lower right') | |
plt.plot([0,1],[0,1],'r--') | |
plt.xlim([-0.1,1.0]) | |
plt.ylim([-0.1,1.01]) | |
plt.ylabel('True Positive Rate') | |
plt.xlabel('False Positive Rate') | |
# Apply performance metrics function on CatBoost | |
clf_int_rate, clf_CoR, clf_ROI = model_performance(Y_pred,'CatBoost', total_CoR_test, loans_test, CoF) | |
from sklearn.ensemble import RandomForestClassifier | |
rf = RandomForestClassifier(n_estimators = 500, criterion = 'gini') | |
rf.fit(X_train, Y_train) | |
Y_Pred= rf.predict(X_test) | |
# rf_probs = rf.predict_proba(Y_test)[:, 1] | |
print(classification_report(Y_test, Y_Pred)) | |
#print(confusion_matrix(Y_test, rf_predictions)) | |
conf_matrix = confusion_matrix(Y_test, Y_Pred) | |
# Accuracy | |
print("Accuracy of the model is: ", accuracy_score(Y_Pred,Y_test)) | |
# print("Precision score: ", precision_score(Y_test,Y_Pred)) | |
# print("Recall score: {}", recall_score(Y_test,Y_Pred)) | |
# roc_auc_score(Y_test, rf_probs) | |
ax = sns.heatmap(conf_matrix, cmap='viridis_r', annot=True, fmt='d', square=True) | |
ax.set_title('RandomForest performance') | |
ax.set_xlabel('Predicted') | |
ax.set_ylabel('True') | |
# ROC-AUC for Random Forest | |
fpr, tpr, thresholds = roc_curve(Y_test, Y_Pred)  # use the Random Forest predictions (Y_Pred)
roc_auc = auc(fpr,tpr) | |
# Plot ROC | |
plt.title('Receiver Operating Characteristic') | |
plt.plot(fpr, tpr, 'b',label='AUC = %0.3f'% roc_auc) | |
plt.legend(loc='lower right') | |
plt.plot([0,1],[0,1],'r--') | |
plt.xlim([-0.1,1.0]) | |
plt.ylim([-0.1,1.01]) | |
plt.ylabel('True Positive Rate') | |
plt.xlabel('False Positive Rate') | |
plt.show() | |
# Apply performance metrics function on RandomForest | |
rf_int_rate, rf_CoR, rf_ROI = model_performance(Y_Pred,'R Forest', total_CoR_test, loans_test, CoF)
# Naive Bayes | |
naive= GaussianNB() | |
naive.fit(X_train, Y_train) | |
Y_pred= naive.predict(X_test) | |
# Summary of prediction | |
print(classification_report(Y_test, Y_pred)) | |
#print(confusion_matrix(Y_test, Y_pred)) | |
conf_matrix = confusion_matrix(Y_test, Y_pred)
# Accuracy score | |
print("Accuracy of the model is: ", accuracy_score(Y_pred,Y_test)) | |
ax = sns.heatmap(conf_matrix, cmap='viridis_r', annot=True, fmt='d', square=True) | |
ax.set_title('Naive Bayes performance')
ax.set_xlabel('Predicted') | |
ax.set_ylabel('True') | |
# ROC-AUC for Naive Bayes | |
fpr, tpr, thresholds = roc_curve(Y_test, Y_pred) | |
roc_auc = auc(fpr,tpr) | |
# Plot ROC | |
plt.title('Receiver Operating Characteristic') | |
plt.plot(fpr, tpr, 'b',label='AUC = %0.3f'% roc_auc) | |
plt.legend(loc='lower right') | |
plt.plot([0,1],[0,1],'r--') | |
plt.xlim([-0.1,1.0]) | |
plt.ylim([-0.1,1.01]) | |
plt.ylabel('True Positive Rate') | |
plt.xlabel('False Positive Rate') | |
plt.show() | |
# Apply performance metrics function on Naive Bayes | |
naive_int_rate, naive_CoR, naive_ROI = model_performance(Y_pred,'Naive Bayes', total_CoR_test, loans_test, CoF) | |
from sklearn.metrics import precision_recall_curve | |
def threshold_calculation(model, threshold_list): | |
# pred_proba_df = pd.DataFrame(model.predict_proba(X_test)) | |
for i in threshold_list: | |
print ('\n******** For i = {} ******'.format(i)) | |
Y_test_pred = (model.predict_proba(X_test)[:,1]>i).astype("int32") | |
test_accuracy = accuracy_score(Y_test, Y_test_pred) | |
print('Our testing accuracy is {}'.format(test_accuracy)) | |
print(confusion_matrix(Y_test, Y_test_pred)) | |
#plot precision/recall curve against different thresholds | |
pred_y = model.predict(X_test) | |
probs_y = model.predict_proba(X_test) # probs_y is probability of being labeled as 0 (first column of array) vs 1 (2nd column in array) | |
precision, recall, thresholds = precision_recall_curve(Y_test, probs_y[:, 1]) | |
#retrieve probability of being 1(in second column of probs_y) | |
pr_auc = auc(recall, precision) | |
plt.subplot(1, 2, 1) | |
plt.title("Precision-Recall vs Threshold Chart") | |
plt.plot(thresholds, precision[: -1], "b--", label="Precision") | |
plt.plot(thresholds, recall[: -1], "r--", label="Recall") | |
plt.ylabel("Precision, Recall") | |
plt.xlabel("Threshold") | |
plt.legend(loc="lower left") | |
plt.ylim([0,1]) | |
plt.subplot(1, 2, 2) | |
no_skill = len(Y_test[Y_test==1]) / len(Y_test) | |
plt.title("Precision-Recall") | |
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill') | |
plt.plot(recall, precision, marker='.', label='Logistic') | |
plt.xlabel('Recall') | |
plt.ylabel('Precision') | |
plt.legend() | |
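# threshold_calculation is defined above but never invoked in this notebook; an illustrative call against the
# plain logistic model fitted earlier (the threshold values are chosen arbitrarily) would look like:
threshold_calculation(logistic, [0.3, 0.4, 0.5, 0.6])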
#Imbalanced Data - Tuning with Class Weight and Undersampling | |
# Exploring best class weight for logistic regression | |
from sklearn.model_selection import GridSearchCV,StratifiedKFold | |
import collections, numpy | |
lr = LogisticRegression() | |
# n = collections.Counter(Y_train) | |
# class_weights = {0:1 - n[0]/(n[0]+n[1]), 1:1 - n[1]/(n[0]+n[1])} | |
# penalty = ['l1', 'l2'] | |
# C = np.logspace(0, 4, 10) | |
# param_grid = dict(C=C, penalty=penalty) | |
#Setting the range for class weights | |
weights = np.linspace(0.0,0.5) | |
#Creating a dictionary grid for grid search | |
param_grid = {'class_weight': [{0:x, 1:1.0-x} for x in weights]} | |
#Fitting grid search to the train data with 5 folds | |
gridsearch = GridSearchCV(estimator= lr, | |
param_grid= param_grid, | |
cv=StratifiedKFold(), | |
n_jobs=-1, | |
scoring='f1', | |
verbose=2).fit(X_train, Y_train) | |
# logistic_cw = LogisticRegression(class_weight = class_weights, max_iter=1000) | |
# logistic_cw_grid = GridSearchCV(logistic_cw, param_grid, cv=5, scoring='roc_auc', verbose=10, n_jobs=-1) | |
# logistic_cw_grid.fit(X_train, Y_train) | |
#Plotting the score for different values of weight
sns.set_style('whitegrid') | |
plt.figure(figsize=(12,8)) | |
weigh_data = pd.DataFrame({ 'score': gridsearch.cv_results_['mean_test_score'], 'weight': (1- weights)}) | |
sns.lineplot(weigh_data['weight'], weigh_data['score']) | |
plt.xlabel('Weight for class 1') | |
plt.ylabel('F1 score') | |
plt.xticks([round(i/10,1) for i in range(0,11,1)]) | |
plt.title('Scoring for different class weights', fontsize=24) | |
# Exploring class-weight - Applying Logistic Regression with class_weight | |
n = Y_train.value_counts() | |
#class_weights = {0:1- n[0]/n.sum(), 1:1- n[1]/n.sum()} | |
# class_weights | |
# Applying Logistic Regression on class weights (final ratio) | |
from sklearn.metrics import precision_score | |
logistic_cw = LogisticRegression(class_weight = {0: 0.2040816326530612, 1: 0.7959183673469388}) | |
logistic_cw.fit(X_train, Y_train) | |
Y_pred_cw = logistic_cw.predict(X_test) | |
# Summary of the prediction | |
print(classification_report(Y_test, Y_pred_cw)) | |
print(confusion_matrix(Y_test, Y_pred_cw)) | |
# Accuracy | |
print("Accuracy of the model is: ", accuracy_score(Y_pred_cw,Y_test)) | |
print("Precision score: {}".format(precision_score(Y_test,Y_pred_cw))) | |
print("Recall score: {}".format(recall_score(Y_test,Y_pred_cw))) | |
conf_matrix = confusion_matrix(Y_test, Y_pred_cw) | |
roc_auc_score(Y_test, logistic_cw.predict_proba(X_test)[:,1]) | |
# ROC-AUC for Logistic regression + Class Weight | |
fpr, tpr, thresholds = roc_curve(Y_test, Y_pred_cw) | |
roc_auc = auc(fpr,tpr) | |
# Plot ROC | |
plt.title('Receiver Operating Characteristic') | |
plt.plot(fpr, tpr, 'b',label='AUC = %0.3f'% roc_auc) | |
plt.legend(loc='lower right') | |
plt.plot([0,1],[0,1],'r--') | |
plt.xlim([-0.1,1.0]) | |
plt.ylim([-0.1,1.01]) | |
plt.ylabel('True Positive Rate') | |
plt.xlabel('False Positive Rate') | |
plt.show() | |
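# Note: roc_auc_score above is computed from predicted probabilities, while the curve just plotted uses hard
# 0/1 predictions, so the two AUC figures differ. A sketch of the probability-based curve for the same model:
probs_cw = logistic_cw.predict_proba(X_test)[:, 1]
fpr_p, tpr_p, _ = roc_curve(Y_test, probs_cw)
plt.title('Receiver Operating Characteristic (probability-based)')
plt.plot(fpr_p, tpr_p, 'b', label='AUC = %0.3f' % auc(fpr_p, tpr_p))
plt.plot([0, 1], [0, 1], 'r--')
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()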
# Performance metrics for Logistic regression + Class Weight | |
logistic_cw_int_rate, logistic_cw_CoR, logistic_cw_RoE = model_performance(Y_pred_cw,'logistic model (class weights)', total_CoR_test, loans_test, CoF) | |
# Apply XGBoosting model on train data with scale_pos_weight | |
import xgboost as xgb | |
xg = xgb.XGBClassifier() | |
n_estimators = 100 | |
xg.set_params(max_depth=4, n_estimators=n_estimators,scale_pos_weight=10) | |
train_r2 = [] | |
steps = range(100,1000,10) | |
xg.fit(X_train,Y_train) | |
Y_pred = xg.predict(X_test) | |
# Summary of the prediction | |
print(classification_report(Y_test, Y_pred)) | |
#print(confusion_matrix(Y_test, Y_pred)) | |
conf_matrix = confusion_matrix(Y_test, Y_pred) | |
# Accuracy | |
print("Accuracy of the model is: ", accuracy_score(Y_pred,Y_test)) | |
ax = sns.heatmap(conf_matrix, cmap='viridis_r', annot=True, fmt='d', square=True) | |
ax.set_title('XGBoost performance') | |
ax.set_xlabel('Predicted') | |
ax.set_ylabel('True') | |
plt.show() | |
# ROC-AUC for XGBoost - Tuned | |
fpr, tpr, thresholds = roc_curve(Y_test, Y_pred) | |
roc_auc = auc(fpr,tpr) | |
# Plot ROC | |
plt.title('Receiver Operating Characteristic') | |
plt.plot(fpr, tpr, 'b',label='AUC = %0.3f'% roc_auc) | |
plt.legend(loc='lower right') | |
plt.plot([0,1],[0,1],'r--') | |
plt.xlim([-0.1,1.0]) | |
plt.ylim([-0.1,1.01]) | |
plt.ylabel('True Positive Rate') | |
plt.xlabel('False Positive Rate') | |
plt.show() | |
# Performance metrics for XGBoost with scale_pos_weight (predictions from the tuned XGBoost model above)
xg_cw_int_rate, xg_cw_CoR, xg_cw_RoE = model_performance(Y_pred,'XGBoost (class weights)', total_CoR_test, loans_test, CoF)
#Neural Net | |
# Apply Neural Network algorithm | |
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.layers import Dense, Activation, Dropout
#from tensorflow.keras.optimizers import SGD
input_nodes = X_train.shape[1] | |
#output_nodes = 1 | |
model = Sequential() | |
#model.add(Input((input_nodes,))) | |
model.add(Dense(units=16,activation="sigmoid")) | |
model.add(Dense(units=32, activation="sigmoid")) | |
model.add(Dense(units=2, activation="softmax")) | |
#model.add(Dense(output_nodes)) | |
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=['accuracy']) | |
model.fit(X_train, Y_train, batch_size=20, validation_split=0.1, epochs=50,verbose=2) | |
Y_pred = model.predict(X_test) | |
accuracy = accuracy_score(Y_test, np.argmax(Y_pred, axis=1)) | |
print('Accuracy: {0:.2f}'.format(accuracy * 100.0)) | |
print('Classification Report:') | |
print(classification_report(Y_test, np.argmax(Y_pred, axis=1))) | |
print('Confusion Matrix:') | |
#print(confusion_matrix(Y_test, np.argmax(Y_pred, axis=1))) | |
conf_matrix = confusion_matrix(Y_test, np.argmax(Y_pred, axis=1)) | |
# Accuracy | |
#print("Accuracy of the model is: ", accuracy_score(Y_pred_us,Y_test)) | |
ax = sns.heatmap(conf_matrix, cmap='viridis_r', annot=True, fmt='d', square=True) | |
ax.set_title('Neural Net performance') | |
ax.set_xlabel('Predicted') | |
ax.set_ylabel('True') | |
# ROC-AUC for Neural Net | |
fpr, tpr, thresholds = roc_curve(Y_test, np.argmax(Y_pred, axis=1)) | |
roc_auc = auc(fpr,tpr) | |
# Plot ROC | |
plt.title('Receiver Operating Characteristic') | |
plt.plot(fpr, tpr, 'b',label='AUC = %0.3f'% roc_auc) | |
plt.legend(loc='lower right') | |
plt.plot([0,1],[0,1],'r--') | |
plt.xlim([-0.1,1.0]) | |
plt.ylim([-0.1,1.01]) | |
plt.ylabel('True Positive Rate') | |
plt.xlabel('False Positive Rate') | |
plt.show() | |
nn_int_rate, nn_us_CoR, nn_us_RoE = model_performance(np.argmax(Y_pred, axis=1),'Neural Net', total_CoR_test, loans_test, CoF) | |
#Simulation | |
#gives back necessary financial metrics of loan data for comparison | |
def ROI_total(df): | |
""" | |
Function to return all of the financial returns data for comparing the models | |
    returns: the total book ROI in %
""" | |
weighted_term = df['term_amnt'].sum() / df['loan_amnt'].sum() | |
weighted_interest = df['int_amnt'].sum() / df['loan_amnt'].sum() | |
charge_off_rate = df['loan_class'].value_counts(normalize=True) | |
charge_off_amount = df.groupby('loan_class').loan_amnt.sum() #dollar amount of each loan_class | |
total_CoR = charge_off_amount[1] / df['loan_amnt'].sum() #dollar amount of default loans | |
loans_approved = df.loan_amnt.sum() #total dollar amount of loans | |
loans_charged_off = charge_off_amount[1] #total dollar amount of default loans | |
    # And therefore, the ROI:
    loans_returned = loans_approved - loans_charged_off #total loan amount paid back
    interest_earned = loans_returned * (weighted_interest) / 100 * (weighted_term / 12) #simple interest earned over the weighted term
interest_paid = loans_approved * CoF / 100 * (weighted_term / 12) #CoF (cost of funds) | |
profit = interest_earned - interest_paid - loans_charged_off | |
ROI = profit / loans_approved | |
return ROI | |
from sklearn.utils import shuffle | |
def ROI_model(y_pred, column, n): | |
""" | |
Compares model prediction metrics against the metrics of the test set loans as well as CoR | |
y_pred - the model prediction of loan class | |
Returns interest rate, cost of risk and ROI of the book in % | |
""" | |
df = loans_test | |
COF = 3 | |
a, b, c, d, e, f, g = n[0],n[1],n[2],n[3],n[4],n[5],n[6] | |
# Adding new column with the predicted charge off rate | |
df[column] = y_pred | |
y_test_paid = df.loc[df[column].isin([0])] | |
a_loans = shuffle(y_test_paid.loc[(y_test_paid['sub_grade'] >= 1) & (y_test_paid['sub_grade'] <= 5)])[0:a] | |
b_loans = shuffle(y_test_paid.loc[(y_test_paid['sub_grade'] >= 6) & (y_test_paid['sub_grade'] <= 10)])[0:b] | |
c_loans = shuffle(y_test_paid.loc[(y_test_paid['sub_grade'] >= 11) & (y_test_paid['sub_grade'] <= 15)])[0:c] | |
d_loans = shuffle(y_test_paid.loc[(y_test_paid['sub_grade'] >= 16) & (y_test_paid['sub_grade'] <= 20)])[0:d] | |
e_loans = shuffle(y_test_paid.loc[(y_test_paid['sub_grade'] >= 21) & (y_test_paid['sub_grade'] <= 25)])[0:e] | |
f_loans = shuffle(y_test_paid.loc[(y_test_paid['sub_grade'] >= 26) & (y_test_paid['sub_grade'] <= 30)])[0:f] | |
g_loans = shuffle(y_test_paid.loc[(y_test_paid['sub_grade'] >= 31) & (y_test_paid['sub_grade'] <= 35)])[0:g] | |
y_test_paid = pd.concat([a_loans, b_loans, c_loans, d_loans, e_loans, f_loans, g_loans]) | |
performance = y_test_paid.groupby('loan_class').loan_amnt.sum() | |
# CoR is the proportion that was charged off: | |
loans_approved = performance.sum() | |
loans_charged_off = performance[1] | |
CoR = loans_charged_off / loans_approved | |
saving = df['loan_amnt'].sum() * (total_CoR_test - CoR) | |
weighted_interest = y_test_paid['int_amnt'].sum() / y_test_paid['loan_amnt'].sum() | |
weighted_term = y_test_paid['term_amnt'].sum() / y_test_paid['loan_amnt'].sum() | |
# ROI | |
loans_returned = loans_approved - loans_charged_off | |
interest_earned = loans_returned * weighted_interest / 100 * (weighted_term / 12) | |
interest_paid = loans_approved * CoF / 100 * (weighted_term / 12) | |
profit = interest_earned - interest_paid - loans_charged_off | |
ROI = (profit / loans_approved) | |
return ROI | |
# The per-grade sample sizes below are illustrative placeholders (counts of A-G loans drawn into each
# simulated portfolio); the class-weighted logistic regression predictions from above are used as the model output.
n = [40, 25, 15, 10, 5, 3, 2]
ROI_model(Y_pred_cw, 'logistic model (class weights)', n)
# Model ROI calculation on shuffled loans
model_ROI = []
lc_ROI = []
for i in range(10):
    loan_shuffled = shuffle(loans)
    l_ROI = ROI_total(loan_shuffled[0:100])
    lc_ROI.append(l_ROI)
    # ROI_model reshuffles the approved loans internally, so repeated calls sample different simulated books
    m_ROI = ROI_model(Y_pred_cw, 'logistic model (class weights)', n)
    model_ROI.append(m_ROI)
ROI_df = pd.DataFrame() | |
ROI_df['model_ROI'] = model_ROI | |
ROI_df['lc_ROI'] = lc_ROI | |
# Plot histogram of benchmark and Model ROI
fig, ax = plt.subplots(figsize=(14,7)) | |
ax = sns.distplot(tuple(ROI_df['model_ROI'])) | |
ax = sns.distplot(tuple(ROI_df['lc_ROI'])) | |
ax.set_xlabel("Rate of Return",fontsize=16) | |
ax.set_ylabel("Frequency",fontsize=16) | |
plt.legend(labels=["Model ROI", 'Benchmark (Lending Club) ROI'], fontsize=16) | |
plt.savefig(fname='ROI_Hist', dpi=150) | |
plt.show() |
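# Optional wrap-up (not in the original): a side-by-side of the book-level ROI figures collected for each model
# above, next to the untouched test-set benchmark. Variable names assume every earlier cell has been run.
pd.Series({'Logistic': logistic_ROI, 'XGBoost': xg_ROI, 'CatBoost': clf_ROI,
           'Random Forest': rf_ROI, 'Naive Bayes': naive_ROI,
           'Logistic (class weights)': logistic_cw_RoE, 'XGBoost (scale_pos_weight)': xg_cw_RoE,
           'Neural Net': nn_us_RoE, 'Test set (no model)': ROI_test}).sort_values(ascending=False)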