Chaitali20-gh/LendingClub

## LendingClub
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report
import pickle
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 500)

# Read the accepted-loan data from the gzip-compressed CSV export
accepted = pd.read_csv('accepted_2007_to_2018Q4.csv.gz', compression='gzip', low_memory=False)
# Work on a copy so the raw frame stays available unchanged
a_lc = accepted.copy()

# Read the rejected-loan data from the gzip-compressed CSV export
# (no explicit compression argument is passed for this file)
r_lc = pd.read_csv("rejected_2007_to_2018Q4.csv.gz", low_memory=False)

# Preview the first rows of the accepted-loan data
a_lc.head()
# Check the size (rows, columns)
a_lc.shape

# Remove the columns that are not used in the rest of the analysis.
# The frame is modified in place (inplace=True).
# NOTE(review): 'deferral_term', 'pymnt_plan' and 'acc_open_past_24mths' are
# each listed twice below; the duplicates look redundant and could be removed.
a_lc.drop(['all_util', 'bc_util','acc_open_past_24mths','url','revol_bal_joint','sec_app_fico_range_low','sec_app_fico_range_high',
           'sec_app_earliest_cr_line','sec_app_inq_last_6mths','sec_app_mort_acc','sec_app_open_acc','sec_app_revol_util',
           'sec_app_open_act_il','sec_app_num_rev_accts','sec_app_chargeoff_within_12_mths','sec_app_collections_12_mths_ex_med',
           'sec_app_mths_since_last_major_derog','hardship_type','hardship_reason','hardship_status','deferral_term','deferral_term',
           'hardship_amount','hardship_start_date','hardship_end_date','payment_plan_start_date','hardship_length',
           'hardship_dpd','hardship_loan_status','orig_projected_additional_accrued_interest','hardship_payoff_balance_amount',
           'hardship_last_payment_amount','debt_settlement_flag_date','settlement_status','settlement_date','settlement_amount',
           'settlement_percentage','settlement_term','funded_amnt','funded_amnt_inv','pymnt_plan','pymnt_plan','hardship_flag',
           'total_pymnt','total_pymnt_inv','total_rec_int','total_rec_late_fee','total_rec_prncp','out_prncp',
           'out_prncp_inv','recoveries','collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'last_credit_pull_d',
           'last_fico_range_high', 'last_fico_range_low', 'collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq',
           'total_rev_hi_lim', 'acc_open_past_24mths','bc_open_to_buy', 'chargeoff_within_12_mths', 'delinq_amnt',
           'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op','mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mths_since_recent_bc', 'mths_since_recent_inq',
           'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl',
           'num_op_rev_tl','num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd',
           'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'pct_tl_nvr_dlq','tax_liens', 'tot_hi_cred_lim',
           'total_bc_limit', 'total_il_high_credit_limit','disbursement_method',
           'debt_settlement_flag','verification_status_joint','dti_joint','annual_inc_joint',
          'mths_since_recent_bc_dlq','mths_since_recent_revol_delinq',
          'next_pymnt_d','mths_since_last_delinq','il_util','mths_since_rcnt_il',
          'open_acc_6m','total_cu_tl','inq_last_12m','open_il_24m','open_act_il','total_bal_il','open_rv_12m',
          'open_rv_24m','open_il_12m','emp_title','max_bal_bc','desc','member_id',
          'num_rev_accts', 'initial_list_status', 'application_type', 'percent_bc_gt_75'], axis=1,inplace=True)

# Per-column summary of missing values: absolute count and percentage of rows.
# The null mask is built once and reused for both figures.
null_mask = a_lc.isnull()
null_totals = null_mask.sum()
missing = null_totals.sort_values(ascending=False)
percent = (null_totals / null_mask.count()).sort_values(ascending=False) * 100
missing_data = pd.concat([missing, percent], axis=1, keys=["Total", "Percent"])
missing_data.head(100)

# Remove the rows where home ownership data is missing.
# `~` is the negation operator for boolean masks; unary `-` is an arithmetic
# operator and should not be relied on to invert a boolean Series.
a_lc = a_lc[~a_lc.home_ownership.isna()]

# Drop the columns with a significant share of missing values
a_lc.drop(['mths_since_last_record','mths_since_last_major_derog'], axis=1, inplace=True)

# Inspect the loans whose 2-year delinquency count is missing
a_lc.loc[a_lc.delinq_2yrs.isnull()]

# Inspect the loans whose total account count is missing
a_lc.loc[a_lc.total_acc.isnull()]

# Missing value imputation - most frequent value
# These columns (the categorical features plus a few low-cardinality numeric
# ones) get their mode in place of missing entries.
mode_filled = ['inq_fi','emp_length','mort_acc','title','pub_rec_bankruptcies','zip_code','int_rate','open_acc','verification_status',
               'term','grade','sub_grade','home_ownership','loan_status','purpose','addr_state','issue_d','earliest_cr_line']
for col in mode_filled:
    most_frequent = a_lc[col].mode()[0]
    a_lc[col] = a_lc[col].fillna(most_frequent)

# Missing value imputation - median
# These numeric columns get their median in place of missing entries.
# ('open_acc' was already filled with its mode above, so nothing is left to fill.)
median_filled = ['avg_cur_bal','tot_cur_bal','total_bal_ex_mort','dti','revol_util','total_acc','annual_inc','inq_last_6mths','delinq_2yrs',
                 'open_acc','pub_rec','tot_coll_amt','loan_amnt','fico_range_low','fico_range_high']
for col in median_filled:
    middle_value = a_lc[col].median()
    a_lc[col] = a_lc[col].fillna(middle_value)

# Rebuild the missing-value summary after imputation to confirm what is left
null_mask = a_lc.isnull()
null_totals = null_mask.sum()
missing = null_totals.sort_values(ascending=False)
percent = (null_totals / null_mask.count()).sort_values(ascending=False) * 100
missing_data = pd.concat([missing, percent], axis=1, keys=["Total", "Percent"])
missing_data

# Loan counts grouped by loan status. The column is selected before counting
# so only 'loan_amnt' is aggregated instead of every column in the frame.
ls = a_lc.groupby('loan_status')['loan_amnt'].count()
ls

# Number of loans per loan status
plt.figure(figsize = (8,4))
g = sns.countplot(x="loan_status", data=a_lc, palette="rocket")
g.set_xticklabels(g.get_xticklabels(), rotation=75)
g.set_title("Loan Status", fontsize=20)
g.set_xlabel("Loan Status", fontsize=15)
# A countplot shows the number of rows per category, not an amount
g.set_ylabel("Count", fontsize=20)

# Keep the current loans aside for inspection
current = a_lc.loc[a_lc.loan_status == 'Current']

# Exclude the current loans from the data set
a_lc = a_lc[a_lc.loan_status != 'Current']
a_lc.shape

# Create loan_category as an intermediate feature for visualisation:
# both "fully paid" statuses map to 'Fully Paid', everything else to 'Charged Off'
fully_paid_statuses = ['Fully Paid', 'Does not meet the credit policy. Status:Fully Paid']
a_lc["loan_category"] = np.where(a_lc.loan_status.isin(fully_paid_statuses), 'Fully Paid', 'Charged Off')

# Exclude grace period and late payment statuses as those are ambiguous to predict
a_lc = a_lc.loc[a_lc['loan_status'].isin(['Fully Paid','Charged Off',
                                           'Does not meet the credit policy. Status:Fully Paid',
                                           'Does not meet the credit policy. Status:Charged Off',
                                           'Default'])]

# Sort the data set by issue date.
# issue_d is still text at this point (month abbreviation first, four-digit
# year last), so a plain sort would order it alphabetically by month name.
# The key parses it the same way make_dateval does further down.
a_lc = a_lc.sort_values(
    by='issue_d',
    key=lambda s: pd.to_datetime(s.str[-4:] + s.str[:3] + '01', format='%Y%b%d'),
)
a_lc

# Clean the text columns so that only the numeric part is left
a_lc['home_ownership'] = a_lc['home_ownership'].replace(['NONE', 'ANY'], 'OTHER')
# Remove the unit and the whitespace left around the number
a_lc['term'] = a_lc['term'].replace({'months': ''}, regex=True).str.strip()
a_lc['emp_length'] = a_lc['emp_length'].replace('< 1 year', '0 years')
a_lc['emp_length'] = a_lc['emp_length'].replace('10+ years', '10 years')
# Raw string: '\+' in a normal string literal is an invalid escape sequence
a_lc['emp_length'] = a_lc['emp_length'].replace({'year': '', 's': '', r'\+': ''}, regex=True).str.strip()

# Check the loan category counts
a_lc['loan_category'].value_counts()

# Column-name lists used to drive the plotting helpers below

# Numeric features: passed one by one to numeric_plot
numeric_var = ['loan_amnt', 'int_rate', 'installment', 'delinq_2yrs','annual_inc','dti',
               'fico_range_low', 'fico_range_high', 'open_acc', 'total_acc', 'pub_rec', 'revol_bal', 'revol_util',
               'tot_coll_amt', 'tot_cur_bal', 'inq_last_6mths', 'inq_fi', 'avg_cur_bal', 'mort_acc',
               'pub_rec_bankruptcies', 'total_bal_ex_mort']

# Ordered categorical features: passed one by one to categorical_plot
ordinal_var = ['term', 'grade', 'sub_grade', 'emp_length', 'verification_status']

# Unordered categorical features (only some of them are plotted further down)
nominal_var = ['home_ownership','purpose', 'title', 'zip_code', 'addr_state']

# Identifier, date and target-related columns
other = ['id', 'issue_d','loan_category', 'loan_status', 'earliest_cr_line']

#function for numeric features plot
def numeric_plot(v):
    """Plot a numeric column's distribution and its spread per loan category.

    The left panel is a histogram of the non-null values of ``a_lc[v]``; the
    right panel is a box plot of ``a_lc[v]`` for each ``loan_category``.
    Reads the module-level ``a_lc`` DataFrame.

    Parameters
    ----------
    v : str
        Name of the numeric column of ``a_lc`` to plot.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 10))
    # histplot is the replacement for the deprecated distplot(kde=False)
    sns.histplot(x=a_lc.loc[a_lc[v].notnull(), v], kde=False, ax=ax1)
    ax1.set_title(f'Loan Status vs {v}')
    ax1.set_xlabel(v)
    ax1.set_ylabel('Count')

    sns.boxplot(x='loan_category', y=v, data=a_lc, ax=ax2)
    # The categories are on the x axis and the feature values on the y axis
    ax2.set_xlabel('loan_category')
    ax2.set_ylabel(v)
    ax2.set_title(v + ' by Loan Status')

    plt.tight_layout()

#function for categorical features plot
def categorical_plot(v):
    """Plot a categorical column's level counts and its fully-paid rate.

    The left panel counts the rows per level of ``a_lc[v]``; the right panel
    shows, per level, the fraction of loans whose ``loan_category`` is
    'Fully Paid'. Reads the module-level ``a_lc`` DataFrame.

    Parameters
    ----------
    v : str
        Name of the categorical column of ``a_lc`` to plot.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    # Pass the column by keyword: seaborn no longer accepts the data vector
    # as a positional argument.
    sns.countplot(x=a_lc[v], palette='rocket', ax=ax1)
    ax1.set_title(f'Loan Status vs {v}')
    ax1.set_xlabel(v)
    ax1.set_ylabel('Count')

    # Share of each loan category within every level, then keep 'Fully Paid'
    good_rates = a_lc.groupby(v)['loan_category'].value_counts(normalize=True).loc[:, 'Fully Paid']
    sns.barplot(x=good_rates.index, y=good_rates.values, palette='rocket', ax=ax2)
    ax2.set_ylabel('Fraction of Good Loans')
    ax2.set_title('Fully Paid Rate by ' + v)
    ax2.set_xlabel(v)
    ax2.set_ylim(0, 1)

    plt.tight_layout()

# Distribution and box plots for every numeric feature
for feature in numeric_var:
    numeric_plot(feature)

# Count and fully-paid-rate plots for the ordinal features
for feature in ordinal_var:
    categorical_plot(feature)

# Same plots for three of the nominal features
# ('title' and 'zip_code' from nominal_var are not plotted)
for feature in ('home_ownership', 'purpose', 'addr_state'):
    categorical_plot(feature)

# Three stacked count plots, each split by loan category:
# purpose, home ownership and address state
fig, (purpose_ax, home_ax, state_ax) = plt.subplots(3, 1, figsize=(20, 20))

sns.countplot(x='purpose', data=a_lc, hue='loan_category', ax=purpose_ax)
purpose_ax.set_xticklabels(purpose_ax.get_xticklabels(), rotation=45);

sns.countplot(x='home_ownership', data=a_lc, hue='loan_category', ax=home_ax)

sns.countplot(x='addr_state', data=a_lc, hue='loan_category', ax=state_ax)
state_ax.set_xticklabels(state_ax.get_xticklabels(), rotation=45);

# sns.countplot(x='title', data=a_lc, hue='loan_category')

# Date time extraction
import datetime as dt
def make_dateval(s):
    """Return the datetime for the first day of the month described by ``s``.

    Only the first three characters of ``s`` (month abbreviation) and its last
    four characters (year) are read, e.g. 'Dec-2015' -> 2015-12-01.
    """
    month_abbr, year = s[:3], s[-4:]
    return dt.datetime.strptime(f'{year}{month_abbr}01', '%Y%b%d')
# Parse the two date columns from text into datetimes (first day of the month)
a_lc.issue_d = a_lc.issue_d.apply(make_dateval)
a_lc.earliest_cr_line = a_lc.earliest_cr_line.apply(make_dateval)
# Calendar year in which the loan was issued
a_lc['year'] = a_lc.issue_d.apply(lambda x: x.year)
# Calendar year of the earliest credit line
a_lc['cred_year'] = a_lc.earliest_cr_line.apply(lambda x: x.year)
# Years between the earliest credit line and the loan issue
a_lc['cred_length'] = a_lc['year'] - a_lc['cred_year']

# Loans issued per year, as an absolute count and as a share of all loans
yearly_count = (
    a_lc.groupby('year')['loan_category']
    .count()
    .reset_index(name='counts')
)
yearly_count['ratio'] = yearly_count['counts'] / len(a_lc)
yearly_count

# Left: number of loans per year; right: share of all loans per year
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

sns.countplot(x='year', data=a_lc, ax=ax1)
ax1.set_xlabel('Year', fontsize=14)
ax1.set_ylabel('Count', fontsize=14)

sns.barplot(x='year', y='ratio', data=yearly_count, ax=ax2)
ax2.set_xlabel('Year', fontsize=14)
ax2.set_ylabel('Ratio', fontsize=14)

# Plot for issue year vs loan amount
# Make sure issue_d is a datetime and drop rows without an issue date
a_lc['issue_d'] = pd.to_datetime(a_lc['issue_d'])
a_lc = a_lc[pd.notnull(a_lc['issue_d'])]
a_lc['issue_year'] = a_lc['issue_d'].dt.year

# Total amount lent per issue year and term. loan_amnt is selected before
# aggregating: summing the whole frame would also try to sum the datetime and
# text columns, which is slow and fails on recent pandas versions.
yamnt = (
    a_lc.groupby(['issue_year', 'term'])['loan_amnt']
    .sum()
    .reset_index(name='amount')
)

plt.figure(figsize=(12,6))
sns.barplot(x="issue_year", y="amount",
            hue="term", palette="rocket", linewidth=2.5,
            data=yamnt)


# Plot of average DTI per issue year.
# dti is selected before aggregating so that only this column is averaged;
# averaging the whole frame fails on non-numeric columns in recent pandas.
ydti = a_lc.groupby('year')['dti'].mean().reset_index()
plt.figure(figsize=(12,6))
# No palette here: there is no hue variable for it to apply to
sns.lineplot(x="year", y="dti", linewidth=2.5, data=ydti)

# Frequency distribution of the interest rate, stacked by loan grade
plt.figure(figsize=(12,6))
g = sns.histplot(a_lc, x="int_rate",hue='grade',
    multiple="stack",
    palette="rocket",
    edgecolor=".3",linewidth=.5
    )
g.set_xlabel("Interest Rate", fontsize=12)
g.set_ylabel("Frequency", fontsize=12)
g.set_title("Interest Rate Distribuition", fontsize=10)

# Bar plot of grade against employment length, split by term.
# NOTE(review): 'grade' and 'emp_length' both still look like text columns at
# this point, and a bar plot needs a numeric variable on one axis - confirm
# this renders, or plot an encoded grade instead.
plt.figure(figsize=(12,6))
sns.barplot(x="emp_length", y="grade",
             hue="term", palette="rocket", linewidth=2.5,
             data=a_lc)

# Target feature loan_class: 0 for a good (fully paid) loan, 1 for a bad loan
a_lc["loan_class"] = np.where(a_lc.loan_category != 'Fully Paid', 1, 0)

# Class counts - the data set is imbalanced
a_lc.loan_class.value_counts()

# Total amount lent per loan class, largest first
cost = a_lc.groupby('loan_class')['loan_amnt'].sum().sort_values(ascending=False)

# Amounts in $bn (cost is indexed by the loan_class labels 0 and 1)
paid_bn = round(cost[0] / 1e9, 1)
charged_off_bn = round(cost[1] / 1e9, 1)
print(f"Fully paid loans: ${paid_bn}bn")
print(f"Charged off loans: ${charged_off_bn}bn")

# Charged-off amount as a share of everything lent, i.e. the cost of risk (CoR)
default_cost = cost[1] / a_lc['loan_amnt'].sum()
print(f"cost of default is {round(default_cost * 100, 1)}%")

# Remove these columns based on the data analysis
a_lc = a_lc.drop(columns=['grade', 'issue_d', 'loan_status', 'zip_code',
                          'earliest_cr_line', 'cred_year', 'title'])

# Evaluate the loan counts for the different states

# Total number of loans per state
state_count = a_lc.groupby('addr_state')['loan_class'].count().reset_index(name='total')

# Number of loans per state and loan class
unique_count = a_lc.groupby(['addr_state', 'loan_class'])['id'].count().reset_index()

# Share of each loan class within its state. The ratio is computed here
# explicitly: the plot below needs a 'ratio' column.
m = pd.merge(unique_count, state_count, on="addr_state", how="left")
m['ratio'] = m['id'] / m['total']

plt.figure(figsize=(20,6))
sns.barplot(x="addr_state", y="ratio",
            hue="loan_class", palette="rocket", linewidth=2.5,
            data=m)

# Encoding

# Text columns still present before encoding
object_columns = [column for column in a_lc.columns if a_lc[column].dtype == 'object']
print(object_columns)
a_lc.drop(['id','addr_state'], axis=1, inplace=True)

# Ordinal encoding of sub_grade: rank of each value in sorted order, from 1
sub_grades = sorted(a_lc.sub_grade.unique())
grades = {sub_grade: rank for rank, sub_grade in enumerate(sub_grades, start=1)}

# Copy of the frame taken before any column is encoded
a_lc_wo = a_lc.copy()
a_lc['sub_grade'] = a_lc['sub_grade'].map(grades)
verification_status = {'Not Verified':0, 'Source Verified':1, 'Verified':2}
a_lc['verification_status'] = a_lc['verification_status'].map(verification_status)

# Dummy variables for the nominal features (first level dropped)
categorical = ['home_ownership', 'purpose']
a_lc = pd.get_dummies(a_lc, columns=categorical, drop_first=True)
remaining_object_columns = [column for column in a_lc.columns if a_lc[column].dtype == 'object']
print(remaining_object_columns)

# term and emp_length now hold digits only: convert them to numbers
for text_column in ('term', 'emp_length'):
    a_lc[text_column] = pd.to_numeric(a_lc[text_column])

# Feature engineering: amount-weighted term and interest rate

# loan amount x term, the numerator of the amount-weighted average term
a_lc['term_amnt'] = a_lc['loan_amnt']*a_lc['term'] # longer term => takes longer to pay
weighted_average = a_lc['term_amnt'].sum()/a_lc['loan_amnt'].sum()

print(f'Average term: {round(a_lc.term.mean(),2)} months')
print(f'Weighted average term amount: {str(round(weighted_average,2))} months')

# interest rate x loan amount, the numerator of the amount-weighted average rate
a_lc['int_amnt'] = a_lc['int_rate']*a_lc['loan_amnt']
cumulative_avg_int_rate = a_lc['int_amnt'].sum()/a_lc['loan_amnt'].sum()

print(f'Average interest rate: {str(round(a_lc.int_rate.mean(),2))}%')
print(f'Weighted av. rate: {str(round(cumulative_avg_int_rate,2))}%')
# loan_category is no longer needed once loan_class exists as the target
a_lc.drop('loan_category', axis=1,inplace=True)
a_lc['loan_class'].value_counts()

# Count plot of good loans (0) and bad loans (1).
# The column is passed by keyword: seaborn no longer accepts the data vector
# as a positional argument.
g = sns.countplot(x=a_lc["loan_class"], palette="rocket")
g.set_xticklabels(g.get_xticklabels(), rotation=0)
g.set_title("Good Loan and Bad Loan", fontsize=20)
g.set_xlabel("Loan Class", fontsize=15)
g.set_ylabel("Frequency", fontsize=15)

# Create a pickle of the cleaned data
loans = a_lc.copy()
import pickle
with open('cleaned_data.pkl', 'wb') as pickle_file:
    pickle.dump(loans, pickle_file)

# Split loans into train and test groups based on the issue year:
# everything before 2018 is used for training, 2018 for testing.
loans_train = loans[loans['year'] < 2018]
# Copy the test rows: prediction columns are added to this frame later, which
# should not write into a slice of `loans`.
loans_test = loans[loans['year'] == 2018].copy()

# Features are every column except the target and the split key. drop()
# returns a new frame, so its result has to be assigned - calling it without
# assignment left 'year' in the features.
# NOTE(review): 'issue_year' holds the same value as 'year' and is still a
# feature - consider dropping it as well.
X_train = loans_train.drop(columns=['loan_class', 'year'])
Y_train = loans_train["loan_class"]

X_test = loans_test.drop(columns=['loan_class', 'year'])
Y_test = loans_test["loan_class"]

X_train.shape, X_test.shape
with open('loans_test.pkl', 'wb') as pickle_file:
    pickle.dump(loans_test, pickle_file)


# Performance Metrics Calculation
# Assuming cost of funding is 3%
CoF = 3


# Gives back the financial metrics of a loan book for comparison
def return_stats(df, title):
    """Print and return the financial metrics of a loan book.

    Parameters
    ----------
    df : pandas.DataFrame
        Loan book with the columns 'loan_amnt', 'term_amnt', 'int_amnt' and
        'loan_class' (1 marks a charged-off loan).
    title : str
        Heading printed above the metrics.

    Returns
    -------
    tuple
        ``(weighted_term, weighted_interest, charge_off_rate,
        charge_off_amount, total_CoR, ROI)`` where ``charge_off_rate`` is the
        share of loans per class, ``charge_off_amount`` the amount lent per
        class, ``total_CoR`` the charged-off amount as a fraction of the
        amount lent, and ``ROI`` the profit as a fraction of the amount lent.
        The funding cost is taken from the module-level ``CoF`` (in %).
    """
    loans_approved = df['loan_amnt'].sum()  # total amount lent

    weighted_term = df['term_amnt'].sum() / loans_approved
    weighted_interest = df['int_amnt'].sum() / loans_approved
    charge_off_rate = df['loan_class'].value_counts(normalize=True)
    charge_off_amount = df.groupby('loan_class').loan_amnt.sum()  # amount lent per loan class

    # A book without any charged-off loan has no entry for class 1
    loans_charged_off = charge_off_amount.get(1, 0)
    total_CoR = loans_charged_off / loans_approved

    # And therefore, the ROI:
    loans_returned = loans_approved - loans_charged_off  # amount lent that was paid back
    # Interest over the weighted term (in years) on the amount paid back
    interest_earned = loans_returned * weighted_interest / 100 * (weighted_term / 12)
    # Funding cost over the weighted term (in years) on everything lent
    interest_paid = loans_approved * CoF / 100 * (weighted_term / 12)
    profit = interest_earned - interest_paid - loans_charged_off
    ROI = profit / loans_approved

    print(title)
    print(f'Weighted average term amount: {str(round(weighted_term,2))} months')
    print(f'Weighted av. rate: {str(round(weighted_interest,2))}%')
    print(f'Total cost of risk due to default: {str(round(total_CoR * 100, 2))}%')
    print(f'Total loan book is: $ {str(round(loans_approved / 1e6))} M')
    print(f'Interest earned is: $ {str(round(interest_earned / 1e6, 0))} M')
    print(f'Interest paid is: $ {str(round(interest_paid / 1e6, 0))} M')
    print(f'Profit is: $ {str(round(profit / 1e6, 0))} M')
    print(f'Total book ROI: {str(round(ROI * 100, 2))} %')

    return weighted_term, weighted_interest, charge_off_rate, charge_off_amount, total_CoR, ROI

# High level performance metrics of all loans
weighted_term, weighted_interest, charge_off_rate, charge_off_amount, total_CoR, ROI = return_stats(loans, 'Total book')
# High level performance metrics of the train data.
# Printed under its own heading so the output is not mistaken for the total
# book; the variables above are overwritten with the train-set values.
weighted_term, weighted_interest, charge_off_rate, charge_off_amount, total_CoR, ROI = return_stats(loans_train, 'Train set loans')
# High level performance metrics of the test data
weighted_term_test, weighted_interest_test, charge_off_rate_test, charge_off_amount_test, total_CoR_test, ROI_test = return_stats(loans_test, 'Test set loans')

def model_performance(y_pred, column, total_CoR_test, df, CoF):
    """Print and return the financial metrics of the loans a model approves.

    A loan counts as approved when its prediction is 0. The predictions are
    stored in ``df[column]``, so ``df`` is modified.

    Parameters
    ----------
    y_pred : array-like
        Predicted loan class for every row of ``df`` (0 = approve).
    column : str
        Name of the column of ``df`` that receives ``y_pred``.
    total_CoR_test : float
        Cost of risk (as a fraction) of the whole book, used as the baseline
        for the saving.
    df : pandas.DataFrame
        Loans with the columns 'loan_amnt', 'int_amnt', 'term_amnt' and
        'loan_class' (1 marks a charged-off loan).
    CoF : float
        Cost of funds in %.

    Returns
    -------
    tuple
        ``(weighted_interest, CoR, ROI)`` of the approved loans: the
        amount-weighted interest rate in %, the charged-off fraction of the
        approved amount, and the profit as a fraction of the approved amount.
    """
    # Add a new column with the predicted loan class
    df[column] = y_pred
    y_test_paid = df.loc[df[column].isin([0])]
    performance = y_test_paid.groupby('loan_class').loan_amnt.sum()

    # CoR is the proportion of the approved amount that was charged off.
    # When none of the approved loans was charged off there is no entry for
    # class 1, which has to count as 0 rather than raise a KeyError.
    loans_approved = performance.sum()
    loans_charged_off = performance.get(1, 0)
    CoR = loans_charged_off / loans_approved

    saving = df['loan_amnt'].sum() * (total_CoR_test - CoR)

    weighted_interest = y_test_paid['int_amnt'].sum() / y_test_paid['loan_amnt'].sum()
    weighted_term = y_test_paid['term_amnt'].sum() / y_test_paid['loan_amnt'].sum()

    # ROI
    loans_returned = loans_approved - loans_charged_off
    interest_earned = loans_returned * weighted_interest / 100 * (weighted_term / 12)
    interest_paid = loans_approved * CoF / 100 * (weighted_term / 12)
    profit = interest_earned - interest_paid - loans_charged_off
    ROI = profit / loans_approved

    print('Total loans approved = $' + str(round(loans_approved / 1e9, 3)) + 'bn')
    print('Total loans charged off = $' + str(round(loans_charged_off / 1e9, 3)) + 'bn')
    print('Modelled cost of risk is: ' + str(round(CoR * 100, 2)) + "%")
    print("Total saving = $" + str(round(saving / 1e6, 1)) + "m\n")
    print('Weighted average interest rate: ' + str(round(weighted_interest, 2)) + "%")
    print('Weighted average term: ' + str(round(weighted_term, 2)) + " months\n")
    print('Interest earned: $' + str(round(interest_earned / 1e6, 0)) + "m")
    print('Interest paid: $' + str(round(interest_paid / 1e6, 0)) + "m")
    print('Profit is: $' + str(round(profit / 1e6, 0)) + "m")
    print('Return on investment is: ' + str(round(ROI * 100, 2)) + "%")

    return weighted_interest, CoR, ROI

# Machine Learning - No tuning

# Scale the features: the scaler is fitted on the train set only and the
# same fitted transformation is then applied to the test set
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test=sc.transform(X_test)

# Apply logistic regression without class weight/sampling
logistic = LogisticRegression()
logistic.fit(X_train, Y_train)

Y_pred = logistic.predict(X_test)

# Summary of the prediction
print(classification_report(Y_test, Y_pred))
conf_matrix = confusion_matrix(Y_test, Y_pred)
# Accuracy
print("Accuracy of the model is: ", accuracy_score(Y_test, Y_pred))

plt.figure()
ax = sns.heatmap(conf_matrix, cmap='viridis_r', annot=True, fmt='d', square=True)
ax.set_title('Logistic regression performance')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

# ROC-AUC for logistic regression.
# The curve is built from the predicted class-1 probabilities: hard 0/1
# predictions only give one operating point, not a curve.
Y_score = logistic.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, Y_score)

roc_auc = auc(fpr, tpr)

# Plot ROC on its own figure so it is not drawn over the heatmap
plt.figure()
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Apply the performance metrics function
logistic_int_rate, logistic_CoR, logistic_ROI = model_performance(Y_pred,'logistic model (no tuning)', total_CoR_test, loans_test, CoF)


# Apply an XGBoost model on the train data
import xgboost as xgb
xg = xgb.XGBClassifier()

n_estimators = 100
xg.set_params(max_depth=4, n_estimators=n_estimators)

xg.fit(X_train,Y_train)
Y_pred = xg.predict(X_test)

# Summary of the prediction
print(classification_report(Y_test, Y_pred))
conf_matrix = confusion_matrix(Y_test, Y_pred)
# Accuracy
print("Accuracy of the model is: ", accuracy_score(Y_test, Y_pred))

plt.figure()
ax = sns.heatmap(conf_matrix, cmap='viridis_r', annot=True, fmt='d', square=True)
ax.set_title('XGBoost performance')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

with open('y_pred_xg.pkl', 'wb') as pickle_file:
    pickle.dump(Y_pred, pickle_file)

# ROC-AUC for XGBoost.
# The curve is built from the predicted class-1 probabilities: hard 0/1
# predictions only give one operating point, not a curve.
Y_score = xg.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, Y_score)

roc_auc = auc(fpr, tpr)

# Plot ROC on its own figure so it is not drawn over the heatmap
plt.figure()
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Apply the performance metrics function on XGBoost
xg_int_rate, xg_CoR, xg_ROI = model_performance(Y_pred,'XGBoost (no tuning)', total_CoR_test, loans_test, CoF)

# Apply the CatBoost algorithm
from catboost import CatBoostClassifier


clf = CatBoostClassifier(
    learning_rate=0.1,
    #loss_function='CrossEntropy'
)

clf.fit(X_train,Y_train)
Y_pred = clf.predict(X_test)

# Summary of the prediction
print(classification_report(Y_test, Y_pred))
conf_matrix = confusion_matrix(Y_test, Y_pred)
# Accuracy
print("Accuracy of the model is: ", accuracy_score(Y_test, Y_pred))

plt.figure()
ax = sns.heatmap(conf_matrix, cmap='viridis_r', annot=True, fmt='d', square=True)
ax.set_title('Catboost performance')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

# ROC-AUC for CatBoost.
# The curve is built from the predicted class-1 probabilities: hard 0/1
# predictions only give one operating point, not a curve.
Y_score = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, Y_score)

roc_auc = auc(fpr, tpr)

# Plot ROC on its own figure so it is not drawn over the heatmap
plt.figure()
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
# Render the figure, as the other model sections do
plt.show()

# Apply the performance metrics function on CatBoost
clf_int_rate, clf_CoR, clf_ROI = model_performance(Y_pred,'CatBoost', total_CoR_test, loans_test, CoF)

from sklearn.ensemble import RandomForestClassifier

# Random Forest
rf = RandomForestClassifier(n_estimators = 500, criterion = 'gini')
rf.fit(X_train, Y_train)
# The forest's predictions are kept in Y_Pred (capital P). Every step of this
# section has to use that name: Y_pred still holds the predictions of the
# model fitted before this one.
Y_Pred = rf.predict(X_test)
rf_probs = rf.predict_proba(X_test)[:, 1]

# Summary of the prediction
print(classification_report(Y_test, Y_Pred))
conf_matrix = confusion_matrix(Y_test, Y_Pred)

# Accuracy and recall
print("Accuracy of the model is: ", accuracy_score(Y_test, Y_Pred))
print("Recall score: {}".format(recall_score(Y_test,Y_Pred)))

plt.figure()
ax = sns.heatmap(conf_matrix, cmap='viridis_r', annot=True, fmt='d', square=True)
ax.set_title('RandomForest performance')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

# ROC-AUC for Random Forest, built from the forest's class-1 probabilities
fpr, tpr, thresholds = roc_curve(Y_test, rf_probs)

roc_auc = auc(fpr, tpr)

# Plot ROC on its own figure so it is not drawn over the heatmap
plt.figure()
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Apply the performance metrics function on the forest's own predictions
rf_int_rate, rf_CoR, rf_ROI = model_performance(Y_Pred,'R Forest', total_CoR_test, loans_test, CoF)

# Naive Bayes
naive = GaussianNB()
naive.fit(X_train, Y_train)

Y_pred = naive.predict(X_test)

# Summary of the prediction
print(classification_report(Y_test, Y_pred))
# The confusion matrix has to be built from this model's predictions (Y_pred);
# Y_Pred with a capital P belongs to the random forest section.
conf_matrix = confusion_matrix(Y_test, Y_pred)

# Accuracy score
print("Accuracy of the model is: ", accuracy_score(Y_test, Y_pred))

plt.figure()
ax = sns.heatmap(conf_matrix, cmap='viridis_r', annot=True, fmt='d', square=True)
ax.set_title('Naive Bayes performance')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

# ROC-AUC for Naive Bayes.
# The curve is built from the predicted class-1 probabilities: hard 0/1
# predictions only give one operating point, not a curve.
Y_score = naive.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, Y_score)

roc_auc = auc(fpr, tpr)

# Plot ROC on its own figure so it is not drawn over the heatmap
plt.figure()
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Apply the performance metrics function on Naive Bayes
naive_int_rate, naive_CoR, naive_ROI = model_performance(Y_pred,'Naive Bayes', total_CoR_test, loans_test, CoF)

from sklearn.metrics import precision_recall_curve
def threshold_calculation(model, threshold_list):
    """Report test accuracy per decision threshold and plot precision/recall.

    For every threshold the class-1 probability of each test row is compared
    with the threshold, and the resulting accuracy and confusion matrix are
    printed. Two charts are then drawn: precision and recall against the
    threshold, and the precision-recall curve with a no-skill baseline.
    Reads the module-level ``X_test`` and ``Y_test``.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict_proba``.
    threshold_list : iterable of float
        Decision thresholds to evaluate.
    """
    # The probabilities do not depend on the threshold: compute them once
    # (second column of predict_proba = probability of class 1)
    positive_proba = model.predict_proba(X_test)[:, 1]

    for threshold in threshold_list:
        print ('\n******** For i = {} ******'.format(threshold))
        Y_test_pred = (positive_proba > threshold).astype("int32")
        test_accuracy = accuracy_score(Y_test, Y_test_pred)
        print('Our testing accuracy is {}'.format(test_accuracy))
        print(confusion_matrix(Y_test, Y_test_pred))

    # Precision/recall at every threshold, and the area under that curve
    precision, recall, thresholds = precision_recall_curve(Y_test, positive_proba)
    pr_auc = auc(recall, precision)

    plt.subplot(1, 2, 1)
    plt.title("Precision-Recall vs Threshold Chart")
    # precision and recall have one more element than thresholds
    plt.plot(thresholds, precision[: -1], "b--", label="Precision")
    plt.plot(thresholds, recall[: -1], "r--", label="Recall")
    plt.ylabel("Precision, Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="lower left")
    plt.ylim([0,1])

    plt.subplot(1, 2, 2)
    # Precision of a classifier that labels everything as class 1
    no_skill = len(Y_test[Y_test==1]) / len(Y_test)
    plt.title("Precision-Recall")
    plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
    # Label the curve with the model that was actually passed in
    plt.plot(recall, precision, marker='.',
             label='{} (AUC = {:.3f})'.format(type(model).__name__, pr_auc))
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
# Imbalanced data - tuning with class weight and undersampling

# Explore the best class weight for logistic regression
from sklearn.model_selection import GridSearchCV,StratifiedKFold
import collections, numpy

lr = LogisticRegression()
# Earlier experiment (kept for reference): weights derived from the class
# frequencies, combined with a search over C and the penalty
# n = collections.Counter(Y_train)
# class_weights = {0:1 - n[0]/(n[0]+n[1]), 1:1 - n[1]/(n[0]+n[1])}
# penalty = ['l1', 'l2']
# C = np.logspace(0, 4, 10)
# param_grid = dict(C=C, penalty=penalty)


# Candidate weights for class 0, evenly spaced between 0.0 and 0.5
weights = np.linspace(0.0,0.5)

# Grid for the search: class 0 gets weight x and class 1 gets 1 - x
param_grid = {'class_weight': [{0:x, 1:1.0-x} for x in weights]}


# Fit the grid search to the train data with stratified folds, scored by F1
# and using all available cores (n_jobs=-1)
gridsearch = GridSearchCV(estimator= lr,
                          param_grid= param_grid,
                          cv=StratifiedKFold(),
                          n_jobs=-1,
                          scoring='f1',
                          verbose=2).fit(X_train, Y_train)


# logistic_cw = LogisticRegression(class_weight = class_weights, max_iter=1000)
# logistic_cw_grid = GridSearchCV(logistic_cw, param_grid, cv=5, scoring='roc_auc', verbose=10, n_jobs=-1)
# logistic_cw_grid.fit(X_train, Y_train)
# Plot the cross-validated F1 score for the different class-1 weights.
# (This plot used to be generated twice by two identical blocks.)
sns.set_style('whitegrid')
plt.figure(figsize=(12,8))
# The grid stores the weight of class 0; class 1 gets the complement
weigh_data = pd.DataFrame({ 'score': gridsearch.cv_results_['mean_test_score'], 'weight': (1- weights)})
# x and y are passed by keyword: seaborn no longer accepts the data vectors
# as positional arguments
sns.lineplot(x='weight', y='score', data=weigh_data)
plt.xlabel('Weight for class 1')
plt.ylabel('F1 score')
plt.xticks([round(i/10,1) for i in range(0,11,1)])
plt.title('Scoring for different class weights', fontsize=24)

# Exploring class weight - apply logistic regression with class_weight

n = Y_train.value_counts()
#class_weights = {0:1- n[0]/n.sum(), 1:1- n[1]/n.sum()}
# class_weights
# Apply logistic regression with the final class-weight ratio
from sklearn.metrics import precision_score
logistic_cw = LogisticRegression(class_weight = {0: 0.2040816326530612, 1: 0.7959183673469388})
logistic_cw.fit(X_train, Y_train)

Y_pred_cw = logistic_cw.predict(X_test)
# Class-1 probabilities, used for every ROC/AUC figure below
Y_score_cw = logistic_cw.predict_proba(X_test)[:, 1]

# Summary of the prediction
print(classification_report(Y_test, Y_pred_cw))
conf_matrix = confusion_matrix(Y_test, Y_pred_cw)
print(conf_matrix)

# Accuracy, precision, recall and ROC-AUC
print("Accuracy of the model is: ", accuracy_score(Y_test, Y_pred_cw))
print("Precision score: {}".format(precision_score(Y_test,Y_pred_cw)))
print("Recall score: {}".format(recall_score(Y_test,Y_pred_cw)))
# Printed explicitly: a bare expression in the middle of a block shows nothing
print("ROC-AUC score: {}".format(roc_auc_score(Y_test, Y_score_cw)))

# ROC-AUC for logistic regression + class weight.
# The curve is built from the predicted probabilities: hard 0/1 predictions
# only give one operating point, not a curve.
fpr, tpr, thresholds = roc_curve(Y_test, Y_score_cw)

roc_auc = auc(fpr, tpr)

# Plot ROC
plt.figure()
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Performance metrics for logistic regression + class weight
logistic_cw_int_rate, logistic_cw_CoR, logistic_cw_RoE = model_performance(Y_pred_cw,'logistic model (class weights)', total_CoR_test, loans_test, CoF)


# Apply XGBoosting model on train data with scale_pos_weight
import xgboost as xgb
xg = xgb.XGBClassifier()

n_estimators = 100
# scale_pos_weight=10 up-weights the positive class (label 1) during training.
xg.set_params(max_depth=4, n_estimators=n_estimators, scale_pos_weight=10)
# NOTE(review): train_r2 and steps are not used anywhere in this section; kept in
# case a later cell reads them - confirm and remove if dead.
train_r2 = []

steps = range(100,1000,10)

xg.fit(X_train, Y_train)
# Hard 0/1 labels for the threshold-based metrics ...
Y_pred = xg.predict(X_test)
# ... and the predicted probability of class 1 for the ROC curve.
Y_proba_xg = xg.predict_proba(X_test)[:, 1]

# Summary of the prediction
print(classification_report(Y_test, Y_pred))
#print(confusion_matrix(Y_test, Y_pred))
conf_matrix = confusion_matrix(Y_test, Y_pred)
# Accuracy
print("Accuracy of the model is: ", accuracy_score(Y_test, Y_pred))

ax = sns.heatmap(conf_matrix, cmap='viridis_r', annot=True, fmt='d', square=True)
ax.set_title('XGBoost performance')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

# ROC-AUC for XGBoost - Tuned
# Built from probabilities: hard labels would reduce the curve to one point.
fpr, tpr, thresholds = roc_curve(Y_test, Y_proba_xg)

roc_auc = auc(fpr, tpr)

# Plot ROC
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlim([-0.1, 1.0])
plt.ylim([-0.1, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Performance metrics for XGBoost.
# Uses the XGBoost labels (Y_pred); the previous code passed Y_pred_cw, i.e. the
# logistic-regression predictions, so the XGBoost figures were a copy of the logistic ones.
xg_cw_int_rate, xg_cw_CoR, xg_cw_RoE = model_performance(Y_pred,'XGBoost (class weights)', total_CoR_test, loans_test, CoF)

#Neural Net
# Apply Neural Network algorithm

# NOTE(review): tensorflow.python.* is TensorFlow's private namespace; the public
# equivalent is tensorflow.keras. Left unchanged here - confirm against the
# installed TensorFlow version before switching.
import tensorflow.python.keras
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.metrics import categorical_crossentropy
from tensorflow.python.keras.layers import Dense, Activation,Dropout
#from tensorflow.python.keras.optimizers import SGD

# Number of input features (not passed to the model; the first Dense layer
# infers its input size on the first fit call).
input_nodes = X_train.shape[1]
#output_nodes = 1

model = Sequential()

#model.add(Input((input_nodes,)))
model.add(Dense(units=16,activation="sigmoid"))
model.add(Dense(units=32, activation="sigmoid"))
# Two softmax outputs: one probability per class (0 and 1).
model.add(Dense(units=2, activation="softmax"))
#model.add(Dense(output_nodes))
# sparse_categorical_crossentropy takes integer class labels, so Y_train is used as-is.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=['accuracy'])

model.fit(X_train, Y_train, batch_size=20, validation_split=0.1, epochs=50,verbose=2)
# Y_pred holds the per-class probabilities, one row per test sample.
Y_pred = model.predict(X_test)
# Hard labels = most probable class; computed once and reused below.
Y_pred_nn = np.argmax(Y_pred, axis=1)

accuracy = accuracy_score(Y_test, Y_pred_nn)
print('Accuracy: {0:.2f}'.format(accuracy * 100.0))

print('Classification Report:')
print(classification_report(Y_test, Y_pred_nn))
print('Confusion Matrix:')
#print(confusion_matrix(Y_test, np.argmax(Y_pred, axis=1)))

conf_matrix = confusion_matrix(Y_test, Y_pred_nn)
# Accuracy
#print("Accuracy of the model is: ", accuracy_score(Y_pred_us,Y_test))

ax = sns.heatmap(conf_matrix, cmap='viridis_r', annot=True, fmt='d', square=True)
ax.set_title('Neural Net performance')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
# Finish the heatmap figure; without this the ROC curve below is drawn on the
# heatmap's axes.
plt.show()

# ROC-AUC for Neural Net
# Use the class-1 probability as the score: argmax labels would collapse the
# curve to a single operating point.
fpr, tpr, thresholds = roc_curve(Y_test, Y_pred[:, 1])

roc_auc = auc(fpr, tpr)

# Plot ROC
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlim([-0.1, 1.0])
plt.ylim([-0.1, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Performance metrics for the neural net (helper defined elsewhere in this file).
nn_int_rate, nn_us_CoR, nn_us_RoE = model_performance(Y_pred_nn,'Neural Net', total_CoR_test, loans_test, CoF)

#Simulation

#gives back necessary financial metrics of loan data for comparison
def ROI_total(df, cof=None):
    """
    Return the return on investment of a whole loan book (every loan approved).

    df  - loans to evaluate; must have the columns 'loan_amnt', 'term_amnt',
          'int_amnt' and 'loan_class' (rows with loan_class == 1 are counted
          as charged off).
          NOTE(review): 'term_amnt' / 'int_amnt' are presumably term (months)
          and interest rate (%) multiplied by loan_amnt - confirm where they
          are built.
    cof - cost of funds in % per year; when omitted, the module-level CoF is used.

    returns: profit divided by the approved amount, as a fraction (not in %)
    raises:  ValueError if the total loan amount is zero (e.g. empty df)
    """
    funding_cost = CoF if cof is None else cof

    loans_approved = df['loan_amnt'].sum()  # total dollar amount of loans
    if loans_approved == 0:
        raise ValueError("ROI_total needs a loan book with a non-zero total loan amount")

    # Amount-weighted averages over the whole book.
    weighted_term = df['term_amnt'].sum() / loans_approved
    weighted_interest = df['int_amnt'].sum() / loans_approved

    # Dollar amount of charged-off loans. A boolean mask yields 0 when the
    # sample contains no charged-off loan, where indexing a groupby result
    # with [1] would raise KeyError.
    loans_charged_off = df.loc[df['loan_class'] == 1, 'loan_amnt'].sum()

    # And therefore, the ROI:
    loans_returned = loans_approved - loans_charged_off  # total loan amount paid back
    years = weighted_term / 12
    # Interest is earned only on repaid principal, funding is paid on everything lent;
    # both are accumulated over the weighted term.
    interest_earned = loans_returned * weighted_interest / 100 * years
    interest_paid = loans_approved * funding_cost / 100 * years
    profit = interest_earned - interest_paid - loans_charged_off
    ROI = profit / loans_approved
    return ROI

from sklearn.utils import shuffle
def ROI_model(y_pred, column, n, loans=None, cof=None, random_state=None):
    """
    Return the ROI of a book built only from loans the model predicts as paid.

    The predictions are stored in ``loans[column]``; among loans predicted as
    class 0, up to n[i] loans are drawn at random from each of seven sub-grade
    bands (sub_grade 1-5, 6-10, ..., 31-35) and the ROI of that selection is
    computed from the true 'loan_class'.

    y_pred       - predicted loan class per row of ``loans`` (0 = predicted paid)
    column       - name of the column that receives y_pred
    n            - indexable with 0..6: maximum number of loans per band
    loans        - DataFrame with 'sub_grade', 'loan_amnt', 'int_amnt',
                   'term_amnt' and 'loan_class'; defaults to the module-level
                   loans_test. It is modified in place (``column`` is added).
    cof          - cost of funds in % per year; defaults to the module-level CoF
    random_state - forwarded to DataFrame.sample for a reproducible draw

    returns: profit divided by the approved amount, as a fraction (not in %)
    raises:  ValueError if no loan is selected or the selected amount is zero
    """
    df = loans_test if loans is None else loans
    funding_cost = CoF if cof is None else cof

    # Adding new column with the predicted loan class, then keep predicted-paid loans.
    df[column] = y_pred
    predicted_paid = df.loc[df[column].isin([0])]

    # NOTE(review): sub_grade appears to be integer-encoded 1..35, five values
    # per letter grade A-G - confirm against the encoding step.
    picks = []
    for band in range(7):
        low = 5 * band + 1
        high = low + 4
        in_band = predicted_paid.loc[
            (predicted_paid['sub_grade'] >= low) & (predicted_paid['sub_grade'] <= high)
        ]
        if in_band.empty:
            continue
        # Shuffle the band, then keep at most n[band] loans.
        pick = in_band.sample(frac=1, random_state=random_state).iloc[:n[band]]
        if not pick.empty:
            picks.append(pick)

    if not picks:
        raise ValueError("ROI_model selected no loans: nothing predicted as paid within the quotas")
    book = pd.concat(picks)

    loans_approved = book['loan_amnt'].sum()
    if loans_approved == 0:
        raise ValueError("ROI_model selected loans with a zero total loan amount")

    # Charged-off amount within the selection, judged by the true loan_class.
    # The mask yields 0 when the selection has no charged-off loan.
    loans_charged_off = book.loc[book['loan_class'] == 1, 'loan_amnt'].sum()

    # Amount-weighted averages over the selected book.
    weighted_interest = book['int_amnt'].sum() / loans_approved
    weighted_term = book['term_amnt'].sum() / loans_approved

    # ROI
    loans_returned = loans_approved - loans_charged_off
    years = weighted_term / 12
    interest_earned = loans_returned * weighted_interest / 100 * years
    interest_paid = loans_approved * funding_cost / 100 * years
    profit = interest_earned - interest_paid - loans_charged_off
    ROI = profit / loans_approved
    return ROI
# One-off evaluation of the model-selected book for the current predictions
# (the result is not stored).
ROI_model(y_pred,'logistic model (class weights)', n)

# Model ROI calculation on shuffled loans:
# ten rounds, each recording one benchmark ROI and one model ROI.
lc_ROI = []
model_ROI = []

for i in range(10):
    # Benchmark: ROI of the first 100 loans of a freshly shuffled `loans`.
    loan_shuffled = shuffle(loans)
    lc_ROI.append(ROI_total(loan_shuffled[0:100]))

    # Model: new predictions from `modeling` (helper defined elsewhere in this
    # file), then the ROI of the book ROI_model selects from them.
    y_pred = modeling(X_train, Y_train)
    model_ROI.append(ROI_model(y_pred, 'logistic model (class weights)', n))

# Collect both series side by side, one row per simulation round.
ROI_df = pd.DataFrame({'model_ROI': model_ROI, 'lc_ROI': lc_ROI})

# Plot histogram of benchmark and model ROI on shared axes.
fig, ax = plt.subplots(figsize=(14, 7))
# Drawing order (model first, benchmark second) must match the legend labels below.
for roi_column in ('model_ROI', 'lc_ROI'):
    ax = sns.distplot(tuple(ROI_df[roi_column]))

ax.set_xlabel("Rate of Return", fontsize=16)
ax.set_ylabel("Frequency", fontsize=16)
plt.legend(labels=["Model ROI", 'Benchmark (Lending Club) ROI'], fontsize=16)
plt.savefig(fname='ROI_Hist', dpi=150)
plt.show()