HarshSingh16/Predicitng Customer Default.R

## Predicitng Customer Default.R
if("pacman" %in% rownames(installed.packages()) == FALSE) {install.packages("pacman")} # Check if you have universal installer package, install if not

pacman::p_load("caret","ROCR","lift","glmnet","MASS","e1071","readxl") #Check, and if needed install the necessary packages
###########LOADING THE CREDIT DATA FILE
creditdata1<-read_excel(file.choose())
names(creditdata1)[1]<-"CurrentId"
str(creditdata1)
creditdata1$`OLD SYSTEM ID`<-NULL
###########LOADING THE NEW APPLICATIONS FILE
NEWAPPLICATIONS<-read.csv(choose.files())
###########THE NEXT 25 LINES JUST CONVERT THE TYPES OF NEWAPPLICATIONS.CSV TO MATCH THAT OF CREDITDATA. PROPER VARIABLE CONVERSIONS HAVE BEEN PERFORMED LATER ONCE THE TWO DATASETS HAVE BEEN COMINED USING RBIND
names(NEWAPPLICATIONS)[1]<-"CurrentId"
NEWAPPLICATIONS$LIMIT_BAL<-as.character(NEWAPPLICATIONS$LIMIT_BAL)
NEWAPPLICATIONS$SEX<-as.numeric(NEWAPPLICATIONS$SEX)
NEWAPPLICATIONS$EDUCATION<-as.numeric(NEWAPPLICATIONS$EDUCATION)
NEWAPPLICATIONS$MARRIAGE<-as.numeric(NEWAPPLICATIONS$MARRIAGE)
NEWAPPLICATIONS$AGE<-as.numeric(NEWAPPLICATIONS$AGE)
NEWAPPLICATIONS$PAY_0<-as.numeric(NEWAPPLICATIONS$PAY_0)
NEWAPPLICATIONS$PAY_2<-as.numeric(NEWAPPLICATIONS$PAY_2)
NEWAPPLICATIONS$PAY_3<-as.numeric(NEWAPPLICATIONS$PAY_3)
NEWAPPLICATIONS$PAY_4<-as.numeric(NEWAPPLICATIONS$PAY_4)
NEWAPPLICATIONS$PAY_5<-as.numeric(NEWAPPLICATIONS$PAY_5)
NEWAPPLICATIONS$PAY_6<-as.numeric(NEWAPPLICATIONS$PAY_6)
NEWAPPLICATIONS$BILL_AMT1<-as.numeric(NEWAPPLICATIONS$BILL_AMT1)
NEWAPPLICATIONS$BILL_AMT2<-as.numeric(NEWAPPLICATIONS$BILL_AMT2)
NEWAPPLICATIONS$BILL_AMT3<-as.numeric(NEWAPPLICATIONS$BILL_AMT3)
NEWAPPLICATIONS$BILL_AMT4<-as.numeric(NEWAPPLICATIONS$BILL_AMT4)
NEWAPPLICATIONS$BILL_AMT5<-as.numeric(NEWAPPLICATIONS$BILL_AMT5)
NEWAPPLICATIONS$BILL_AMT6<-as.numeric(NEWAPPLICATIONS$BILL_AMT6)
NEWAPPLICATIONS$PAY_AMT1<-as.character(NEWAPPLICATIONS$PAY_AMT1)
NEWAPPLICATIONS$PAY_AMT2<-as.numeric(NEWAPPLICATIONS$PAY_AMT2)
NEWAPPLICATIONS$PAY_AMT3<-as.character(NEWAPPLICATIONS$PAY_AMT3)
NEWAPPLICATIONS$PAY_AMT4<-as.character(NEWAPPLICATIONS$PAY_AMT4)
NEWAPPLICATIONS$PAY_AMT5<-as.character(NEWAPPLICATIONS$PAY_AMT5)
NEWAPPLICATIONS$PAY_AMT6<-as.numeric(NEWAPPLICATIONS$PAY_AMT6)

str(NEWAPPLICATIONS)
NEWAPPLICATIONS[1,1]<-15001
NEWAPPLICATIONS[2,1]<-15002
NEWAPPLICATIONS$default.payment.next.month<-NA
creditdata<-rbind(creditdata1,NEWAPPLICATIONS)

str(creditdata)
str(NEWAPPLICATIONS)
creditdata$SEX<-as.factor(creditdata$SEX)
creditdata$EDUCATION<-as.factor(creditdata$EDUCATION)
creditdata$MARRIAGE<-as.factor(creditdata$MARRIAGE)
str(creditdata)
creditdata$PAY_0<-as.factor(creditdata$PAY_0)
creditdata$PAY_2<-as.factor(creditdata$PAY_2)
creditdata$PAY_3<-as.factor(creditdata$PAY_3)
creditdata$PAY_4<-as.factor(creditdata$PAY_4)
creditdata$PAY_5<-as.factor(creditdata$PAY_5)
creditdata$PAY_6<-as.factor(creditdata$PAY_6)
creditdata$LIMIT_BAL<-as.numeric(creditdata$LIMIT_BAL)
creditdata$BILL_AMT1<-as.numeric(creditdata$BILL_AMT1)
creditdata$BILL_AMT2<-as.numeric(creditdata$BILL_AMT2)
creditdata$BILL_AMT3<-as.numeric(creditdata$BILL_AMT3)
creditdata$BILL_AMT4<-as.numeric(creditdata$BILL_AMT4)
creditdata$BILL_AMT5<-as.numeric(creditdata$BILL_AMT5)
creditdata$BILL_AMT6<-as.numeric(creditdata$BILL_AMT6)

creditdata$PAY_AMT1<-as.numeric(creditdata$PAY_AMT1)
creditdata$PAY_AMT2<-as.numeric(creditdata$PAY_AMT2)
creditdata$PAY_AMT3<-as.numeric(creditdata$PAY_AMT3)
creditdata$PAY_AMT4<-as.numeric(creditdata$PAY_AMT4)
creditdata$PAY_AMT5<-as.numeric(creditdata$PAY_AMT5)
creditdata$PAY_AMT6<-as.numeric(creditdata$PAY_AMT6)
str(creditdata)

creditdata$AGE<-as.numeric(creditdata$AGE)
creditdata$default.payment.next.month<-as.factor(creditdata$default.payment.next.month)
str(creditdata)

######CHECKING NA's
sapply(creditdata, function(x)sum(is.na(x)))


#####SEPERATING THE OBSERVATIONS TO PREDICT FROM THE REST
tail(creditdata)
CREDIT1<-subset(creditdata,CurrentId<=15000)
CREDIT2<-subset(creditdata,CurrentId>15000)

###################Creating Training and Test Data
set.seed(77850) #set a random number generation seed to ensure that the split is the same everytime
inTrain <- createDataPartition(y = CREDIT1$default.payment.next.month,
                               p = 12000/15000, list = FALSE)
training <- CREDIT1[ inTrain,]
testing <- CREDIT1[ -inTrain,]
str(testing)

#########################Training the model using logisitc Regression
library(glmnet)
model_logistic<-glm(default.payment.next.month~.-CurrentId-SEX, data=training, family="binomial"(link="logit"))

summary(model_logistic)

model_logistic_stepwiseAIC<-model_logistic

summary(model_logistic_stepwiseAIC)

par(mfrow=c(1,4))
plot(model_logistic_stepwiseAIC) #Error plots: similar nature to lm plots
par(mfrow=c(1,1))
str(testing)
par(mar=c(6,4,4,2))
###Finding predicitons: probabilities and classification
logistic_probabilities<-predict(model_logistic_stepwiseAIC,newdata=testing,type="response") #Predict probabilities
logistic_classification<-rep("1",2999)
logistic_classification[logistic_probabilities<0.5]="0" #Predict classification using 0.6073 threshold. Why 0.6073 - that's the average probability of being retained in the data. An alternative code: logistic_classification <- as.integer(logistic_probabilities > mean(testing$Retained.in.2012. == "1"))
logistic_classification<-as.factor(logistic_classification)

str(testing)
str(training)

###Confusion matrix
confusionMatrix(logistic_classification,testing$default.payment.next.month) #Display confusion matrix

####ROC Curve
logistic_ROC_prediction <- prediction(logistic_probabilities, testing$default.payment.next.month)
logistic_ROC <- performance(logistic_ROC_prediction,"tpr","fpr") #Create ROC curve data
plot(logistic_ROC) #Plot ROC curve

####AUC (area under curve)
auc.tmp <- performance(logistic_ROC_prediction,"auc") #Create AUC data
logistic_auc_testing <- as.numeric(auc.tmp@y.values) #Calculate AUC
logistic_auc_testing #Display AUC value: 90+% - excellent, 80-90% - very good, 70-80% - good, 60-70% - so so, below 60% - not much value


###################################Training the model using Random Forest
str(training)
library(randomForest)
model_forest <- randomForest(default.payment.next.month~ .-CurrentId, data=training,
                             importance=TRUE,proximity=TRUE,
                             cutoff = c(0.5, 0.5),type="classification") #cutoffs need to be determined for class 0 and class 1. By default 50/50, but need not be those necessarily
print(model_forest)
plot(model_forest)
importance(model_forest)
varImpPlot(model_forest)

###Finding predicitons: probabilities and classification
forest_probabilities<-predict(model_forest,newdata=testing,type="prob") #Predict probabilities -- an array with 2 columns: for not retained (class 0) and for retained (class 1)
forest_classification<-rep("1",2999)
forest_classification[forest_probabilities[,2]<0.5]="0" #Predict classification using 0.5 threshold. Why 0.5 and not 0.6073? Use the same as in cutoff above
forest_classification<-as.factor(forest_classification)

confusionMatrix(forest_classification,testing$default.payment.next.month) #Display confusion matrix. Note, confusion matrix actually displays a better accuracy with threshold of 50%

#There is also a "shortcut" forest_prediction<-predict(model_forest,newdata=testing, type="response")
#But it by default uses threshold of 50%: actually works better (more accuracy) on this data


####ROC Curve
forest_ROC_prediction <- prediction(forest_probabilities[,2], testing$default.payment.next.month) #Calculate errors
forest_ROC <- performance(forest_ROC_prediction,"tpr","fpr") #Create ROC curve data
plot(forest_ROC) #Plot ROC curve
plot(forest_ROC, add=TRUE, col="blue") #For comparison, overlay/add the ROC curve from (A) in red
legend("right", legend=c("Logistic","Random Forest"), col=c("red","blue"), lty=1:2, cex=0.6)

####AUC (area under curve)
AUC.tmp <- performance(forest_ROC_prediction,"auc") #Create AUC data
forest_AUC <- as.numeric(AUC.tmp@y.values) #Calculate AUC
forest_AUC #Display AUC value: 90+% - excellent, 80-90% - very good, 70-80% - good, 60-70% - so so, below 60% - not much value


######################################### TRAINING THE MODEL USING XGBOOST
library(xgboost)
training.x <-model.matrix(default.payment.next.month~ ., data = training)
testing.x <-model.matrix(default.payment.next.month~ ., data = testing)

model_XGboost<-xgboost(data = data.matrix(training.x[,-1:-3]),
                       label = as.numeric(as.character(training$default.payment.next.month)),
                       eta = 0.1,
                       max_depth = 20,
                       nround=50,
                       objective = "binary:logistic")

XGboost_prediction<-predict(model_XGboost,newdata=testing.x[,-1:-3], type="response") #Predict classification (for confusion matrix)
XGboost_prediction2<-as.factor(ifelse(XGboost_prediction>0.5,1,0))
confusionMatrix(XGboost_prediction2,testing$default.payment.next.month) #Display confusion matrix

####ROC Curve
XGboost_pred_testing <- prediction(XGboost_prediction, testing$default.payment.next.month) #Calculate errors
XGboost_ROC_testing <- performance(XGboost_pred_testing,"tpr","fpr") #Create ROC curve data
plot(XGboost_ROC_testing) #Plot ROC curve
plot(XGboost_ROC_testing, add=TRUE, col="green") #For comparison, overlay/add the ROC curve from (A) in red
legend("right", legend=c("Logistic","Random Forest","XGBoost"), col=c("red","blue","green"), lty=1:2, cex=0.6)
####AUC
auc.tmp <- performance(XGboost_pred_testing,"auc") #Create AUC data
XGboost_auc_testing <- as.numeric(auc.tmp@y.values) #Calculate AUC
XGboost_auc_testing #Display AUC value: 90+% - excellent, 80-90% - very good, 70-80% - good, 60-70% - so so, below 60% - not much value


######################TRAINING THE MODEL USING GRADIENT BOOSTING
library(gbm)
str(training)
training$default.payment.next.month<-as.character(training$default.payment.next.month)

model_ExtremeGradientBoosting<-gbm(default.payment.next.month~.-CurrentId,
                                   distribution="bernoulli",
                                   data=training,
                                   n.trees=1000,
                                   interaction.depth = 4,
                                   shrinkage = 0.01)

summary(model_ExtremeGradientBoosting)
GBM_prediction<-predict(model_ExtremeGradientBoosting,testing,n.trees = 1000,type = "response") #Predict classification (for confusion matrix)
GBM_prediction2<-as.factor(ifelse(GBM_prediction>0.5,1,0))
confusionMatrix(GBM_prediction2,testing$default.payment.next.month) #Display confusion matrix

####ROC Curve
GBM_pred_testing <- prediction(GBM_prediction, testing$default.payment.next.month) #Calculate errors
GBM_ROC_testing <- performance(GBM_pred_testing,"tpr","fpr") #Create ROC curve data
plot(GBM_ROC_testing) #Plot ROC curve
plot(GBM_ROC_testing, add=TRUE, col="green") #For comparison, overlay/add the ROC curve from (A) in red
legend("right", legend=c("Logistic","Random Forest","XGBoost"), col=c("red","blue","green"), lty=1:2, cex=0.6)
####AUC
auc.tmp <- performance(GBM_pred_testing,"auc") #Create AUC data
gbm_auc_testing <- as.numeric(auc.tmp@y.values) #Calculate AUC
gbm_auc_testing ##AUC Score WITHOUT Feature Engineering


plot(GBM_ROC_testing, add=TRUE, col="green") #For comparison, overlay/add the ROC curve from (A) in red
legend("right", legend=c("Without feature engineering"), col=c("green"), lty=1:2, cex=0.6)


#########################PREDICTING THE TWO NEW OBSERVATIONS
str(creditdata)
tail(creditdata)
CREDIT2<-subset(creditdata,CurrentId>15000)
tail(CREDIT2)

library(gbm)
str(training)
CREDIT1$default.payment.next.month<-as.character(CREDIT1$default.payment.next.month)

model_ExtremeGradientBoosting<-gbm(default.payment.next.month~.-CurrentId,
                                   distribution="bernoulli",
                                   data=CREDIT1,
                                   n.trees=1000,
                                   interaction.depth = 4,
                                   shrinkage = 0.01)
summary(model_ExtremeGradientBoosting)

GBM_prediction<-predict(model_ExtremeGradientBoosting,CREDIT2,n.trees = 1000,type = "response") #Predict classification (for confusion matrix)
GBM_prediction2<-as.factor(ifelse(GBM_prediction>0.5,1,0))

CREDIT2


str(CREDIT2)

str(creditdata)


##################################### PERFORMING FEATURE ENGINEERING

creditdata<-CREDIT1
str(creditdata)
creditdata$SEX<-as.factor(creditdata$SEX)
creditdata$EDUCATION<-as.factor(creditdata$EDUCATION)
creditdata$MARRIAGE<-as.factor(creditdata$MARRIAGE)

creditdata$PAY_0<-as.factor(creditdata$PAY_0)
creditdata$PAY_2<-as.factor(creditdata$PAY_2)
creditdata$PAY_3<-as.factor(creditdata$PAY_3)
creditdata$PAY_4<-as.factor(creditdata$PAY_4)
creditdata$PAY_5<-as.factor(creditdata$PAY_5)
creditdata$PAY_6<-as.factor(creditdata$PAY_6)
creditdata$LIMIT_BAL<-as.numeric(creditdata$LIMIT_BAL)
creditdata$BILL_AMT1<-as.numeric(creditdata$BILL_AMT1)
creditdata$BILL_AMT2<-as.numeric(creditdata$BILL_AMT2)
creditdata$BILL_AMT3<-as.numeric(creditdata$BILL_AMT3)
creditdata$BILL_AMT4<-as.numeric(creditdata$BILL_AMT4)
creditdata$BILL_AMT5<-as.numeric(creditdata$BILL_AMT5)
creditdata$BILL_AMT6<-as.numeric(creditdata$BILL_AMT6)
creditdata$PAY_AMT1<-as.numeric(creditdata$PAY_AMT1)
creditdata$PAY_AMT3<-as.numeric(creditdata$PAY_AMT3)
creditdata$PAY_AMT4<-as.numeric(creditdata$PAY_AMT4)
creditdata$PAY_AMT5<-as.numeric(creditdata$PAY_AMT5)
creditdata$AGE<-as.numeric(creditdata$AGE)
creditdata$default.payment.next.month<-as.factor(creditdata$default.payment.next.month)
str(creditdata)
table(creditdata$AGE)
sapply(creditdata, function(x)sum(is.na(x)))

names(creditdata)[1]<-"CurrentId"

summary(creditdata)
str(creditdata)
######################feature engineering for sex
#creditdata$SEX1<-as.numeric(creditdata$SEX)
#creditdata$SexLimit<-creditdata$LIMIT_BAL*creditdata$SEX1
#creditdata$Billamt1sex<-creditdata$BILL_AMT1*creditdata$SEX1
#creditdata$Billamt2sex<-creditdata$BILL_AMT2*creditdata$SEX1
#creditdata$Billamt3sex<-creditdata$BILL_AMT3*creditdata$SEX1
#creditdata$Billamt4sex<-creditdata$BILL_AMT4*creditdata$SEX1
#creditdata$Billamt5sex<-creditdata$BILL_AMT5*creditdata$SEX1
#creditdata$Billamt6sex<-creditdata$BILL_AMT6*creditdata$SEX1

#creditdata$PayAmt1sex<-creditdata$PAY_AMT1*creditdata$SEX1
#creditdata$PayAmt2sex<-creditdata$PAY_AMT2*creditdata$SEX1
#creditdata$PayAmt3sex<-creditdata$PAY_AMT3*creditdata$SEX1
#creditdata$PayAmt4sex<-creditdata$PAY_AMT4*creditdata$SEX1
#creditdata$PayAmt5sex<-creditdata$PAY_AMT5*creditdata$SEX1
#creditdata$PayAmt6sex<-creditdata$PAY_AMT6*creditdata$SEX1

######################feature engineering for education
#########creditdata$EDUCATION2<-creditdata$EDUCATION

##########creditdata$EDUCATION2<-as.character(creditdata$EDUCATION2)
###########creditdata[which(creditdata$EDUCATION2==0),"EDUCATION2"]<-4
#creditdata[which(creditdata$EDUCATION2==4),"EDUCATION2"]<-4
#creditdata[which(creditdata$EDUCATION2==5),"EDUCATION2"]<-4
#creditdata[which(creditdata$EDUCATION2==6),"EDUCATION2"]<-4
#creditdata$EDUCATION2<-as.factor(creditdata$EDUCATION2)

#creditdata$EDUCATION1<-as.numeric(creditdata$EDUCATION)
#creditdata$eduLimit<-creditdata$LIMIT_BAL*creditdata$EDUCATION1
#creditdata$Billamt1edu<-creditdata$BILL_AMT1*creditdata$EDUCATION1
#creditdata$Billamt2edu<-creditdata$BILL_AMT2*creditdata$EDUCATION1
#creditdata$Billamt3edu<-creditdata$BILL_AMT3*creditdata$EDUCATION1
#creditdata$Billamt4edu<-creditdata$BILL_AMT4*creditdata$EDUCATION1
#creditdata$Billamt5edu<-creditdata$BILL_AMT5*creditdata$EDUCATION1
#creditdata$Billamt6edu<-creditdata$BILL_AMT6*creditdata$EDUCATION1

#creditdata$PayAmt1edu<-creditdata$PAY_AMT1*creditdata$EDUCATION1
#creditdata$PayAmt2edu<-creditdata$PAY_AMT2*creditdata$EDUCATION1
#creditdata$PayAmt3edu<-creditdata$PAY_AMT3*creditdata$EDUCATION1
#creditdata$PayAmt4edu<-creditdata$PAY_AMT4*creditdata$EDUCATION1
#creditdata$PayAmt5edu<-creditdata$PAY_AMT5*creditdata$EDUCATION1
#creditdata$PayAmt6edu<-creditdata$PAY_AMT6*creditdata$EDUCATION1


######################feature engineering for MARRIAGE
#creditdata$MARRIAGE1<-as.numeric(creditdata$MARRIAGE)
#creditdata$MarriageLimit<-creditdata$LIMIT_BAL*creditdata$MARRIAGE1
#creditdata$Billamt1mARRIAGE<-creditdata$BILL_AMT1*creditdata$MARRIAGE1
#creditdata$Billamt2mARRIAGE<-creditdata$BILL_AMT2*creditdata$MARRIAGE1
#creditdata$Billamt3mARRIAGE<-creditdata$BILL_AMT3*creditdata$MARRIAGE1
#creditdata$Billamt4mARRIAGE<-creditdata$BILL_AMT4*creditdata$MARRIAGE1
#creditdata$Billamt5mARRIAGE<-creditdata$BILL_AMT5*creditdata$MARRIAGE1
#creditdata$Billamt6mARRIAGE<-creditdata$BILL_AMT6*creditdata$MARRIAGE1

#creditdata$PayAmt1MARRIAGE<-creditdata$PAY_AMT1*creditdata$MARRIAGE1
#creditdata$PayAmt2MARRIAGE<-creditdata$PAY_AMT2creditdata$MARRIAGE1
#creditdata$PayAmt3MARRIAGE<-creditdata$PAY_AMT3*creditdata$MARRIAGE1
#creditdata$PayAmt4MARRIAGE<-creditdata$PAY_AMT4*creditdata$MARRIAGE1
#creditdata$PayAmt5MARRIAGE<-creditdata$PAY_AMT5*creditdata$MARRIAGE1
#creditdata$PayAmt6MARRIAGE<-creditdata$PAY_AMT6*creditdata$MARRIAGE1

#######RUN THIS
creditdata$Month1<-(creditdata$BILL_AMT1-creditdata$BILL_AMT2)
creditdata$Month2<-(creditdata$BILL_AMT2-creditdata$BILL_AMT3)
creditdata$Month3<-(creditdata$BILL_AMT3-creditdata$BILL_AMT4)
creditdata$Month4<-(creditdata$BILL_AMT4-creditdata$BILL_AMT5)
creditdata$Month5<-(creditdata$BILL_AMT5-creditdata$BILL_AMT6)

creditdata$DiffPayAmt1<-(creditdata$PAY_AMT1-creditdata$PAY_AMT2)
creditdata$DiffPayAmt2<-(creditdata$PAY_AMT2-creditdata$PAY_AMT3)
creditdata$DiffPayAmt3<-(creditdata$PAY_AMT3-creditdata$PAY_AMT4)
creditdata$DiffPayAmt4<-(creditdata$PAY_AMT4-creditdata$PAY_AMT5)
creditdata$DiffPayAmt5<-(creditdata$PAY_AMT5-creditdata$PAY_AMT6)


str(creditdata)
creditdata$Month1BalanceProp<-max(creditdata$BILL_AMT1,0)/creditdata$LIMIT_BAL
creditdata$Month2BalanceProp<-max(creditdata$BILL_AMT2,0)/creditdata$LIMIT_BAL
creditdata$Month3BalanceProp<-max(creditdata$BILL_AMT3,0)/creditdata$LIMIT_BAL
creditdata$Month4BalanceProp<-max(creditdata$BILL_AMT4,0)/creditdata$LIMIT_BAL
creditdata$Month5BalanceProp<-max(creditdata$BILL_AMT5,0)/creditdata$LIMIT_BAL
creditdata$Month6BalanceProp<-max(creditdata$BILL_AMT6,0)/creditdata$LIMIT_BAL
creditdata$AggregateLimBalance<-max(creditdata$BILL_AMT1+creditdata$BILL_AMT2+
                                      creditdata$BILL_AMT3+creditdata$BILL_AMT4+
                                      creditdata$BILL_AMT5+creditdata$BILL_AMT6,0)/creditdata$LIMIT_BAL

#####RUN THIS
creditdata$PayAmtBalance1<-ifelse(creditdata$BILL_AMT1<=0,1,(creditdata$PAY_AMT1/creditdata$BILL_AMT1))
creditdata$PayAmtBalance2<-ifelse(creditdata$BILL_AMT2<=0,1,(creditdata$PAY_AMT2/creditdata$BILL_AMT2))
creditdata$PayAmtBalance3<-ifelse(creditdata$BILL_AMT3<=0,1,(creditdata$PAY_AMT3/creditdata$BILL_AMT3))
creditdata$PayAmtBalance4<-ifelse(creditdata$BILL_AMT4<=0,1,(creditdata$PAY_AMT4/creditdata$BILL_AMT4))
creditdata$PayAmtBalance5<-ifelse(creditdata$BILL_AMT5<=0,1,(creditdata$PAY_AMT5/creditdata$BILL_AMT5))
creditdata$PayAmtBalance6<-ifelse(creditdata$BILL_AMT6<=0,1,(creditdata$PAY_AMT6/creditdata$BILL_AMT6))
#####RUN THIS
creditdata$TotalBalancedue<-creditdata$BILL_AMT1+creditdata$BILL_AMT2+creditdata$BILL_AMT3+creditdata$BILL_AMT4+
  creditdata$BILL_AMT5+creditdata$BILL_AMT6
##########RUN THIS
creditdata$PayAmtdue<-creditdata$PAY_AMT1+creditdata$PAY_AMT2+creditdata$PAY_AMT3+creditdata$PAY_AMT4+
  creditdata$PAY_AMT5+creditdata$PAY_AMT6
#######RUN THIS
creditdata$Difference<-creditdata$TotalBalancedue-creditdata$PayAmtdue
creditdata$Difference1<-creditdata$LIMIT_BAL-creditdata$TotalBalancedue
creditdata$Difference2<-creditdata$LIMIT_BAL-creditdata$PayAmtdue
creditdata$TWOMONTHBILLAMNT1<-creditdata$BILL_AMT1+creditdata$BILL_AMT2
creditdata$TWOMONTHBILLAMNT2<-creditdata$BILL_AMT3+creditdata$BILL_AMT4
creditdata$TWOMONTHBILLAMNT3<-creditdata$BILL_AMT5+creditdata$BILL_AMT6
creditdata$TWOMONTHpayAMNT1<-creditdata$PAY_AMT1+creditdata$PAY_AMT2
creditdata$TWOMONTHpayAMNT2<-creditdata$PAY_AMT3+creditdata$PAY_AMT4
creditdata$TWOMONTHpayAMNT3<-creditdata$PAY_AMT5+creditdata$PAY_AMT6

creditdata$D1<-creditdata$BILL_AMT1-creditdata$PAY_AMT1
creditdata$D2<-creditdata$BILL_AMT2-creditdata$PAY_AMT2
creditdata$D3<-creditdata$BILL_AMT3-creditdata$PAY_AMT3
creditdata$D4<-creditdata$BILL_AMT4-creditdata$PAY_AMT4
creditdata$D5<-creditdata$BILL_AMT5-creditdata$PAY_AMT5
creditdata$D6<-creditdata$BILL_AMT6-creditdata$PAY_AMT6


creditdata$newdiff1<-creditdata$BILL_AMT1-creditdata$BILL_AMT3
creditdata$newdiff2<-creditdata$BILL_AMT3-creditdata$BILL_AMT5
creditdata$newdiff3<-creditdata$BILL_AMT2-creditdata$BILL_AMT4
creditdata$newdiff4<-creditdata$BILL_AMT4-creditdata$BILL_AMT6


creditdata$newpaydeiff1<-creditdata$PAY_AMT1-creditdata$PAY_AMT3
creditdata$newpaydeiff2<-creditdata$PAY_AMT3-creditdata$PAY_AMT5
creditdata$newpaydeiff3<-creditdata$PAY_AMT2-creditdata$PAY_AMT4
creditdata$newpaydeiff2<-creditdata$PAY_AMT4-creditdata$PAY_AMT6


#creditdata$extremedeiff1<-creditdata$BILL_AMT1-creditdata$BILL_AMT6
#creditdata$extremedeiff2<-creditdata$BILL_AMT1-creditdata$BILL_AMT5
#creditdata$extremedeiff3<-creditdata$BILL_AMT1-creditdata$BILL_AMT4
#creditdata$extremedeiff4<-creditdata$BILL_AMT1-creditdata$BILL_AMT3
#creditdata$extremedeiff5<-creditdata$BILL_AMT2-creditdata$BILL_AMT6
#creditdata$extremedeiff6<-creditdata$BILL_AMT2-creditdata$BILL_AMT5
#creditdata$extremedeiff7<-creditdata$BILL_AMT2-creditdata$BILL_AMT4
#creditdata$extremedeiff8<-creditdata$BILL_AMT3-creditdata$BILL_AMT6
#creditdata$extremedeiff9<-creditdata$BILL_AMT3-creditdata$BILL_AMT5
#creditdata$extremedeiff10<-creditdata$BILL_AMT4-creditdata$BILL_AMT6


#creditdata$extremediffpaydue<-creditdata$PAY_AMT1-creditdata$PAY_AMT6
#creditdata$extremediffpaydue2<-creditdata$PAY_AMT1-creditdata$PAY_AMT5
#creditdata$extremediffpaydue3<-creditdata$PAY_AMT1-creditdata$PAY_AMT4
#creditdata$extremediffpaydue4<-creditdata$PAY_AMT1-creditdata$PAY_AMT3
#creditdata$extremediffpaydue10<-creditdata$PAY_AMT2-creditdata$PAY_AMT6
#creditdata$extremediffpaydue5<-creditdata$PAY_AMT2-creditdata$PAY_AMT5
#creditdata$extremediffpaydue6<-creditdata$PAY_AMT2-creditdata$PAY_AMT4
#creditdata$extremediffpaydue7<-creditdata$PAY_AMT3-creditdata$PAY_AMT6
#creditdata$extremediffpaydue8<-creditdata$PAY_AMT3-creditdata$PAY_AMT5
#creditdata$extremediffpaydue9<-creditdata$PAY_AMT4-creditdata$PAY_AMT6


creditdata$ThreemonthBillAmtDue<-creditdata$BILL_AMT1+creditdata$BILL_AMT2+creditdata$BILL_AMT3
creditdata$ThreemonthBillAmtDue2<-creditdata$BILL_AMT4+creditdata$BILL_AMT5+creditdata$BILL_AMT6

creditdata$ThreemonthPayAmtDue<-creditdata$PAY_AMT1+creditdata$PAY_AMT2+creditdata$PAY_AMT3
creditdata$ThreemonthPayAmtDue2<-creditdata$PAY_AMT4+creditdata$PAY_AMT5+creditdata$PAY_AMT6


###############################CORRECTING PAY_0
#creditdata$PAY_00<-creditdata$PAY_0
#R1<-which(creditdata$PAY_00==4)
#R2<-which(creditdata$PAY_00==5)
#R3<-which(creditdata$PAY_00==6)
#R4<-which(creditdata$PAY_00==7)
#R5<-which(creditdata$PAY_00==8)
#R6<-which(creditdata$PAY_00==-2)
#R7<-which(creditdata$PAY_00==-1)
#R8<-which(creditdata$PAY_00==0)


#creditdata[R1,"PAY_00"]<-4
#creditdata[R2,"PAY_00"]<-4
#creditdata[R3,"PAY_00"]<-4
#creditdata[R4,"PAY_00"]<-4
#creditdata[R5,"PAY_00"]<-4
#creditdata[R6,"PAY_00"]<-0
#creditdata[R7,"PAY_00"]<-0
#creditdata[R8,"PAY_00"]<-0


###############################CORRECTING PAY_2
creditdata$PAY_02<-creditdata$PAY_2
R1<-which(creditdata$PAY_02==4)
R2<-which(creditdata$PAY_02==5)
R3<-which(creditdata$PAY_02==6)
R4<-which(creditdata$PAY_02==7)
R5<-which(creditdata$PAY_02==8)
R6<-which(creditdata$PAY_02==-2)
R7<-which(creditdata$PAY_02==-1)
R8<-which(creditdata$PAY_02==0)


creditdata[R1,"PAY_02"]<-4
creditdata[R2,"PAY_02"]<-4
creditdata[R3,"PAY_02"]<-4
creditdata[R4,"PAY_02"]<-4
creditdata[R5,"PAY_02"]<-4
creditdata[R6,"PAY_02"]<-0
creditdata[R7,"PAY_02"]<-0
creditdata[R8,"PAY_02"]<-0

###############################CORRECTING PAY_3
#creditdata$PAY_03<-creditdata$PAY_3
#R1<-which(creditdata$PAY_03==4)
#R2<-which(creditdata$PAY_03==5)
#R3<-which(creditdata$PAY_03==6)
#R4<-which(creditdata$PAY_03==7)
#R5<-which(creditdata$PAY_03==8)
#R6<-which(creditdata$PAY_03==-2)
#R7<-which(creditdata$PAY_03==-1)
#R8<-which(creditdata$PAY_03==0)


#creditdata[R1,"PAY_03"]<-4
#creditdata[R2,"PAY_03"]<-4
#creditdata[R3,"PAY_03"]<-4
#creditdata[R4,"PAY_03"]<-4
#creditdata[R5,"PAY_03"]<-4
#creditdata[R6,"PAY_03"]<-0
#creditdata[R7,"PAY_03"]<-0
#creditdata[R8,"PAY_03"]<-0

###############################CORRECTING PAY_5
creditdata$PAY_05<-creditdata$PAY_5
R1<-which(creditdata$PAY_05==4)
R2<-which(creditdata$PAY_05==5)
R3<-which(creditdata$PAY_05==6)
R4<-which(creditdata$PAY_05==7)
R5<-which(creditdata$PAY_05==8)
R6<-which(creditdata$PAY_05==-2)
R7<-which(creditdata$PAY_05==-1)
R8<-which(creditdata$PAY_05==0)


creditdata[R1,"PAY_05"]<-4
creditdata[R2,"PAY_05"]<-4
creditdata[R3,"PAY_05"]<-4
creditdata[R4,"PAY_05"]<-4
creditdata[R5,"PAY_05"]<-4
creditdata[R6,"PAY_05"]<-0
creditdata[R7,"PAY_05"]<-0
creditdata[R8,"PAY_05"]<-0


###############################CORRECTING PAY_6
creditdata$PAY_06<-creditdata$PAY_6
R1<-which(creditdata$PAY_06==4)
R2<-which(creditdata$PAY_06==5)
R3<-which(creditdata$PAY_06==6)
R4<-which(creditdata$PAY_06==7)
R5<-which(creditdata$PAY_06==8)
R6<-which(creditdata$PAY_06==-2)
R7<-which(creditdata$PAY_06==-1)
R8<-which(creditdata$PAY_06==0)


creditdata[R1,"PAY_06"]<-4
creditdata[R2,"PAY_06"]<-4
creditdata[R3,"PAY_06"]<-4
creditdata[R4,"PAY_06"]<-4
creditdata[R5,"PAY_06"]<-4
creditdata[R6,"PAY_06"]<-0
creditdata[R7,"PAY_06"]<-0
creditdata[R8,"PAY_06"]<-0


############CORRECTING EDUCATION
creditdata$EDUCATION2<-creditdata$EDUCATION
Row1<-which(creditdata$EDUCATION2==4)
Row2<-which(creditdata$EDUCATION2==5)
Row3<-which(creditdata$EDUCATION2==6)
Row4<-which(creditdata$EDUCATION2==0)

creditdata[Row1,"EDUCATION2"]<-0
creditdata[Row2,"EDUCATION2"]<-0
creditdata[Row3,"EDUCATION2"]<-0
creditdata[Row4,"EDUCATION2"]<-0


############CORRECTING MARRIAGE
creditdata$Mariage2<-creditdata$MARRIAGE
Row1<-which(creditdata$Mariage2==0)

creditdata[Row1,"Mariage2"]<-3

str(creditdata)
table(creditdata$AGE)


creditdata3<-creditdata

row1<-which(creditdata3$BILL_AMT1==0 & creditdata3$BILL_AMT2==0&creditdata3$BILL_AMT3==0&creditdata3$BILL_AMT4==0&
              creditdata3$BILL_AMT5==0&creditdata3$BILL_AMT6==0&creditdata3$default.payment.next.month==1)


creditdata3<-creditdata3[-row1,]
str(creditdata3)
str(creditdata)
############club 4, 5 and 6 in education as others and include 0 values
########### club 3 in marriage as others


sapply(training1, function(x) sum(is.na(x)))


hist(creditdata$Month2)


###################Creating Data Partition
set.seed(77800)
inTrain1 <- createDataPartition(y = creditdata3$default.payment.next.month,
                                p = 12500/14837, list = FALSE)
training <- creditdata3[ inTrain1,]
testing <- creditdata3[ -inTrain1,]

str(testing)

######################Extreme Gracdient Boosting with feature engineered variables
library(gbm)
str(training)
training$default.payment.next.month<-as.character(training$default.payment.next.month)


model_ExtremeGradientBoosting<-gbm(default.payment.next.month ~ .,
                                   distribution="bernoulli",
                                   data=training,
                                   n.trees=1000,
                                   interaction.depth = 5,
                                   shrinkage = 0.016)


summary(model_ExtremeGradientBoosting)
gbm_prediction<-predict(model_ExtremeGradientBoosting,testing,n.trees = 1000,type = "response") #Predict classification (for confusion matrix)
gbm_prediction2<-as.factor(ifelse(gbm_prediction>0.5,1,0))
confusionMatrix(gbm_prediction2,testing$default.payment.next.month) #Display confusion matrix

####ROC Curve
gbm_pred_testing <- prediction(gbm_prediction, testing$default.payment.next.month) #Calculate errors
gbm_ROC_testing <- performance(gbm_pred_testing,"tpr","fpr") #Create ROC curve data
plot(gbm_ROC_testing) #Plot ROC curve
plot(gbm_ROC_testing, add=TRUE, col="blue") #For comparison, overlay/add the ROC curve from (A) in red
legend("right", legend=c("Without feature engineering","With feature engineering"), col=c("green","blue"), lty=1:2, cex=0.6)
####AUC
auc.tmp <- performance(gbm_pred_testing,"auc") #Create AUC data
GBM_auc_testing <- as.numeric(auc.tmp@y.values) #Calculate AUC
GBM_auc_testing #D