Skip to content

Instantly share code, notes, and snippets.

@yabyzq
Created July 10, 2017 10:46
Show Gist options
  • Save yabyzq/aef92cdc432e5086c00b2d75af884dde to your computer and use it in GitHub Desktop.
Save yabyzq/aef92cdc432e5086c00b2d75af884dde to your computer and use it in GitHub Desktop.
Ensembling
library(caret)
library('RANN')
set.seed(1)

# Load the loan data. stringsAsFactors = TRUE keeps Loan_Status (and the other
# text columns) as factors on R >= 4.0, where the default became FALSE;
# confusionMatrix() further down requires factor inputs.
data <- read.csv(
  url('https://datahack-prod.s3.ap-south-1.amazonaws.com/train_file/train_u6lujuX_CVtuZ9i.csv'),
  stringsAsFactors = TRUE
)

# Stratified 75/25 split on the outcome. The split is done BEFORE fitting the
# pre-processing so that the medians, centres and scales are estimated from the
# training rows only and no information from the test rows leaks in.
index <- createDataPartition(data$Loan_Status, p = 0.75, list = FALSE)

# Median-impute, centre and scale, learned on the training rows, then applied
# unchanged to both partitions.
preProcValues <- preProcess(
  data[index, ],
  method = c("medianImpute", "center", "scale")
)
trainSet <- predict(preProcValues, data[index, ])
testSet <- predict(preProcValues, data[-index, ])
# Resampling setup shared by the base models: 5-fold cross-validation, keeping
# the held-out predictions of the final tuning and the class probabilities.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
fitControl <- trainControl(
  method = "cv",
  number = 5,
  savePredictions = 'final',
  classProbs = TRUE
)

# Predictor columns and outcome column used by every model below.
predictors <- c(
  "Credit_History", "LoanAmount", "Loan_Amount_Term",
  "ApplicantIncome", "CoapplicantIncome"
)
outcomeName <- 'Loan_Status'
# Base model 1: random forest, tuned over 3 candidate values.
model_rf <- train(
  trainSet[, predictors], trainSet[, outcomeName],
  method = 'rf', trControl = fitControl, tuneLength = 3
)
testSet$pred_rf <- predict(object = model_rf, testSet[, predictors])
# confusionMatrix() expects predictions as `data` and the truth as `reference`;
# passing them the other way round transposes the table and swaps
# sensitivity/specificity with the predictive values.
confusionMatrix(data = testSet$pred_rf, reference = testSet$Loan_Status)
# Base model 2: k-nearest neighbours, tuned over 3 candidate values.
model_knn <- train(
  trainSet[, predictors], trainSet[, outcomeName],
  method = 'knn', trControl = fitControl, tuneLength = 3
)
testSet$pred_knn <- predict(object = model_knn, testSet[, predictors])
# Predictions go in `data`, observed classes in `reference`, so that the
# per-class statistics are reported for the right roles.
confusionMatrix(data = testSet$pred_knn, reference = testSet$Loan_Status)
# Base model 3: logistic regression (caret method 'glm').
model_lr <- train(
  trainSet[, predictors], trainSet[, outcomeName],
  method = 'glm', trControl = fitControl, tuneLength = 3
)
testSet$pred_lr <- predict(object = model_lr, testSet[, predictors])
# Predictions go in `data`, observed classes in `reference`, so that the
# per-class statistics are reported for the right roles.
confusionMatrix(data = testSet$pred_lr, reference = testSet$Loan_Status)
# Averaging ensemble: take the mean of the three models' predicted
# probabilities of class 'Y' and label a row 'Y' when that mean exceeds 0.5.
testSet$pred_rf_prob <- predict(object = model_rf, testSet[, predictors], type = 'prob')
testSet$pred_knn_prob <- predict(object = model_knn, testSet[, predictors], type = 'prob')
testSet$pred_lr_prob <- predict(object = model_lr, testSet[, predictors], type = 'prob')

avg_prob_y <- (testSet$pred_rf_prob$Y +
  testSet$pred_knn_prob$Y +
  testSet$pred_lr_prob$Y) / 3

# Explicit levels keep both classes present even if every row is assigned to
# the same class; as.factor() would drop the unused level and the factor would
# no longer line up with the reference in confusionMatrix().
testSet$pred_avg <- factor(ifelse(avg_prob_y > 0.5, 'Y', 'N'), levels = c('N', 'Y'))

# Predictions are `data`, observed classes are `reference`.
confusionMatrix(data = testSet$pred_avg, reference = testSet$Loan_Status)
# Majority-voting ensemble: a row is labelled 'Y' when at least two of the
# three base models predict 'Y', otherwise 'N'.
votes_for_y <- (testSet$pred_rf == 'Y') +
  (testSet$pred_knn == 'Y') +
  (testSet$pred_lr == 'Y')

# Explicit levels keep both classes present even if the vote is unanimous for
# every row, so the factor always lines up with the reference.
testSet$pred_majority <- factor(ifelse(votes_for_y >= 2, 'Y', 'N'), levels = c('N', 'Y'))

# Predictions are `data`, observed classes are `reference`.
confusionMatrix(data = testSet$pred_majority, reference = testSet$Loan_Status)
# Stacking, step 1: refit the base models with 10-fold cross-validation so
# their out-of-fold predictions can feed a top-layer model.

# Training control for the refit.
fitControl <- trainControl(
  method = "cv",
  number = 10,
  savePredictions = 'final', # Save out-of-fold predictions for the best parameter combination
  classProbs = TRUE # Save the class probabilities of the out-of-fold predictions
)

# Predictor columns and outcome column (same as for the earlier models).
predictors <- c(
  "Credit_History", "LoanAmount", "Loan_Amount_Term",
  "ApplicantIncome", "CoapplicantIncome"
)
outcomeName <- 'Loan_Status'

# Refit the three base models: random forest, k-nearest neighbours and
# logistic regression.
model_rf <- train(
  trainSet[, predictors], trainSet[, outcomeName],
  method = 'rf', trControl = fitControl, tuneLength = 3
)
model_knn <- train(
  trainSet[, predictors], trainSet[, outcomeName],
  method = 'knn', trControl = fitControl, tuneLength = 3
)
model_lr <- train(
  trainSet[, predictors], trainSet[, outcomeName],
  method = 'glm', trControl = fitControl, tuneLength = 3
)
# Out-of-fold probabilities of class 'Y' for the training rows. Each model's
# saved predictions are sorted by rowIndex before being attached to trainSet.
trainSet$OOF_pred_rf <- with(model_rf$pred, Y[order(rowIndex)])
trainSet$OOF_pred_knn <- with(model_knn$pred, Y[order(rowIndex)])
trainSet$OOF_pred_lr <- with(model_lr$pred, Y[order(rowIndex)])

# Matching class-'Y' probabilities for the test rows, predicted by the fitted
# base models and stored under the same column names.
prob_rf_test <- predict(model_rf, testSet[predictors], type = 'prob')
testSet$OOF_pred_rf <- prob_rf_test$Y
prob_knn_test <- predict(model_knn, testSet[predictors], type = 'prob')
testSet$OOF_pred_knn <- prob_knn_test$Y
prob_lr_test <- predict(model_lr, testSet[predictors], type = 'prob')
testSet$OOF_pred_lr <- prob_lr_test$Y
# Inputs of the top-layer (stacked) models: the base models' probabilities.
predictors_top <- c('OOF_pred_rf', 'OOF_pred_knn', 'OOF_pred_lr')

# Top-layer model 1: gradient boosting (caret method 'gbm') trained on the
# out-of-fold probabilities.
model_gbm <- train(
  trainSet[, predictors_top], trainSet[, outcomeName],
  method = 'gbm', trControl = fitControl, tuneLength = 3
)
testSet$gbm_stacked <- predict(model_gbm, testSet[, predictors_top])
# Predictions are `data`, observed classes are `reference`; the reverse order
# transposes the table and mislabels the per-class statistics.
confusionMatrix(data = testSet$gbm_stacked, reference = testSet$Loan_Status)
# Top-layer model 2: logistic regression (caret method 'glm') trained on the
# same stacked inputs.
model_glm <- train(
  trainSet[, predictors_top], trainSet[, outcomeName],
  method = 'glm', trControl = fitControl, tuneLength = 3
)
testSet$glm_stacked <- predict(model_glm, testSet[, predictors_top])
# Predictions are `data`, observed classes are `reference`; the reverse order
# transposes the table and mislabels the per-class statistics.
confusionMatrix(data = testSet$glm_stacked, reference = testSet$Loan_Status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment