Skip to content

Instantly share code, notes, and snippets.

@gauravgola96
Created January 28, 2018 10:08
Show Gist options
  • Save gauravgola96/d453d9dcbddedd99d8aa867e600944f0 to your computer and use it in GitHub Desktop.
Save gauravgola96/d453d9dcbddedd99d8aa867e600944f0 to your computer and use it in GitHub Desktop.
Loan prediction (Analytics Vidhya)
# Load the loan-prediction train and test sets.
# Empty cells, single-space cells and the literal text "NA" are read as
# missing values.
# stringsAsFactors = TRUE is set explicitly because the rest of this script
# relies on text columns being factors (is.factor() checks, assigning factor
# levels during imputation, a factor target for mlr). Since R 4.0.0 read.csv()
# no longer converts strings to factors by default.
missing_markers <- c("", " ", "NA")
train <- read.csv(
  "C:\\Users\\Gaurav_Gola\\Desktop\\project\\loan prediction\\train.csv",
  na.strings = missing_markers,
  stringsAsFactors = TRUE
)
test <- read.csv(
  "C:\\Users\\Gaurav_Gola\\Desktop\\project\\loan prediction\\test.csv",
  na.strings = missing_markers,
  stringsAsFactors = TRUE
)
library(mlr)
# Per-column overview of both data sets.
summarizeColumns(train)
summarizeColumns(test)
# Data visualization
# Categorical columns: for every factor column of `train` that is also present
# in `test` (Loan_ID excluded), draw the train and test bar plots side by side
# and print the level proportions of each data set.
shared_cols <- intersect(colnames(train), colnames(test))
for (col in shared_cols[shared_cols != "Loan_ID"]) {
  if (!is.factor(train[[col]])) {
    next
  }
  par(mfrow = c(1, 2))
  train_counts <- table(train[[col]])
  test_counts <- table(test[[col]])
  barplot(train_counts, main = paste("train", col))
  barplot(test_counts, main = paste("test", col))
  # The label and the proportions are printed together as one vector.
  print(c(paste("train", col), prop.table(train_counts)))
  print(c(paste("test", col), prop.table(test_counts)))
}
# Numeric columns: for every numeric column of `train` that is also present in
# `test` (Loan_Amount_Term excluded), draw the train and test box plots side
# by side.
common_cols <- intersect(colnames(train), colnames(test))
for (col in common_cols[common_cols != "Loan_Amount_Term"]) {
  if (!is.numeric(train[[col]])) {
    next
  }
  par(mfrow = c(1, 2))
  boxplot(train[[col]], main = paste("train", col))
  boxplot(test[[col]], main = paste("test", col))
}
# Loan_Amount_Term: list its distinct values and compare the train and test
# histograms side by side.
unique(train$Loan_Amount_Term)
par(mfrow = c(1, 2))
hist(train$Loan_Amount_Term)
hist(test$Loan_Amount_Term)

## Credit history
## unique(train$Credit_History)
# Credit_History is treated as a categorical variable from here on, so it is
# converted to a factor in both data sets.
train$Credit_History <- as.factor(train$Credit_History)
test$Credit_History <- as.factor(test$Credit_History)

# Level counts as bar plots, then the level proportions.
train_credit <- table(train$Credit_History)
test_credit <- table(test$Credit_History)
par(mfrow = c(1, 2))
barplot(train_credit, main = "train credit history")
barplot(test_credit, main = "test credit history")
prop.table(train_credit)
prop.table(test_credit)
#### Loan_Status by other variables
## Categorical predictors: one bar chart of Loan_Status per factor column,
## faceted by the levels of that column.
library(ggplot2)
# Columns that must not be plotted. The previous test `i != c(...)` produced a
# length-3 condition, which is an error in R >= 4.2 (older versions silently
# used only the first element, so only Loan_ID was actually excluded).
skip_cols <- c("Loan_ID", "Loan_Amount_Term", "Loan_Status")
for (i in setdiff(colnames(train), skip_cols)) {
  if (is.factor(train[, i])) {
    # Columns are referenced by name inside the plot (not as train$...), and
    # the facet formula `. ~ <column>` is built from the column name.
    # ggplot2 draws with grid graphics, so no par(mfrow = ...) call is needed.
    print(
      ggplot(train, aes(x = Loan_Status)) +
        geom_bar() +
        facet_grid(as.formula(paste(". ~", i))) +
        ggtitle(i)
    )
  }
}
# Numeric predictors versus the dependent variable (Loan_Status): one box plot
# per numeric column, split by Loan_Status.
library(ggplot2)
# Columns that must not be plotted. The previous test `i != c(...)` produced a
# length-3 condition, which is an error in R >= 4.2 (older versions silently
# used only the first element).
excluded_cols <- c("Loan_ID", "Loan_Amount_Term", "Loan_Status")
for (i in setdiff(colnames(train), excluded_cols)) {
  if (is.numeric(train[, i])) {
    # `.data[[i]]` looks the column up by its name inside the plot data;
    # the y axis is labelled with the column name.
    # ggplot2 draws with grid graphics, so no par(mfrow = ...) call is needed.
    print(
      ggplot(train, aes(x = Loan_Status, y = .data[[i]])) +
        geom_boxplot() +
        labs(y = i) +
        ggtitle(i)
    )
  }
}
# Missing values imputation
# Stack columns 2 to 12 of train and test so that imputation is done once on
# the combined data.
full_data <- rbind(train[, 2:12], test[, 2:12])

# The first variables dealt with are ApplicantIncome and CoapplicantIncome.
# Some of the applicants are male, so presumably their coapplicants are
# female, and vice versa.
library(ggplot2)
# Layers shared by all income plots: dodged bars, one facet row per Gender.
income_layers <- list(geom_bar(position = "dodge"), facet_grid(Gender ~ .))

# Applicants with an income of 20000 or more are left out of these two plots.
income_below_20k <- full_data[full_data$ApplicantIncome < 20000, ]
print(ggplot(data = income_below_20k, aes(ApplicantIncome, fill = Married)) +
  income_layers)
# No difference observed in ApplicantIncome, so look at CoapplicantIncome.
print(ggplot(data = income_below_20k, aes(CoapplicantIncome, fill = Married)) +
  income_layers)
# Author's observations on coapplicant income:
#   * male, married     - high income
#   * male, unmarried   - low income (almost zero): for most unmarried male
#     applicants the coapplicant has zero or generally low income, although
#     there are a few cases where the coapplicant has a high income
#   * female, married   - low income
#   * more of the female applicants are not married, and for almost all of
#     them the coapplicant has zero income; this is taken to mean that there
#     is no coapplicant
#   * male, married     - coapplicant income zero or low
#   * female, unmarried - coapplicant income zero or low

library(plyr)
# full_data2 is full_data plus the combined household income.
full_data2 <- full_data
full_data2$TotalIncome <- full_data$ApplicantIncome + full_data$CoapplicantIncome
print(ggplot(data = full_data2, aes(TotalIncome, fill = Married)) +
  income_layers)
## Marital status is imputed from the coapplicant income: "No" when the
## coapplicant income is zero, "Yes" otherwise.
no_coapplicant_income <- full_data2$CoapplicantIncome == 0
married_unknown <- is.na(full_data2$Married)
full_data2$Married[married_unknown & no_coapplicant_income] <- "No"
# Whatever is still missing after the first pass becomes "Yes".
still_unknown <- is.na(full_data2$Married)
full_data2$Married[still_unknown] <- "Yes"
married_counts <- table(full_data2$Married)
prop.table(married_counts)
# Gender and Dependents
any(is.na(full_data2$Gender))
any(is.na(full_data2$Dependents))
# Rows where both Gender and Dependents are missing:
full_data2[is.na(full_data2$Gender) & is.na(full_data2$Dependents), ]
# This applicant is not married but has a higher income than the coapplicant,
# so Gender is imputed as "Male".
full_data2$Gender[is.na(full_data2$Gender) & is.na(full_data2$Dependents)] <- "Male"
print(ggplot(full_data2, aes(x = Dependents, fill = Gender)) +
  geom_bar() +
  facet_grid(. ~ Married))
# It looks safe to impute the number of dependents for unmarried males and
# females as the mode, 0. The mode for married applicants is also zero, but
# the other values are more significant than in the unmarried case. All the
# missing ones are male applicants, so rpart is used to predict the number of
# dependents for this population, using applicant income, coapplicant income,
# loan amount, loan term and property area as predictors.

# Unmarried applicants: impute the mode.
full_data2$Dependents[is.na(full_data2$Dependents) & full_data2$Married == "No"] <- "0"

# Married male applicants: predict Dependents with a classification tree.
# %in% is used instead of == so that rows with a missing Gender yield FALSE
# rather than NA. With ==, NA entries in the row index added all-NA rows to
# `subset`, which then ended up in `subtest` and made the predictions longer
# than (and misaligned with) the cells being filled in.
is_married_male <- full_data2$Gender %in% "Male" & full_data2$Married %in% "Yes"
# Column 3 is the response (Dependents); columns 6:9 and 11 are the
# predictors listed above.
subset <- full_data2[is_married_male, c(3, 6:9, 11)]
subtrain <- subset[!is.na(subset$Dependents), ]
subtest <- subset[is.na(subset$Dependents), ]
library(rpart)
library(rattle)
depFit <- rpart(data = subtrain, Dependents ~ ., xval = 3)
fancyRpartPlot(depFit)
# Accuracy on the rows the tree was fitted on (not a hold-out estimate).
p <- predict(depFit, subtrain, type = "class")
p
acc <- sum(p == subtrain[, 1]) / length(p)
acc
# Fill exactly the rows that make up `subtest`, in the same order.
needs_dependents <- is.na(full_data2$Dependents) & is_married_male
if (any(needs_dependents)) {
  full_data2$Dependents[needs_dependents] <-
    predict(depFit, newdata = subtest, type = "class")
}
# Remaining missing Gender values are predicted with a classification tree
# fitted on the first seven columns of the rows where Gender is known.
gender_known <- !is.na(full_data2$Gender)
gendertrain <- full_data2[gender_known, 1:7]
gendertest <- full_data2[!gender_known, 1:7]
genFit <- rpart(Gender ~ ., data = gendertrain, xval = 3)
fancyRpartPlot(genFit)
# Share of fitted rows whose Gender the tree reproduces (column 1 is Gender).
p <- predict(genFit, gendertrain, type = "class")
acc <- sum(p == gendertrain$Gender) / length(p)
acc
full_data2$Gender[!gender_known] <- predict(genFit, gendertest, type = "class")
# Self_Employed: missing values are replaced by the mode ("No"), chosen from
# the prop.table output.
# The missing-value mask is taken from full_data2, the data frame being
# modified (it was previously computed on full_data).
full_data2$Self_Employed[is.na(full_data2$Self_Employed)] <- "No"

# Credit history. This variable should be treated carefully: if the credit
# history is not available, the applicant has presumably not had many credit
# activities in the past, so these applicants are treated as a separate
# category. Missing values are recoded to a new level, 2.
library(car)
# Namespace-qualified so that a recode() from another attached package cannot
# be picked up by mistake.
full_data2$Credit_History <- car::recode(full_data2$Credit_History, "NA=2")
# LoanAmount
# Missing loan amounts are predicted with glm(). No `family` is given, so this
# is glm's default Gaussian fit, i.e. a linear regression, not a logistic one.
# The model is fitted on rows with a known LoanAmount below 500, using
# columns 1:8 and 10 of full_data2.
loan_cols <- c(1:8, 10)
loan_missing <- is.na(full_data2$LoanAmount)
ltrain <- full_data2[!loan_missing & full_data2$LoanAmount < 500, loan_cols]
ltest <- full_data2[loan_missing, loan_cols]
loanFit <- glm(LoanAmount ~ ., data = ltrain, na.action = na.exclude)
# Impute the missing amounts with the model predictions.
full_data2$LoanAmount[loan_missing] <- predict(loanFit, newdata = ltest)
# Loan_Amount_Term: turn it into a factor, fill missing values with "360",
# then merge level 350 into 360 and level 6 into 60.
full_data2$Loan_Amount_Term <- as.factor(full_data2$Loan_Amount_Term)
term_missing <- is.na(full_data2$Loan_Amount_Term)
full_data2$Loan_Amount_Term[term_missing] <- "360"
full_data2$Loan_Amount_Term <- recode(
  full_data2$Loan_Amount_Term,
  "'350'='360';'6'='60'"
)
# Feature engineering continues on a copy named alldata2.
alldata2 <- full_data2
####################################################################
# Feature engineering on the combined data (alldata2).

# Dependents as a number: the "3+" level is counted as 3.
numDependents <- car::recode(alldata2$Dependents, "'3+'='3' ")
numDependents <- as.numeric(as.character(numDependents))

# Family size: dependents + applicant, plus one more person when there is a
# coapplicant income or the applicant is married.
# Married is coded "Yes"/"No" in this data (see the imputation code), so the
# comparison must be against "Yes"; the former == "Y" never matched.
has_second_adult <- alldata2$CoapplicantIncome > 0 | alldata2$Married == "Yes"
alldata2$FamilySize <- ifelse(has_second_adult, numDependents + 2, numDependents + 1)

# Income and loan ratios.
alldata2$IncomePC <- alldata2$TotalIncome / alldata2$FamilySize
alldata2$LoanAmountByTotInc <- alldata2$LoanAmount / alldata2$TotalIncome
alldata2$LoanAmountPC <- alldata2$LoanAmount / alldata2$IncomePC

# The loan term is needed as a number for the per-month ratios.
alldata2$Loan_Amount_Term <- as.numeric(as.character(alldata2$Loan_Amount_Term))
alldata2$LoanPerMonth <- alldata2$LoanAmount / alldata2$Loan_Amount_Term
alldata2$LoanPerMOnthByTotInc <- alldata2$LoanPerMonth / alldata2$TotalIncome
# NOTE(review): this divides by LoanAmountPC, not IncomePC; by analogy with
# LoanAmountPC above, IncomePC may have been intended. Left unchanged; confirm.
alldata2$LoanPerMonthPC <- alldata2$LoanPerMonth / alldata2$LoanAmountPC
# Make the loan term a factor again.
alldata2$Loan_Amount_Term <- as.factor(alldata2$Loan_Amount_Term)

# Log transforms. Incomes below 2.72 are mapped to 0 instead of being logged,
# which also keeps log(0) out of the data.
alldata2$LogApplicantIncome <- ifelse(
  alldata2$ApplicantIncome < 2.72, 0, log(alldata2$ApplicantIncome)
)
# 20 equal-width bins of the log applicant income (not used further below).
logbins <- cut(alldata2$LogApplicantIncome, breaks = 20)
alldata2$LogCoapplicantIncome <- ifelse(
  alldata2$CoapplicantIncome < 2.72, 0, log(alldata2$CoapplicantIncome)
)
summary(alldata2$LoanAmount)
alldata2$LogLoanAmount <- log(alldata2$LoanAmount)
summary(alldata2$TotalIncome)
alldata2$LogTotalIncome <- log(alldata2$TotalIncome)
summary(alldata2$IncomePC)
# NOTE: IncomePC is replaced by its logarithm in place (no separate Log*
# column), after LoanAmountPC has been computed from the raw value.
alldata2$IncomePC <- log(alldata2$IncomePC)
summary(alldata2$LoanAmountByTotInc)
alldata2$LogLoanAmountPC <- log(1000 * alldata2$LoanAmountPC)
alldata2$LogLoanPerMOnth <- log(alldata2$LoanPerMonth)
alldata2$LogLoanPerMOnthPC <- log(alldata2$LoanPerMonthPC)
# Correlation screen: list pairs of numeric features whose correlation lies
# strictly between 0.8 and 1.
# Select columns whose class is exactly "numeric" (integer and factor columns
# are left out, as before). vapply() guarantees a logical vector.
nums <- vapply(
  alldata2,
  function(column) identical(class(column), "numeric"),
  logical(1)
)
numvars <- alldata2[, nums]
m <- cor(numvars)
# Flatten the correlation matrix (column-major) into a long table:
# v = correlation, id1 = row variable, id2 = column variable.
# The size is taken from the matrix itself instead of a hard-coded 17, so the
# table stays valid when the number of numeric columns changes.
n_vars <- nrow(m)
v <- as.vector(m)
id1 <- rep(rownames(m), times = n_vars)
id2 <- rep(rownames(m), each = n_vars)
d <- data.frame(v, id1, id2)
# which() drops NA correlations instead of turning them into all-NA rows.
d <- d[which(d$v > 0.8 & d$v < 1), ]
d
# NOTE(review): hand-picked rows of the table above; these positions depend on
# the data and on the column order. Confirm after any feature change.
d <- d[c(1:5, 8), ]
d
# Split the combined data back into its train and test parts. alldata2 was
# built by stacking train on top of test, so the row ranges are derived from
# the data-set sizes instead of being hard-coded (formerly 1:614 and 615:981).
n_train <- nrow(train)
n_test <- nrow(test)
stopifnot(nrow(alldata2) == n_train + n_test)
train_rows <- seq_len(n_train)
test_rows <- seq(n_train + 1, length.out = n_test)

newtrain <- cbind(Loan_Status = train$Loan_Status, alldata2[train_rows, ])
# Bogus (random) Loan_Status for the test set: the classification task needs
# a target column, its values are not used for prediction.
Loan_Status <- as.factor(sample(c("N", "Y"), replace = TRUE, size = n_test))
newtest <- cbind(Loan_Status, alldata2[test_rows, ])

# Create the mlr tasks.
trainTask <- makeClassifTask(data = newtrain, target = "Loan_Status")
testTask <- makeClassifTask(data = newtest, target = "Loan_Status")

# Normalize the variables.
# NOTE(review): each task is standardized on its own, so the test set is
# scaled independently of the training set; confirm that this is intended.
trainTask <- normalizeFeatures(trainTask, method = "standardize")
testTask <- normalizeFeatures(testTask, method = "standardize")
###################################################################
# Random forest: tune with random search, fit, predict, write a submission.

# `importance = TRUE` is passed together with the other hyperparameters.
# Assigning `rf$par.vals <- list(importance = TRUE)` afterwards replaced the
# whole list and silently dropped ntree and mtry.
rf <- makeLearner(
  "classif.randomForest",
  predict.type = "response",
  par.vals = list(ntree = 200, mtry = 3, importance = TRUE)
)

# Tunable parameters and their search ranges.
rf_param <- makeParamSet(
  makeIntegerParam("ntree", lower = 50, upper = 500),
  makeIntegerParam("mtry", lower = 2, upper = 10),
  makeIntegerParam("nodesize", lower = 10, upper = 50)
)

# Random search for 100 iterations.
rancontrol <- makeTuneControlRandom(maxit = 100L)

# 3-fold cross-validation.
set_cv <- makeResampleDesc("CV", iters = 3L)

# Hyperparameter tuning.
set.seed(11)
rf_tune <- tuneParams(
  learner = rf, resampling = set_cv, task = trainTask,
  par.set = rf_param, control = rancontrol
)

# Cross-validated performance of the best setting. No `measures` argument is
# passed, so this is mlr's default classification measure (mmce, the
# misclassification rate), not accuracy.
rf_tune$y
# The best hyperparameter values found.
rf_tune$x

# Use the tuned hyperparameters for modelling.
tunedrf <- setHyperPars(rf, par.vals = rf_tune$x)

# Train the model. Namespace-qualified because this script also has a data
# frame called `train`.
rforest <- mlr::train(tunedrf, trainTask)
getLearnerModel(rforest)

# Make predictions.
rfmodel <- predict(rforest, testTask)

# Submission file.
submit2 <- data.frame(Loan_ID = test$Loan_ID, Loan_Status = rfmodel$data$response)

# Agreement with the decision-tree submission. `submit1` is only created by
# the decision-tree section further down, so the comparison runs only when
# that object already exists (for example in an interactive session).
if (exists("submit1")) {
  submit <- cbind(
    as.character(submit1$Loan_Status),
    as.character(submit2$Loan_Status)
  )
  print(sum(submit[, 1] == submit[, 2]))
}

# row.names = FALSE keeps the file to the two submission columns.
# write.csv() returns NULL, so its result is not stored.
write.csv(submit2, file = "LP.csv", row.names = FALSE)
getwd()
################
# Decision tree (rpart): tune with random search, fit, predict, write a
# submission.
tree <- makeLearner("classif.rpart", predict.type = "response")

# 3-fold cross-validation.
set_cv <- makeResampleDesc("CV", iters = 3L)

# Hyperparameter search space.
treepars <- makeParamSet(
  makeIntegerParam("minsplit", lower = 10, upper = 50),
  makeIntegerParam("minbucket", lower = 5, upper = 50),
  makeNumericParam("cp", lower = 0.001, upper = 0.2)
)

# Try 100 different combinations of values.
tpcontrol <- makeTuneControlRandom(maxit = 100L)

# Tune the hyperparameters, optimizing accuracy.
# The measure is referenced as mlr::acc so that the numeric variable `acc`
# created during imputation cannot shadow it; this replaces the former
# `rm(acc)` workaround.
set.seed(11)
treetune <- tuneParams(
  learner = tree, resampling = set_cv, task = trainTask,
  par.set = treepars, control = tpcontrol, measures = mlr::acc
)
treetune

# Use the tuned hyperparameters for modelling.
tunedtree <- setHyperPars(tree, par.vals = treetune$x)

# Train the model. Namespace-qualified because this script also has a data
# frame called `train`.
treefit <- mlr::train(tunedtree, trainTask)
par(mfrow = c(1, 1))
fancyRpartPlot(getLearnerModel(treefit))

# Make predictions.
treepred <- predict(treefit, testTask)

# Create a submission file.
submit1 <- data.frame(Loan_ID = test$Loan_ID, Loan_Status = treepred$data$response)
write.csv(submit1, "sol1.csv", row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment