Skip to content

Instantly share code, notes, and snippets.

@AzharuddinKazi
Created February 10, 2017 16:14
Show Gist options
  • Save AzharuddinKazi/b9f87f9ef1b47dba04bf3430bfc9e607 to your computer and use it in GitHub Desktop.
Save AzharuddinKazi/b9f87f9ef1b47dba04bf3430bfc9e607 to your computer and use it in GitHub Desktop.
implementation of k nearest neighbour and then applying PCA for dimensionality reduction
#### Problem statement: given data about different customers, classify
#### prospective loan takers, i.e. separate loan takers from non-loan takers.
### Read the dataset named Universal Bank.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change how the header row is read.
data <- read.csv("E:\\UniversalBank.csv", header = TRUE)
# Drop the ID and ZIP.Code columns before modelling
# (presumably identifiers with no predictive value -- confirm).
data1 <- subset(data, select = -c(ID, ZIP.Code))
str(data1)
## Segregating the categorical and numeric variables
name <- c(
  "Education", "Securities.Account", "CD.Account",
  "Online", "CreditCard"
)
name1 <- c(
  "Age", "Experience", "Income", "Family",
  "CCAvg", "Mortgage", "Personal.Loan"
)
# Fetch only the categorical variables (they keep data1's column order) and
# coerce every value to character so each column is treated as categorical.
data_cat <- data1[colnames(data1) %in% name]
data_cat <- data.frame(apply(data_cat, 2, as.character))
# Convert the categorical variables to dummy variables to apply the model.
library(dummies)
data_dummy <- as.data.frame(apply(data_cat, 2, FUN = dummy))
### Rename the columns, because the names generated above are hard to read.
# The 11 names assume three Education levels and two levels for each of the
# other four variables, in the order given by `name`
# (NOTE(review): confirm against the CSV).
# Every name must be unique; the second Securities column is "Securities2".
names(data_dummy) <- c(
  "Edu1", "Edu2", "Edu3",
  "Securities1", "Securities2",
  "CDAccount1", "CDAccount2",
  "Online1", "Online2",
  "CreditCard1", "CreditCard2"
)
# Fetch the numerical variables (again in data1's column order).
data_num <- data1[colnames(data1) %in% name1]
# Combine everything into the final processed data.
# data_num also contains the target variable, so only its first six columns
# are placed first; its seventh column (the target) is appended last, giving
# 6 numeric + 11 dummy predictors followed by the target in column 18.
data_combined <- cbind(data_num[1:6], data_dummy, data_num[7])
# Divide the data into train and test sets (data without standardizing).
rows <- seq(from = 1, to = nrow(data_combined), by = 1)
# Fixed seed so the same rows are drawn on every run.
set.seed(1234)
# Randomly pick 70% of the row indices for training; the rest form the test
# set.
trainrows <- sample(x = rows, size = 0.7 * nrow(data_combined))
train <- data_combined[trainrows, ]
test <- data_combined[-trainrows, ]
# Remove the intermediate objects that are no longer needed.
rm(list = c("data_cat", "data_num", "data_dummy", "name", "name1"))
# KNN on un-standardized data
library(class)
# Classify each test row from its 3 nearest training rows, using the 17
# predictor columns; the class labels are the training Personal.Loan values.
pred <- knn(
  train = train[, 1:17],
  test = test[, 1:17],
  cl = train[["Personal.Loan"]],
  k = 3
)
# Cross-tabulate actual labels (rows) against predicted labels (columns).
a <- table(test[["Personal.Loan"]], pred)
a
## Performing PCA on the un-standardized data
# Fit the components on the 17 predictor columns only. Column 18 is the
# target (Personal.Loan); including it in the PCA would leak the label into
# the component scores that KNN is then trained and evaluated on.
pca <- princomp(train[1:17])
summary(pca)
# Project both partitions onto the components learned from the training rows.
pca_test <- as.data.frame(predict(pca, test[1:17]))
pca_train <- as.data.frame(predict(pca, train[1:17]))
# KNN (k = 3) on the first four component scores.
pred_pca <- knn(pca_train[1:4], pca_test[1:4], train$Personal.Loan, k = 3)
# Actual labels (rows) versus predicted labels (columns).
b <- table(test$Personal.Loan, pred_pca)
b
### Standardization and its impact
library(vegan)
# Standardize the 17 predictor columns of the full data set, then reuse the
# earlier train/test row split.
# NOTE(review): the standardization is computed before the split, so test
# rows contribute to it -- confirm this is intended.
data2 <- decostand(data_combined[, 1:17], method = "standardize")
train1 <- data2[trainrows, ]
test1 <- data2[-trainrows, ]
# KNN (k = 3) on the standardized predictors.
pred3 <- knn(
  train = train1,
  test = test1,
  cl = train[["Personal.Loan"]],
  k = 3
)
# NOTE(review): this variable shares its name with the base function c().
c <- table(test[["Personal.Loan"]], pred3)
c
# PCA fitted on the standardized training rows.
pca1 <- prcomp(train1)
summary(pca1)
# Project both partitions onto the components fitted on train1.
pca_test1 <- as.data.frame(predict(pca1, newdata = test1))
pca_train1 <- as.data.frame(predict(pca1, newdata = train1))
# KNN (k = 3) on the first ten component scores.
pred4 <- knn(
  train = pca_train1[, 1:10],
  test = pca_test1[, 1:10],
  cl = train[["Personal.Loan"]],
  k = 3
)
# Overwrites the earlier table stored in `a`.
a <- table(test[["Personal.Loan"]], pred4)
a
## data_combined is non-standardised, and train/test are the split of it.
# Here prcomp() does the centring and scaling itself, using the training rows
# only. The argument is spelled out in full as `scale. = TRUE`: the original
# `scale = T` relied on partial argument matching and on the reassignable
# variable `T`.
pca2 <- prcomp(train[1:17], scale. = TRUE)
# Project both partitions onto the components fitted on the training rows.
pca_test2 <- as.data.frame(predict(pca2, test[1:17]))
pca_train2 <- as.data.frame(predict(pca2, train[1:17]))
summary(pca2)
# KNN (k = 3) on the first ten component scores.
pred5 <- knn(pca_train2[1:10], pca_test2[1:10], train$Personal.Loan, k = 3)
# Actual labels (rows) versus predicted labels (columns).
d <- table(test$Personal.Loan, pred5)
d # print d
# Condense the standardized training set; column 18 of `train` supplies the
# class labels. The result is used below as the row index of the training
# rows to keep.
keep <- condense(train = train1, class = train[, 18])
keep
# KNN (k = 10) using only the retained training rows and their labels.
pred <- knn(
  train = train1[keep, , drop = FALSE],
  test = test1,
  cl = train[["Personal.Loan"]][keep],
  k = 10
)
# Here predictions are the rows and actual labels the columns, i.e. the
# transpose of the earlier tables in this script.
a <- table(pred, test[["Personal.Loan"]])
a # print a
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment