---
title: "Machine Learning Kaggle Project - Team Confidence Squared"
output: github_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
# clear workspace
rm(list = ls())
library(plyr) #load plyr before dplyr so dplyr's verbs (mutate, select) are not masked
library(dplyr)
library(data.table)
library(knitr)
library(ggplot2)
library(corrplot)
library(caret)
library(gridExtra)
library(scales)
library(Rmisc)
library(ggrepel)
library(randomForest)
library(psych)
library(xgboost)
library(Metrics)
```
## Part I: Preprocessing and EDA
#1.1 Load the datasets
```{r, echo=FALSE}
#stringsAsFactors = TRUE: the cleaning code below expects character columns to be factors
Train <- read.table("./train.csv", header = TRUE, sep = ',', stringsAsFactors = TRUE)
Test <- read.table("./test.csv", header = TRUE, sep = ',', stringsAsFactors = TRUE)
```
#1.2 combine train and test, check columns having missing values
```{r}
#Getting rid of the IDs but keeping the test IDs in a vector. These are needed to compose the submission file
test_labels <- Test$Id
Test$SalePrice <- NA
Train$Id <- NULL
Test$Id <- NULL
all <- rbind(Train, Test)
dim(all)
NAcol <- which(colSums(is.na(all)) > 0)
sort(colSums(sapply(all[NAcol], is.na)), decreasing = TRUE)
sapply(all[NAcol], class)
cat('There are', length(NAcol), 'columns with missing values')
```
# Missing MasVnrArea and MasVnrType should be 0 and None
```{r}
#houses with MasVnrArea missing also have MasVnrType missing
Train$MasVnrType[is.na(Train$MasVnrArea)]
Test$MasVnrType[is.na(Test$MasVnrArea)]
```
#1.3 Clean missing values
#missing GarageYrBlt should be the same as YearBuilt
#group1: a missing value means the feature is absent, so "None" is added as a level
#for the rest (group2):
#missing numeric values are set to 0
#missing factor values are set to the most frequent level
```{r}
#add "None" as a level for group1
addLevel <- function(x){
  if(is.factor(x)) return(factor(x, levels=c(levels(x), "None")))
  return(x)
}
#for the remaining columns: qualitative get the most frequent level, quantitative get 0
missFun1 <- function(x) {
  if (is.numeric(x)) {
    x[is.na(x)] <- 0
    x
  } else {
    x[is.na(x)] <- names(which.max(table(x)))
    x
  }
}
#summarized function (missFun): fill all missing values
missFun <- function(t) {
  #missing GarageYrBlt should equal YearBuilt
  t$GarageYrBlt[is.na(t$GarageYrBlt)] = t$YearBuilt[is.na(t$GarageYrBlt)]
  #if a house has a veneer area but the veneer type is NA,
  #it should have a MasVnrType, so assign the most popular type that is not "None"
  if(length(which(is.na(t$MasVnrType) & !is.na(t$MasVnrArea)))>0){
    t[is.na(t$MasVnrType) & !is.na(t$MasVnrArea),]$MasVnrType <- names(sort(-table(t$MasVnrType)))[2]
  }
  #for the group1 columns, a missing value means the feature is absent, so assign "None" as a category
  group1 <- t[,c("Alley","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond","PoolQC","Fence","MiscFeature")]
  group2 <- t[,!(colnames(t) %in% c("Alley","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond","PoolQC","Fence","MiscFeature"))]
  group1 <- as.data.frame(lapply(group1, addLevel))
  group1[is.na(group1)] <- "None"
  group2 <- data.table(group2)
  group2 <- group2[, lapply(.SD, missFun1)]
  #rebuild the data set without missing values
  newt = cbind(group1, group2)
  return(newt)
}
#Fill in missing data
Test$SalePrice <- NULL #remove the NA placeholder before imputing
nTrain = missFun(Train)
nTest = missFun(Test)
```
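As a quick sanity check (an added sketch, not part of the original pipeline), verify that the imputation left no missing values in either set:
```{r}
#added sanity check: imputation should leave no NAs
stopifnot(!anyNA(nTrain), !anyNA(nTest))
```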
#1.4 Transform existing variables
#check the different types of variables
```{r}
numericVars <- which(sapply(nTrain, is.numeric)) #index vector of numeric variables
numericVarNames <- names(numericVars) #saving the names for use later on
cat('There are', length(numericVars), 'numeric variables')
factorVars <- which(sapply(nTrain, is.factor)) #index vector of factor variables
factorVarNames <- names(factorVars) #saving the names for use later on
cat(' and there are', length(factorVars), 'factor variables in the Train dataset')
```
#Transform the levels of the quality variables into ordered integers
#Transform MSSubClass and the year/month variables into factors
```{r}
#transFun1 passes the remaining columns through unchanged, keeping integers as integers and factors as factors
transFun1 <- function(x) {
  if (is.integer(x)) {
    x <- as.integer(x)
    x
  }
  else if (is.factor(x)) {
    x <- as.factor(x)
    x
  }
  x
}
# Defining levels (from the data description)
Qual_lev <- c('None' = 0, 'Po' = 1, 'Fa' = 2, 'TA' = 3, 'Gd' = 4, 'Ex' = 5)
# for ExterQual, ExterCond, BsmtQual, BsmtCond, HeatingQC, KitchenQual, FireplaceQu, GarageQual, GarageCond, PoolQC
BsmtFin_lev <- c('None' = 0, 'Unf' = 1, 'LwQ' = 2, 'Rec' = 3, 'BLQ' = 4, 'ALQ' = 5, 'GLQ' = 6)
# for BsmtFinType1, BsmtFinType2
BsmtExp_lev <- c('None' = 0, 'No' = 1, 'Mn' = 2, 'Av' = 3, 'Gd' = 4)
# for BsmtExposure
Func_lev <- c('Sal' = 0, 'Sev' = 1, 'Maj2' = 2, 'Maj1' = 3, 'Mod' = 4, 'Min2' = 5, 'Min1' = 6, 'Typ' = 7)
# for Functional
transFun <- function(t) {
  col1 <- c('ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond', 'PoolQC')
  col2 <- c('BsmtFinType1','BsmtFinType2','BsmtExposure','MiscFeature','Alley','Fence', 'GarageType','GarageFinish', 'Functional')
  col3 <- c('MasVnrType','MSSubClass','YrSold','MoSold','YearBuilt','YearRemodAdd')
  colT <- append(col1, col2)
  colT <- append(colT, col3)
  group1 <- subset(t, select=col1)
  group2 <- subset(t, select=col2)
  group3 <- subset(t, select=col3)
  group4 <- t[,!(colnames(t) %in% colT)]
  #recode the quality factors into ordered integers
  #as.character first: on a factor, as.integer alone would return the level codes rather than the mapped values
  group1 <- as.data.frame(
    lapply(group1, function(x) {
      revalue(x, Qual_lev)
    }))
  group1 <- lapply(group1, function(x) as.integer(as.character(x)))
  group2$BsmtFinType1 <- as.integer(as.character(revalue(group2$BsmtFinType1, BsmtFin_lev)))
  group2$BsmtFinType2 <- as.integer(as.character(revalue(group2$BsmtFinType2, BsmtFin_lev)))
  group2$BsmtExposure <- as.integer(as.character(revalue(group2$BsmtExposure, BsmtExp_lev)))
  group2$MiscFeature <- as.factor(group2$MiscFeature)
  group2$Alley <- as.factor(group2$Alley)
  group2$Fence <- as.factor(group2$Fence)
  group2$GarageType <- as.factor(group2$GarageType)
  group2$GarageFinish <- as.factor(group2$GarageFinish)
  group2$Functional <- as.integer(as.character(revalue(group2$Functional, Func_lev)))
  group3 <- lapply(group3, as.factor)
  group4 <- lapply(group4, transFun1)
  newt <- cbind(group1, group2, group3, group4)
  return(newt)
}
#Transform both sets
nTrain = transFun(nTrain)
nTest = transFun(nTest)
```
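A quick spot-check (added here as a sketch) confirms the ordinal recoding: ExterQual should now be an integer with values in 0-5, with 'TA' mapped to 3.
```{r}
#added spot-check of the quality recoding
class(nTrain$ExterQual)
table(nTrain$ExterQual)
```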
#1.5 Feature Engineering
#Function featureFun
```{r}
featureFun <- function(t) {
  #calculate total bathrooms
  t <- t %>% mutate(
    TotalBath = FullBath + HalfBath*0.5 + BsmtFullBath + BsmtHalfBath*0.5)
  #whether the house is new or not
  #(YrSold and the year columns are factors here, so convert via as.character;
  # as.numeric alone would return the level codes, not the years)
  t <- t %>% mutate(
    IsNew = ifelse(as.numeric(as.character(t$YrSold))==as.numeric(as.character(t$YearBuilt)), 1, 0))
  t$IsNew <- as.factor(t$IsNew)
  #0=No Remodeling, 1=Remodeling
  t <- t %>% mutate(
    Remodel = ifelse(as.numeric(as.character(t$YearBuilt))==as.numeric(as.character(t$YearRemodAdd)), 0, 1))
  t$Remodel <- as.factor(t$Remodel)
  #total age of the house after being built/remodeled
  t <- t %>% mutate(
    Age = as.numeric(as.character(t$YrSold))-as.numeric(as.character(t$YearRemodAdd)))
  #total square feet
  t <- t %>% mutate(
    TotalSqFeet = GrLivArea + TotalBsmtSF)
  #consolidating the porch variables
  t <- t %>% mutate(
    TotalPorchSF = OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch)
  #neighbourhood binning was not explored
  return(t)
}
nTrain <- featureFun(nTrain)
nTest <- featureFun(nTest)
```
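A brief look at the engineered features (an added check); TotalSqFeet in particular should correlate strongly with the still-untransformed SalePrice.
```{r}
#added: summarize the engineered features and check one correlation
summary(nTrain[, c("TotalBath", "Age", "TotalSqFeet", "TotalPorchSF")])
cor(nTrain$TotalSqFeet, nTrain$SalePrice)
```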
#check the finalized variables
```{r}
#check the different types of variables in the training set
numericVars <- which(sapply(nTrain, is.numeric)) #index vector of numeric variables
numericVarNames <- names(numericVars) #saving the names for use later on
cat('There are', length(numericVars), 'numeric variables')
sapply(nTrain[numericVars], class)
factorVars <- which(sapply(nTrain, is.factor)) #index vector of factor variables
factorVarNames <- names(factorVars) #saving the names for use later on
cat(' and there are', length(factorVars), 'factor variables in the Train dataset')
sapply(nTrain[factorVars], class)
```
#1.6 Visualization of important variables
##Correlations check
Below we check the correlations of the numeric variables with SalePrice. After cleaning and feature engineering, 16 variables have an absolute correlation of at least 0.5 with SalePrice.
```{r, out.width="100%"}
all_numVar <- nTrain[numericVars]
cor_numVar <- cor(all_numVar, use="pairwise.complete.obs") #correlations of all numeric variables
#sort on decreasing correlations with SalePrice
cor_sorted <- as.matrix(sort(cor_numVar[,'SalePrice'], decreasing = TRUE))
#select only high correlations (>0.5 in absolute value)
CorHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x)>0.5)))
cor_numVar <- cor_numVar[CorHigh, CorHigh]
corrplot.mixed(cor_numVar, tl.col="black", tl.pos = "lt", tl.cex = 0.7, cl.cex = .7, number.cex = .7)
```
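To complement the correlation matrix, a scatter plot of GrLivArea against SalePrice (added as a sketch) makes the roughly linear trend easy to see and exposes any high-leverage outliers.
```{r}
#added: scatter plot of the strongest size predictor against SalePrice
ggplot(nTrain, aes(x = GrLivArea, y = SalePrice)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_continuous(labels = comma)
```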
##1.7 PreProcessing predictor variables
#Combine Test and Train, split numeric and factor variables; one-hot encode the factor variables
```{r}
nTest1 = nTest
nTest1$SalePrice <- NA
all = rbind(nTrain, nTest1)
DFnumeric <- all[, names(all) %in% numericVarNames]
DFfactors <- all[, !(names(all) %in% numericVarNames)]
DFfactors <- DFfactors[, names(DFfactors) != 'SalePrice']
cat('There are', length(DFnumeric), 'numeric variables, and', length(DFfactors), 'factor variables')
```
###Skewness and normalizing of the numeric predictors
#Log-transform a variable if it is skewed (|skew| > 0.8)
```{r}
for(i in 1:ncol(DFnumeric)){
  if (abs(skew(DFnumeric[,i])) > 0.8){
    DFnumeric[,i] <- log(DFnumeric[,i] + 1) #log(x+1) so zero values stay defined
  }
}
```
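For reference (an added check), the columns that crossed the |skew| > 0.8 threshold, and were therefore log-transformed, can be listed from the raw combined data:
```{r}
#added: list the columns that were log(x+1)-transformed above
skewed_cols <- names(which(sapply(all[, names(DFnumeric)], function(x) abs(skew(x)) > 0.8)))
skewed_cols
```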
#Normalizing the data
```{r}
PreNum <- preProcess(DFnumeric, method=c("center", "scale"))
print(PreNum)
```
```{r}
DFnorm <- predict(PreNum, DFnumeric)
dim(DFnorm)
```
##One hot encoding the categorical variables
#Use model.matrix() to expand the factor variables into dummy (one-hot encoded) columns.
```{r}
DFdummies <- as.data.frame(model.matrix(~.-1, DFfactors))
dim(DFdummies)
```
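One-hot encoding rare factor levels creates many near-constant columns. An optional refinement (sketched here, not applied) is to flag them with caret's nearZeroVar:
```{r}
#added: count near-zero-variance dummy columns (they could be dropped, but are kept here)
nzv <- nearZeroVar(DFdummies)
length(nzv)
```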
#combining all (now numeric) predictors into one dataframe
```{r}
combined <- cbind(DFnorm, DFdummies)
```
##Dealing with skewness of the response variable
```{r}
skew(all$SalePrice)
```
```{r}
qqnorm(all$SalePrice)
qqline(all$SalePrice)
```
The skew of 1.87 indicates a strong right skew, and the Q-Q plot shows that sale prices are not normally distributed. To fix this we take the log of SalePrice.
```{r}
all$SalePrice <- log(all$SalePrice) #there is no 0, so a plain log is safe
skew(all$SalePrice)
```
#the Q-Q plot looks much better now
```{r}
qqnorm(all$SalePrice)
qqline(all$SalePrice)
```
## 2 Modeling
```{r}
train1 <- combined[!is.na(all$SalePrice),]
train2 <- combined[!is.na(all$SalePrice),] #train2 keeps the (preprocessed) SalePrice column for the linear model below
test1 <- combined[is.na(all$SalePrice),]
train1$SalePrice = NULL
test1$SalePrice = NULL #drop it here too, so the predict() calls below only see predictors
```
#2.1 Multi-linear model
```{r}
library(olsrr)
model <- lm(SalePrice ~ OverallQual+GrLivArea+GarageCars+TotalBath+GarageArea+TotalBsmtSF+FullBath+TotRmsAbvGrd+GarageYrBlt+Age+KitchenQual+BsmtQual+ExterQual, data = train2)
#check time: best subset evaluates every subset of the 13 predictors, so this is slow
ptm <- proc.time()
k <- ols_step_best_subset(model)
proc.time() - ptm
plot(k)
```
```{r}
k
```
```{r}
k1 <- ols_step_forward_p(model) #RMSE is 0.170
```
##2.2 ridge regression model
```{r}
set.seed(2018)
my_control <- trainControl(method="cv", number=5)
ridgeGrid <- expand.grid(alpha = 0, lambda = seq(0.001, 0.1, by = 0.0005))
#check time
ptm <- proc.time()
ridge_mod <- train(x=train1, y=all$SalePrice[!is.na(all$SalePrice)], method='glmnet', trControl= my_control, tuneGrid=ridgeGrid)
proc.time() - ptm
ridge_mod$bestTune
min(ridge_mod$results$RMSE) #RMSE is 0.1382241
```
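caret's plot method (an added line) shows how the cross-validated RMSE varies over the lambda grid, which helps confirm the grid brackets the minimum.
```{r}
#added: RMSE as a function of lambda for the ridge model
plot(ridge_mod)
```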
```{r}
RidgePred <- predict(ridge_mod, test1)
pred_ridge <- exp(RidgePred) #undo the log transform of SalePrice
```
##2.3 lasso regression model
```{r}
set.seed(2018)
my_control <- trainControl(method="cv", number=5)
lassoGrid <- expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.0005))
#check time
ptm <- proc.time()
lasso_mod <- train(x=train1, y=all$SalePrice[!is.na(all$SalePrice)], method='glmnet', trControl= my_control, tuneGrid=lassoGrid)
proc.time() - ptm
lasso_mod$bestTune
min(lasso_mod$results$RMSE) #RMSE is 0.1290906
```
```{r}
lassoVarImp <- varImp(lasso_mod, scale=F)
lassoImportance <- lassoVarImp$importance
varsSelected <- length(which(lassoImportance$Overall!=0))
varsNotSelected <- length(which(lassoImportance$Overall==0))
cat('Lasso uses', varsSelected, 'variables in its model, and did not select', varsNotSelected, 'variables.')
```
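The importance object can also be plotted directly (an added line) to see the top predictors by absolute lasso coefficient:
```{r}
#added: top-20 lasso predictors by importance
plot(lassoVarImp, top = 20)
```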
```{r}
colnames(train1)[which(lassoImportance$Overall>0.05)]
lasso_numVar <- (train1)[which(lassoImportance$Overall>0.05)]
lasso_numVar$SalePrice = train2$SalePrice
lasso_numVar <- lasso_numVar %>%
  select(SalePrice, everything())
cor_lasso <- cor(lasso_numVar, use="pairwise.complete.obs") #correlations of the variables lasso weights most heavily
#sort on decreasing correlations with SalePrice
cor_lasso_sorted <- as.matrix(sort(cor_lasso[,'SalePrice'], decreasing = TRUE))
#select only high correlations
# CorHigh <- names(which(apply(cor_lasso_sorted, 1, function(x) abs(x)>0.5)))
# cor_numVar <- cor_numVar[CorHigh, CorHigh]
corrplot.mixed(cor_lasso, tl.col="black", tl.pos = "lt", tl.cex = 0.7, cl.cex = .7, number.cex = .7)
```
```{r}
LassoPred <- predict(lasso_mod, test1)
pred_lasso <- exp(LassoPred)
head(pred_lasso)
```
##2.4 ElasticNet regression model
```{r}
#Elastic net is a regularized regression method that linearly combines the L1 and L2 penalties of the lasso and ridge methods.
set.seed(2018)
elnetGrid <- expand.grid(alpha = 0.5, lambda = seq(0.001, 0.1, by = 0.0005))
#check time
ptm <- proc.time()
elnet_mod <- train(x=train1, y=all$SalePrice[!is.na(all$SalePrice)], method='glmnet', trControl= my_control, tuneGrid=elnetGrid)
proc.time() - ptm
elnet_mod$bestTune
min(elnet_mod$results$RMSE) #RMSE is 0.1290153
```
```{r}
elnetPred <- predict(elnet_mod, test1)
pred_elnet <- exp(elnetPred)
head(pred_elnet)
```
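Since the three glmnet models were tuned with the same seed and the same 5-fold trainControl, caret's resamples() can compare their cross-validated RMSEs side by side (an added sketch):
```{r}
#added: fold-by-fold comparison of the three regularized models
results <- resamples(list(ridge = ridge_mod, lasso = lasso_mod, elnet = elnet_mod))
summary(results)
```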
##2.5 Random Forest
#split the available training data into a training part and a held-out testing part
```{r}
x_train = train1
y_train = all$SalePrice[!is.na(all$SalePrice)]
train = cbind(x_train, y_train)
names(train) <- make.names(names(train)) #the one-hot column names contain spaces
set.seed(123)
partition <- createDataPartition(y=y_train,
                                 p=.5,
                                 list=F)
training <- train[partition,]
testing <- train[-partition,]
#check time
ptm <- proc.time()
model_1 <- randomForest(y_train ~ ., data=training)
proc.time() - ptm
# Predict using the held-out set
prediction <- predict(model_1, testing)
model_output <- cbind(testing, prediction)
#Evaluate with RMSE
rmse(model_output$y_train, model_output$prediction) #RMSE is 0.1561613
```
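The forest above uses randomForest's default mtry (one third of the predictors for regression). A hedged sketch of tuning it with randomForest::tuneRF follows; the ntreeTry, stepFactor, and improve values here are assumptions, not tuned choices.
```{r}
#added sketch: search for a better mtry; doBest = TRUE refits a forest at the best value found
set.seed(123)
model_tuned <- tuneRF(x = training[, names(training) != "y_train"],
                      y = training$y_train,
                      ntreeTry = 100, stepFactor = 1.5, improve = 0.01, doBest = TRUE)
```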
## 2.6 XGBoost model
```{r}
xgb_grid = expand.grid(
  nrounds = 1000,
  eta = c(0.1, 0.05, 0.01),
  max_depth = c(2, 3, 4, 5, 6),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = c(1, 2, 3, 4, 5),
  subsample = 1
)
```
#Use 5-fold cross validation to find the best hyperparameter values
```{r}
xgb_caret <- train(x=train1, y=all$SalePrice[!is.na(all$SalePrice)], method='xgbTree',
                   trControl= my_control, tuneGrid=xgb_grid)
xgb_caret$bestTune #takes 20+ minutes
```
#Across runs, the best tune was:
#max_depth = 3 or 4
#eta = 0.05 or 0.01
#gamma = 0
#colsample_bytree = 1
#min_child_weight = 4 or 3
#subsample = 1
#below we use max_depth = 4, eta = 0.01, min_child_weight = 3
```{r}
label_train <- all$SalePrice[!is.na(all$SalePrice)]
# put the training and test data into two separate DMatrix objects
dtrain <- xgb.DMatrix(data = as.matrix(train1), label = label_train)
dtest <- xgb.DMatrix(data = as.matrix(test1))
```
#apply the settings found earlier
```{r}
default_param <- list(
  objective = "reg:linear", #renamed to "reg:squarederror" in newer xgboost versions
  booster = "gbtree",
  eta = 0.01,
  gamma = 0,
  max_depth = 4,
  min_child_weight = 3,
  subsample = 1,
  colsample_bytree = 1
)
```
#cross validation
```{r}
xgbcv <- xgb.cv(params = default_param, data = dtrain, nrounds = 1001, nfold = 5, showsd = T, stratified = T, print_every_n = 40, early_stopping_rounds = 10, maximize = F)
```
#In this run the best RMSE is 0.128208 at round 961 (the exact round varies with fold assignment)
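Rather than hard-coding round 961 (the value observed in one run), the best iteration can be read off the cv object (an added line), which is more robust to run-to-run variation:
```{r}
#added: the best iteration and its cross-validated RMSE, straight from xgb.cv
xgbcv$best_iteration
xgbcv$evaluation_log$test_rmse_mean[xgbcv$best_iteration]
```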
```{r}
#train the model using the best iteration found by cross validation
#check time
ptm <- proc.time()
xgb_mod <- xgb.train(data = dtrain, params = default_param, nrounds = 961)
proc.time() - ptm
```
```{r}
XGBpred <- predict(xgb_mod, dtest)
pred_XGB <- exp(XGBpred)
head(pred_XGB)
```
#check out the important variables
```{r}
library(Ckmeans.1d.dp) #needed by xgb.ggplot.importance
mat <- xgb.importance(feature_names = colnames(train1), model = xgb_mod)
xgb.ggplot.importance(importance_matrix = mat[1:20], rel_to_first = TRUE)
```
# 3 Stacking
```{r}
#the stacked submission is a simple unweighted average of the XGBoost, lasso, and elastic-net predictions
sub_data <- data.frame(Id = test_labels, SalePrice = (pred_XGB+pred_lasso+pred_elnet)/3)
sub_data_XGB <- data.frame(Id = test_labels, SalePrice = pred_XGB)
sub_data_lasso <- data.frame(Id = test_labels, SalePrice = pred_lasso)
sub_data_ridge <- data.frame(Id = test_labels, SalePrice = pred_ridge)
sub_data_elnet <- data.frame(Id = test_labels, SalePrice = pred_elnet)
head(sub_data)
write.csv(sub_data, file = 'sub_data_2.csv', row.names = F)
write.csv(sub_data_lasso, file = 'sub_data_lasso.csv', row.names = F)
write.csv(sub_data_XGB, file = 'sub_data_xgb.csv', row.names = F)
write.csv(sub_data_ridge, file = 'sub_data_ridge.csv', row.names = F)
write.csv(sub_data_elnet, file = 'sub_data_elnet.csv', row.names = F)
```
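The stacked file above weights the three models equally. A hedged alternative (a sketch; inverse-RMSE weighting is an assumption, not something tuned on a holdout) is to weight each model by the inverse of its cross-validated RMSE:
```{r}
#added sketch: inverse-CV-RMSE weighted blend
w <- 1 / c(xgb = xgbcv$evaluation_log$test_rmse_mean[xgbcv$best_iteration],
           lasso = min(lasso_mod$results$RMSE),
           elnet = min(elnet_mod$results$RMSE))
w <- w / sum(w) #normalize the weights to sum to 1
blend <- w['xgb']*pred_XGB + w['lasso']*pred_lasso + w['elnet']*pred_elnet
sub_data_blend <- data.frame(Id = test_labels, SalePrice = blend)
write.csv(sub_data_blend, file = 'sub_data_blend.csv', row.names = F)
```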