---
title: "Machine Learning Kaggle Project - Team Confidence Squared"
output: github_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
# clear workspace
rm(list = ls())
library(plyr) #load plyr before dplyr so dplyr's verbs (mutate, select) are not masked
library(dplyr)
library(data.table)
library(knitr)
library(ggplot2)
library(corrplot)
library(caret)
library(gridExtra)
library(scales)
library(Rmisc)
library(ggrepel)
library(randomForest)
library(psych)
library(xgboost)
library(Metrics)
```
## Part I: Preprocessing and EDA
#1.1 Load the datasets
```{r, echo=FALSE}
#stringsAsFactors = TRUE: the cleaning code below expects character columns to be factors
Train <- read.table("./train.csv", header = TRUE, sep = ',', stringsAsFactors = TRUE)
Test <- read.table("./test.csv", header = TRUE, sep = ',', stringsAsFactors = TRUE)
```
#1.2 combine train and test, check columns having missing values
```{r}
#Getting rid of the IDs but keeping the test IDs in a vector. These are needed to compose the submission file
test_labels <- Test$Id
Test$SalePrice <- NA
Train$Id <- NULL
Test$Id <- NULL
all <- rbind(Train, Test)
dim(all)
NAcol <- which(colSums(is.na(all)) > 0)
sort(colSums(sapply(all[NAcol], is.na)), decreasing = TRUE)
sapply(all[NAcol], class)
cat('There are', length(NAcol), 'columns with missing values')
```
# Missing MasVnrArea and MasVnrType should be 0 and None
```{r}
#houses with MasVnrArea missing also have MasVnrType missing
Train$MasVnrType[is.na(Train$MasVnrArea)]
Test$MasVnrType[is.na(Test$MasVnrArea)]
```
#1.3 Clean missing values
#missing GarageYrBlt should be the same as YearBuilt
#group1: a missing value means the feature is absent, so "None" is added as a level
#for the rest (group2):
#missing numeric values are set to 0
#missing factor values are set to the most frequent level
```{r}
#add "None" as a level for group1
addLevel <- function(x){
  if(is.factor(x)) return(factor(x, levels=c(levels(x), "None")))
  return(x)
}
#for the remaining columns: qualitative get the most frequent level, quantitative get 0
missFun1 <- function(x) {
  if (is.numeric(x)) {
    x[is.na(x)] <- 0
    x
  } else {
    x[is.na(x)] <- names(which.max(table(x)))
    x
  }
}
#summarized function (missFun): fill all missing values
missFun <- function(t) {
  #missing GarageYrBlt should equal YearBuilt
  t$GarageYrBlt[is.na(t$GarageYrBlt)] = t$YearBuilt[is.na(t$GarageYrBlt)]
  #if a house has a veneer area but the veneer type is NA,
  #it should have a MasVnrType, so assign the most popular type that is not "None"
  if(length(which(is.na(t$MasVnrType) & !is.na(t$MasVnrArea)))>0){
    t[is.na(t$MasVnrType) & !is.na(t$MasVnrArea),]$MasVnrType <- names(sort(-table(t$MasVnrType)))[2]
  }
  #for the group1 columns, a missing value means the feature is absent, so assign "None" as a category
  group1 <- t[,c("Alley","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond","PoolQC","Fence","MiscFeature")]
  group2 <- t[,!(colnames(t) %in% c("Alley","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond","PoolQC","Fence","MiscFeature"))]
  group1 <- as.data.frame(lapply(group1, addLevel))
  group1[is.na(group1)] <- "None"
  group2 <- data.table(group2)
  group2 <- group2[, lapply(.SD, missFun1)]
  #rebuild the data set without missing values
  newt = cbind(group1, group2)
  return(newt)
}
#Fill in missing data
Test$SalePrice <- NULL #remove the NA placeholder before imputing
nTrain = missFun(Train)
nTest = missFun(Test)
```
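As a quick sanity check (an added sketch, not part of the original pipeline), verify that the imputation left no missing values in either set:
```{r}
#added sanity check: imputation should leave no NAs
stopifnot(!anyNA(nTrain), !anyNA(nTest))
```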
#1.4 Transform existing variables
#check the different types of variables
```{r}
numericVars <- which(sapply(nTrain, is.numeric)) #index vector of numeric variables
numericVarNames <- names(numericVars) #saving the names for use later on
cat('There are', length(numericVars), 'numeric variables')
factorVars <- which(sapply(nTrain, is.factor)) #index vector of factor variables
factorVarNames <- names(factorVars) #saving the names for use later on
cat(' and there are', length(factorVars), 'factor variables in the Train dataset')
```
#Transform the levels of the quality variables into ordered integers
#Transform MSSubClass and the year/month variables into factors
```{r}
#transFun1 passes the remaining columns through unchanged, keeping integers as integers and factors as factors
transFun1 <- function(x) {
  if (is.integer(x)) {
    x <- as.integer(x)
    x
  }
  else if (is.factor(x)) {
    x <- as.factor(x)
    x
  }
  x
}
# Defining levels (from the data description)
Qual_lev <- c('None' = 0, 'Po' = 1, 'Fa' = 2, 'TA' = 3, 'Gd' = 4, 'Ex' = 5)
# for ExterQual, ExterCond, BsmtQual, BsmtCond, HeatingQC, KitchenQual, FireplaceQu, GarageQual, GarageCond, PoolQC
BsmtFin_lev <- c('None' = 0, 'Unf' = 1, 'LwQ' = 2, 'Rec' = 3, 'BLQ' = 4, 'ALQ' = 5, 'GLQ' = 6)
# for BsmtFinType1, BsmtFinType2
BsmtExp_lev <- c('None' = 0, 'No' = 1, 'Mn' = 2, 'Av' = 3, 'Gd' = 4)
# for BsmtExposure
Func_lev <- c('Sal' = 0, 'Sev' = 1, 'Maj2' = 2, 'Maj1' = 3, 'Mod' = 4, 'Min2' = 5, 'Min1' = 6, 'Typ' = 7)
# for Functional
transFun <- function(t) {
  col1 <- c('ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond', 'PoolQC')
  col2 <- c('BsmtFinType1','BsmtFinType2','BsmtExposure','MiscFeature','Alley','Fence', 'GarageType','GarageFinish', 'Functional')
  col3 <- c('MasVnrType','MSSubClass','YrSold','MoSold','YearBuilt','YearRemodAdd')
  colT <- append(col1, col2)
  colT <- append(colT, col3)
  group1 <- subset(t, select=col1)
  group2 <- subset(t, select=col2)
  group3 <- subset(t, select=col3)
  group4 <- t[,!(colnames(t) %in% colT)]
  #recode the quality factors into ordered integers
  #as.character first: on a factor, as.integer alone would return the level codes rather than the mapped values
  group1 <- as.data.frame(
    lapply(group1, function(x) {
      revalue(x, Qual_lev)
    }))
  group1 <- lapply(group1, function(x) as.integer(as.character(x)))
  group2$BsmtFinType1 <- as.integer(as.character(revalue(group2$BsmtFinType1, BsmtFin_lev)))
  group2$BsmtFinType2 <- as.integer(as.character(revalue(group2$BsmtFinType2, BsmtFin_lev)))
  group2$BsmtExposure <- as.integer(as.character(revalue(group2$BsmtExposure, BsmtExp_lev)))
  group2$MiscFeature <- as.factor(group2$MiscFeature)
  group2$Alley <- as.factor(group2$Alley)
  group2$Fence <- as.factor(group2$Fence)
  group2$GarageType <- as.factor(group2$GarageType)
  group2$GarageFinish <- as.factor(group2$GarageFinish)
  group2$Functional <- as.integer(as.character(revalue(group2$Functional, Func_lev)))
  group3 <- lapply(group3, as.factor)
  group4 <- lapply(group4, transFun1)
  newt <- cbind(group1, group2, group3, group4)
  return(newt)
}
#Transform both sets
nTrain = transFun(nTrain)
nTest = transFun(nTest)
```
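A quick spot-check (added here as a sketch) confirms the ordinal recoding: ExterQual should now be an integer with values in 0-5, with 'TA' mapped to 3.
```{r}
#added spot-check of the quality recoding
class(nTrain$ExterQual)
table(nTrain$ExterQual)
```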
#1.5 Feature Engineering
#Function featureFun
```{r}
featureFun <- function(t) {
  #calculate total bathrooms
  t <- t %>% mutate(
    TotalBath = FullBath + HalfBath*0.5 + BsmtFullBath + BsmtHalfBath*0.5)
  #whether the house is new or not
  #(YrSold and the year columns are factors here, so convert via as.character;
  # as.numeric alone would return the level codes, not the years)
  t <- t %>% mutate(
    IsNew = ifelse(as.numeric(as.character(t$YrSold))==as.numeric(as.character(t$YearBuilt)), 1, 0))
  t$IsNew <- as.factor(t$IsNew)
  #0=No Remodeling, 1=Remodeling
  t <- t %>% mutate(
    Remodel = ifelse(as.numeric(as.character(t$YearBuilt))==as.numeric(as.character(t$YearRemodAdd)), 0, 1))
  t$Remodel <- as.factor(t$Remodel)
  #total age of the house after being built/remodeled
  t <- t %>% mutate(
    Age = as.numeric(as.character(t$YrSold))-as.numeric(as.character(t$YearRemodAdd)))
  #total square feet
  t <- t %>% mutate(
    TotalSqFeet = GrLivArea + TotalBsmtSF)
  #consolidating the porch variables
  t <- t %>% mutate(
    TotalPorchSF = OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch)
  #neighbourhood binning was not explored
  return(t)
}
nTrain <- featureFun(nTrain)
nTest <- featureFun(nTest)
```
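A brief look at the engineered features (an added check); TotalSqFeet in particular should correlate strongly with the still-untransformed SalePrice.
```{r}
#added: summarize the engineered features and check one correlation
summary(nTrain[, c("TotalBath", "Age", "TotalSqFeet", "TotalPorchSF")])
cor(nTrain$TotalSqFeet, nTrain$SalePrice)
```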
#check the finalized variables
```{r}
#check the different types of variables in the training set
numericVars <- which(sapply(nTrain, is.numeric)) #index vector of numeric variables
numericVarNames <- names(numericVars) #saving the names for use later on
cat('There are', length(numericVars), 'numeric variables')
sapply(nTrain[numericVars], class)
factorVars <- which(sapply(nTrain, is.factor)) #index vector of factor variables
factorVarNames <- names(factorVars) #saving the names for use later on
cat(' and there are', length(factorVars), 'factor variables in the Train dataset')
sapply(nTrain[factorVars], class)
```
#1.6 Visualization of important variables
##Correlations check
Below we check the correlations of the numeric variables with SalePrice. After cleaning and feature engineering, 16 variables have an absolute correlation of at least 0.5 with SalePrice.
```{r, out.width="100%"}
all_numVar <- nTrain[numericVars]
cor_numVar <- cor(all_numVar, use="pairwise.complete.obs") #correlations of all numeric variables
#sort on decreasing correlations with SalePrice
cor_sorted <- as.matrix(sort(cor_numVar[,'SalePrice'], decreasing = TRUE))
#select only high correlations (>0.5 in absolute value)
CorHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x)>0.5)))
cor_numVar <- cor_numVar[CorHigh, CorHigh]
corrplot.mixed(cor_numVar, tl.col="black", tl.pos = "lt", tl.cex = 0.7, cl.cex = .7, number.cex = .7)
```
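To complement the correlation matrix, a scatter plot of GrLivArea against SalePrice (added as a sketch) makes the roughly linear trend easy to see and exposes any high-leverage outliers.
```{r}
#added: scatter plot of the strongest size predictor against SalePrice
ggplot(nTrain, aes(x = GrLivArea, y = SalePrice)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_continuous(labels = comma)
```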
##1.7 PreProcessing predictor variables
#Combine Test and Train, split numeric and factor variables; one-hot encode the factor variables
```{r}
nTest1 = nTest
nTest1$SalePrice <- NA
all = rbind(nTrain, nTest1)
DFnumeric <- all[, names(all) %in% numericVarNames]
DFfactors <- all[, !(names(all) %in% numericVarNames)]
DFfactors <- DFfactors[, names(DFfactors) != 'SalePrice']
cat('There are', length(DFnumeric), 'numeric variables, and', length(DFfactors), 'factor variables')
```
###Skewness and normalizing of the numeric predictors
#Log-transform a variable if it is skewed (|skew| > 0.8)
```{r}
for(i in 1:ncol(DFnumeric)){
  if (abs(skew(DFnumeric[,i])) > 0.8){
    DFnumeric[,i] <- log(DFnumeric[,i] + 1) #log(x+1) so zero values stay defined
  }
}
```
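For reference (an added check), the columns that crossed the |skew| > 0.8 threshold, and were therefore log-transformed, can be listed from the raw combined data:
```{r}
#added: list the columns that were log(x+1)-transformed above
skewed_cols <- names(which(sapply(all[, names(DFnumeric)], function(x) abs(skew(x)) > 0.8)))
skewed_cols
```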
#Normalizing the data
```{r}
PreNum <- preProcess(DFnumeric, method=c("center", "scale"))
print(PreNum)
```
```{r}
DFnorm <- predict(PreNum, DFnumeric)
dim(DFnorm)
```
##One hot encoding the categorical variables
#Use model.matrix() to expand the factor variables into dummy (one-hot encoded) columns.
```{r}
DFdummies <- as.data.frame(model.matrix(~.-1, DFfactors))
dim(DFdummies)
```
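One-hot encoding rare factor levels creates many near-constant columns. An optional refinement (sketched here, not applied) is to flag them with caret's nearZeroVar:
```{r}
#added: count near-zero-variance dummy columns (they could be dropped, but are kept here)
nzv <- nearZeroVar(DFdummies)
length(nzv)
```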
#combining all (now numeric) predictors into one dataframe
```{r}
combined <- cbind(DFnorm, DFdummies)
```
##Dealing with skewness of the response variable
```{r}
skew(all$SalePrice)
```
```{r}
qqnorm(all$SalePrice)
qqline(all$SalePrice)
```
The skew of 1.87 indicates a strong right skew, and the Q-Q plot shows that sale prices are not normally distributed. To fix this we take the log of SalePrice.
```{r}
all$SalePrice <- log(all$SalePrice) #there is no 0, so a plain log is safe
skew(all$SalePrice)
```
#the Q-Q plot looks much better now
```{r}
qqnorm(all$SalePrice)
qqline(all$SalePrice)
```
## 2 Modeling
```{r}
train1 <- combined[!is.na(all$SalePrice),]
train2 <- combined[!is.na(all$SalePrice),] #train2 keeps the (preprocessed) SalePrice column for the linear model below
test1 <- combined[is.na(all$SalePrice),]
train1$SalePrice = NULL
test1$SalePrice = NULL #drop it here too, so the predict() calls below only see predictors
```
#2.1 Multi-linear model
```{r}
library(olsrr)
model <- lm(SalePrice ~ OverallQual+GrLivArea+GarageCars+TotalBath+GarageArea+TotalBsmtSF+FullBath+TotRmsAbvGrd+GarageYrBlt+Age+KitchenQual+BsmtQual+ExterQual, data = train2)
#check time: best subset evaluates every subset of the 13 predictors, so this is slow
ptm <- proc.time()
k <- ols_step_best_subset(model)
proc.time() - ptm
plot(k)
```
```{r}
k
```
```{r}
k1 <- ols_step_forward_p(model) #RMSE is 0.170
```
##2.2 ridge regression model
```{r}
set.seed(2018)
my_control <- trainControl(method="cv", number=5)
ridgeGrid <- expand.grid(alpha = 0, lambda = seq(0.001, 0.1, by = 0.0005))
#check time
ptm <- proc.time()
ridge_mod <- train(x=train1, y=all$SalePrice[!is.na(all$SalePrice)], method='glmnet', trControl= my_control, tuneGrid=ridgeGrid)
proc.time() - ptm
ridge_mod$bestTune
min(ridge_mod$results$RMSE) #RMSE is 0.1382241
```
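caret's plot method (an added line) shows how the cross-validated RMSE varies over the lambda grid, which helps confirm the grid brackets the minimum.
```{r}
#added: RMSE as a function of lambda for the ridge model
plot(ridge_mod)
```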
```{r}
RidgePred <- predict(ridge_mod, test1)
pred_ridge <- exp(RidgePred) #undo the log transform of SalePrice
```
##2.3 lasso regression model
```{r}
set.seed(2018)
my_control <- trainControl(method="cv", number=5)
lassoGrid <- expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.0005))
#check time
ptm <- proc.time()
lasso_mod <- train(x=train1, y=all$SalePrice[!is.na(all$SalePrice)], method='glmnet', trControl= my_control, tuneGrid=lassoGrid)
proc.time() - ptm
lasso_mod$bestTune
min(lasso_mod$results$RMSE) #RMSE is 0.1290906
```
```{r}
lassoVarImp <- varImp(lasso_mod, scale=F)
lassoImportance <- lassoVarImp$importance
varsSelected <- length(which(lassoImportance$Overall!=0))
varsNotSelected <- length(which(lassoImportance$Overall==0))
cat('Lasso uses', varsSelected, 'variables in its model, and did not select', varsNotSelected, 'variables.')
```
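The importance object can also be plotted directly (an added line) to see the top predictors by absolute lasso coefficient:
```{r}
#added: top-20 lasso predictors by importance
plot(lassoVarImp, top = 20)
```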
```{r}
colnames(train1)[which(lassoImportance$Overall>0.05)]
lasso_numVar <- (train1)[which(lassoImportance$Overall>0.05)]
lasso_numVar$SalePrice = train2$SalePrice
lasso_numVar <- lasso_numVar %>%
  select(SalePrice, everything())
cor_lasso <- cor(lasso_numVar, use="pairwise.complete.obs") #correlations of the variables lasso weights most heavily
#sort on decreasing correlations with SalePrice
cor_lasso_sorted <- as.matrix(sort(cor_lasso[,'SalePrice'], decreasing = TRUE))
#select only high correlations
# CorHigh <- names(which(apply(cor_lasso_sorted, 1, function(x) abs(x)>0.5)))
# cor_numVar <- cor_numVar[CorHigh, CorHigh]
corrplot.mixed(cor_lasso, tl.col="black", tl.pos = "lt", tl.cex = 0.7, cl.cex = .7, number.cex = .7)
```
```{r}
LassoPred <- predict(lasso_mod, test1)
pred_lasso <- exp(LassoPred)
head(pred_lasso)
```
##2.4 ElasticNet regression model
```{r}
#Elastic net is a regularized regression method that linearly combines the L1 and L2 penalties of the lasso and ridge methods.
set.seed(2018)
elnetGrid <- expand.grid(alpha = 0.5, lambda = seq(0.001, 0.1, by = 0.0005))
#check time
ptm <- proc.time()
elnet_mod <- train(x=train1, y=all$SalePrice[!is.na(all$SalePrice)], method='glmnet', trControl= my_control, tuneGrid=elnetGrid)
proc.time() - ptm
elnet_mod$bestTune
min(elnet_mod$results$RMSE) #RMSE is 0.1290153
```
```{r}
elnetPred <- predict(elnet_mod, test1)
pred_elnet <- exp(elnetPred)
head(pred_elnet)
```
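Since the three glmnet models were tuned with the same seed and the same 5-fold trainControl, caret's resamples() can compare their cross-validated RMSEs side by side (an added sketch):
```{r}
#added: fold-by-fold comparison of the three regularized models
results <- resamples(list(ridge = ridge_mod, lasso = lasso_mod, elnet = elnet_mod))
summary(results)
```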
##2.5 Random Forest
#split the available training data into a training part and a held-out testing part
```{r}
x_train = train1
y_train = all$SalePrice[!is.na(all$SalePrice)]
train = cbind(x_train, y_train)
names(train) <- make.names(names(train)) #the one-hot column names contain spaces
set.seed(123)
partition <- createDataPartition(y=y_train,
                                 p=.5,
                                 list=F)
training <- train[partition,]
testing <- train[-partition,]
#check time
ptm <- proc.time()
model_1 <- randomForest(y_train ~ ., data=training)
proc.time() - ptm
# Predict using the held-out set
prediction <- predict(model_1, testing)
model_output <- cbind(testing, prediction)
#Evaluate with RMSE
rmse(model_output$y_train, model_output$prediction) #RMSE is 0.1561613
```
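The forest above uses randomForest's default mtry (one third of the predictors for regression). A hedged sketch of tuning it with randomForest::tuneRF follows; the ntreeTry, stepFactor, and improve values here are assumptions, not tuned choices.
```{r}
#added sketch: search for a better mtry; doBest = TRUE refits a forest at the best value found
set.seed(123)
model_tuned <- tuneRF(x = training[, names(training) != "y_train"],
                      y = training$y_train,
                      ntreeTry = 100, stepFactor = 1.5, improve = 0.01, doBest = TRUE)
```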
## 2.6 XGBoost model
```{r}
xgb_grid = expand.grid(
  nrounds = 1000,
  eta = c(0.1, 0.05, 0.01),
  max_depth = c(2, 3, 4, 5, 6),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = c(1, 2, 3, 4, 5),
  subsample = 1
)
```
#Use 5-fold cross validation to find the best hyperparameter values
```{r}
xgb_caret <- train(x=train1, y=all$SalePrice[!is.na(all$SalePrice)], method='xgbTree',
                   trControl= my_control, tuneGrid=xgb_grid)
xgb_caret$bestTune #takes 20+ minutes
```
#Across runs, the best tune was:
#max_depth = 3 or 4
#eta = 0.05 or 0.01
#gamma = 0
#colsample_bytree = 1
#min_child_weight = 4 or 3
#subsample = 1
#below we use max_depth = 4, eta = 0.01, min_child_weight = 3
```{r}
label_train <- all$SalePrice[!is.na(all$SalePrice)]
# put the training and test data into two separate DMatrix objects
dtrain <- xgb.DMatrix(data = as.matrix(train1), label = label_train)
dtest <- xgb.DMatrix(data = as.matrix(test1))
```
#apply the settings found earlier
```{r}
default_param <- list(
  objective = "reg:linear", #renamed to "reg:squarederror" in newer xgboost versions
  booster = "gbtree",
  eta = 0.01,
  gamma = 0,
  max_depth = 4,
  min_child_weight = 3,
  subsample = 1,
  colsample_bytree = 1
)
```
#cross validation
```{r}
xgbcv <- xgb.cv(params = default_param, data = dtrain, nrounds = 1001, nfold = 5, showsd = T, stratified = T, print_every_n = 40, early_stopping_rounds = 10, maximize = F)
```
#In this run the best RMSE is 0.128208 at round 961 (the exact round varies with fold assignment)
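Rather than hard-coding round 961 (the value observed in one run), the best iteration can be read off the cv object (an added line), which is more robust to run-to-run variation:
```{r}
#added: the best iteration and its cross-validated RMSE, straight from xgb.cv
xgbcv$best_iteration
xgbcv$evaluation_log$test_rmse_mean[xgbcv$best_iteration]
```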
```{r}
#train the model using the best iteration found by cross validation
#check time
ptm <- proc.time()
xgb_mod <- xgb.train(data = dtrain, params = default_param, nrounds = 961)
proc.time() - ptm
```
```{r}
XGBpred <- predict(xgb_mod, dtest)
pred_XGB <- exp(XGBpred)
head(pred_XGB)
```
#check out the important variables
```{r}
library(Ckmeans.1d.dp) #needed by xgb.ggplot.importance
mat <- xgb.importance(feature_names = colnames(train1), model = xgb_mod)
xgb.ggplot.importance(importance_matrix = mat[1:20], rel_to_first = TRUE)
```
# 3 Stacking
```{r}
#the stacked submission is a simple unweighted average of the XGBoost, lasso, and elastic-net predictions
sub_data <- data.frame(Id = test_labels, SalePrice = (pred_XGB+pred_lasso+pred_elnet)/3)
sub_data_XGB <- data.frame(Id = test_labels, SalePrice = pred_XGB)
sub_data_lasso <- data.frame(Id = test_labels, SalePrice = pred_lasso)
sub_data_ridge <- data.frame(Id = test_labels, SalePrice = pred_ridge)
sub_data_elnet <- data.frame(Id = test_labels, SalePrice = pred_elnet)
head(sub_data)
write.csv(sub_data, file = 'sub_data_2.csv', row.names = F)
write.csv(sub_data_lasso, file = 'sub_data_lasso.csv', row.names = F)
write.csv(sub_data_XGB, file = 'sub_data_xgb.csv', row.names = F)
write.csv(sub_data_ridge, file = 'sub_data_ridge.csv', row.names = F)
write.csv(sub_data_elnet, file = 'sub_data_elnet.csv', row.names = F)
```
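The stacked file above weights the three models equally. A hedged alternative (a sketch; inverse-RMSE weighting is an assumption, not something tuned on a holdout) is to weight each model by the inverse of its cross-validated RMSE:
```{r}
#added sketch: inverse-CV-RMSE weighted blend
w <- 1 / c(xgb = xgbcv$evaluation_log$test_rmse_mean[xgbcv$best_iteration],
           lasso = min(lasso_mod$results$RMSE),
           elnet = min(elnet_mod$results$RMSE))
w <- w / sum(w) #normalize the weights to sum to 1
blend <- w['xgb']*pred_XGB + w['lasso']*pred_lasso + w['elnet']*pred_elnet
sub_data_blend <- data.frame(Id = test_labels, SalePrice = blend)
write.csv(sub_data_blend, file = 'sub_data_blend.csv', row.names = F)
```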