House price prediction using XGBoost
# House price prediction
train = read.csv("train.csv", stringsAsFactors = F)
test = read.csv("test.csv", stringsAsFactors = F)
# Checking the levels of variables in the train and test datasets; they should be equal
# Checking character variables
charatr = sapply(train, is.character)
new.char.dataset = train[, charatr]
# Checking whether all character columns are suitable for conversion to factors
for(i in 1:ncol(new.char.dataset)){
  print(unique(new.char.dataset[, i]))
}
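# A compact alternative (a sketch, not part of the original script): count the
# distinct values per character column instead of printing them all, to spot
# factor candidates at a glance.
sapply(new.char.dataset, function(x) length(unique(x)))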
# Now back to the main train & test datasets
colSums(is.na(train))
colSums(is.na(test))
# Remove the target variable, which is not present in the test set
SalePrice = train$SalePrice
train$SalePrice = NULL
# Combine the data sets
full_data = rbind(train, test)
# Single loop that imputes missing values and converts characters to factors
for(col in 1:ncol(full_data)){
  if(class(full_data[, col]) == "character"){
    new_col = full_data[, col]
    new_col[is.na(new_col)] = "missing"
    full_data[col] = as.factor(new_col)
  }
}
# Now the factor levels of the train & test datasets will be the same
# Separate out our train and test sets
train = full_data[1:nrow(train), ]
train$SalePrice = SalePrice
test = full_data[(nrow(train) + 1):nrow(full_data), ]
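# Optional sanity check (a sketch, not in the original): since both sets were
# re-split from full_data, every factor column should carry identical levels
# in train and test.
factor_cols = names(Filter(is.factor, test))
stopifnot(all(sapply(factor_cols,
                     function(f) identical(levels(train[[f]]), levels(test[[f]])))))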
summary(train)
# Fill remaining NA values in numeric columns with -1
# We will be using a tree-based model in this example, so the scale of our numbers
# shouldn't affect the model, and assigning the NAs to -1 essentially lets -1 act
# as a numeric flag for missing values. If we were using a model that scales numeric
# variables by a learned parameter, like linear regression, we might want a different
# solution, such as imputing missing values, and we'd also want to consider centering,
# scaling and normalizing the numeric features so that they are on the same scale
# and have distributions that are roughly normal.
train[is.na(train)] = -1
test[is.na(test)] = -1
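# For the scale-sensitive alternative described above, a hedged sketch using
# caret's preProcess (loaded further down) might look like this; the column
# selection is illustrative, not part of the original workflow:
# num_cols = sapply(train, is.numeric)
# pp = preProcess(train[, num_cols], method = c("medianImpute", "center", "scale"))
# train[, num_cols] = predict(pp, train[, num_cols])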
# Checking correlation with SalePrice in the train data
# High correlation: |cor| > 0.5
for(col in colnames(train)){
  if(is.numeric(train[, col])){
    r = cor(train[, col], train$SalePrice)
    if(abs(r) > 0.5){
      print(col)
      print(abs(r))
    }
  }
}
# Low correlation: |cor| < 0.1
for(col in colnames(train)){
  if(is.numeric(train[, col])){
    r = cor(train[, col], train$SalePrice)
    if(abs(r) < 0.1){
      print(col)
      print(r)
    }
  }
}
# Checking multicollinearity between any TWO numerical variables
# Vectorized alternative (the hardcoded 38 matches the number of numeric columns
# in this correlation matrix):
#cors = cor(train[ , sapply(train, is.numeric)])
#high_cor = which(abs(cors) > 0.6 & (abs(cors) < 1))
#rows = rownames(cors)[((high_cor - 1) %/% 38) + 1]
#cols = colnames(cors)[ifelse(high_cor %% 38 == 0, 38, high_cor %% 38)]
#vals = cors[high_cor]
#cor_data = data.frame(cols = cols, rows = rows, correlation = vals)
#cor_data
# Multicollinearity
# The dependent variable is not needed here because its correlations were already checked
SalePrice = train$SalePrice
train$SalePrice = NULL
# Note: each pair prints twice, as (i, j) and (j, i); abs() catches strong
# negative as well as positive correlations
for(i in colnames(train)){
  if(is.numeric(train[, i])){
    for(j in colnames(train)){
      if(is.numeric(train[, j]) && i != j){
        r = cor(train[, i], train[, j])
        if(abs(r) > 0.6 & abs(r) < 1){
          print(c(i, j, r))
        }
      }
    }
  }
}
train$SalePrice = SalePrice # adding back the dependent variable
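# A shorter alternative (a sketch, not in the original): caret's findCorrelation()
# flags columns to drop given a pairwise correlation cutoff. Requires library(caret),
# loaded below; the column selection here is illustrative.
# num_cols = sapply(train, is.numeric) & colnames(train) != "SalePrice"
# findCorrelation(cor(train[, num_cols]), cutoff = 0.6, names = TRUE)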
# Plotting density plots to check normality
for(col in colnames(train)){
  if(is.numeric(train[, col])){
    plot(density(train[, col]), main = col)
  }
}
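# Since the competition metric used below (RMSLE) works on the log scale, it can
# also be worth eyeballing the density of log(SalePrice); this extra plot is an
# addition, not part of the original script.
plot(density(log(train$SalePrice)), main = "log(SalePrice)")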
######################
# Prediction
# Add a variable that combines above-grade living area with basement sq footage
train$total_sq_footage = train$GrLivArea + train$TotalBsmtSF
test$total_sq_footage = test$GrLivArea + test$TotalBsmtSF
# Add a variable that combines above-ground and basement full and half baths
train$total_baths = train$BsmtFullBath + train$FullBath + (0.5 * (train$BsmtHalfBath + train$HalfBath))
test$total_baths = test$BsmtFullBath + test$FullBath + (0.5 * (test$BsmtHalfBath + test$HalfBath))
# Remove Id since it should have no value in prediction
train$Id = NULL
test$Id = NULL
library(caret)
library(plyr)
library(xgboost)
library(Metrics)
# Next let's create the control object and tuning parameter grid we need to pass to
# our caret model. The target metric used to judge this competition is root mean
# squared logarithmic error, or RMSLE. caret optimizes root mean squared error for
# regression by default, so if we want to optimize for RMSLE we should pass in a
# custom summary function via our caret control object. The R package "Metrics" has
# a function for computing RMSLE, so we can use that to compute the performance
# metric inside our custom summary function.
# Create a custom summary function in the proper format for caret
custom_summary = function(data, lev = NULL, model = NULL){
  out = rmsle(data[, "obs"], data[, "pred"])
  names(out) = c("rmsle")
  out
}
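# Quick sanity check (a sketch with made-up numbers): Metrics::rmsle computes
# sqrt(mean((log(pred + 1) - log(obs + 1))^2)), so the two values below should match.
obs = c(100000, 200000); pred = c(110000, 190000)
rmsle(obs, pred)
sqrt(mean((log(pred + 1) - log(obs + 1))^2))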
# Create the control object
control = trainControl(method = "cv",       # Use cross-validation
                       number = 5,          # 5 folds
                       summaryFunction = custom_summary)
# Create a grid of tuning parameters
grid = expand.grid(nrounds = c(100, 200, 400, 800), # Test 4 values for boosting rounds
                   max_depth = c(4, 6),             # Test 2 values for tree depth
                   eta = c(0.1, 0.05, 0.025),       # Test 3 values for learning rate
                   gamma = c(0.1),
                   colsample_bytree = c(1),
                   min_child_weight = c(1),
                   subsample = 0.7)
set.seed(12)
xgb_tree_model = train(SalePrice ~ .,      # Predict SalePrice using all features
                       data = train,
                       method = "xgbTree",
                       trControl = control,
                       tuneGrid = grid,
                       metric = "rmsle",   # Use the custom performance metric
                       maximize = FALSE)   # Minimize the metric
xgb_tree_model$results
xgb_tree_model$bestTune
varImp(xgb_tree_model)
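# Optional (a sketch): pull out the best cross-validated RMSLE and plot the top
# features by importance; plot() on a caret varImp object is standard.
min(xgb_tree_model$results$rmsle, na.rm = TRUE)
plot(varImp(xgb_tree_model), top = 20)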
test_predictions = predict(xgb_tree_model, newdata = test)
test_predictions
submission = read.csv("sample_submission.csv")
submission$SalePrice = test_predictions
write.csv(submission, "home_prices_xgb_sub1.csv", row.names = FALSE)
getwd()