library(ggplot2)
library(lattice)
library(caret)
library(dplyr)
train <- read.csv(file="housing_prices/train.csv", header = TRUE, sep = ",")
test <- read.csv(file="housing_prices/test.csv", header = TRUE, sep = ",")
dim(train)
summary(train)
names(train)
dim(test)
# If you are interested in a variable use some of the analyses below to see its distribution, outliers, etc.
cor(train$SalePrice, train$Fireplaces)
summary(aov(SalePrice ~ CentralAir, data=train))
plot(log(train$YearBuilt), train$YearRemodAdd)
boxplot(train$LotFrontage)
histogram(train$YearBuilt)
summary(train$Neighborhood)
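# Sketch (not part of the original gist): several models below fit log(SalePrice),
# so it is worth comparing the raw and log-transformed response distributions first.
histogram(train$SalePrice)
histogram(log(train$SalePrice))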
# If you want to remove outliers, you can use dplyr's filter().
# The cutoffs below come from visual inspection, not from any statistical analysis.
trainNoOutlier <- filter(train,
                         SalePrice < 600000,
                         LotArea < 100000,
                         GrLivArea < 4000,
                         GarageCars < 3,
                         TotalBsmtSF < 3000)
dim(trainNoOutlier)
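# Sketch of a statistical alternative (not used in the original): a 1.5*IQR rule gives a
# data-driven SalePrice cutoff instead of the eyeballed thresholds above.
iqr_cutoff <- quantile(train$SalePrice, 0.75) + 1.5 * IQR(train$SalePrice)
iqr_cutoff
sum(train$SalePrice > iqr_cutoff)  # number of rows the rule would flag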
set.seed(314)
pp <- c("center", "scale")
ppo <- preProcess(train, "medianImpute")
train2 <- predict(ppo, train)
control <- trainControl(method="repeatedcv", number=6, repeats=10)
# glm has no tuning parameters, but the project requires a method you can tune.
# See the caret examples in the class notes for tuneGrid (a sketch follows below).
# Every method has its own tuning parameters; search online for how to tune your chosen method in caret.
glm1 <- train(log(SalePrice) ~ sqrt(TotalBsmtSF) + sqrt(X1stFlrSF) + sqrt(GarageYrBlt) + sqrt(GarageArea),
              data = train2,
              method = "glm",
              preProcess = pp,
              trControl = control)
glm1
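# Sketch (not from the original gist): glm has nothing to tune, but a penalized method such as
# glmnet does, and its grid is passed in through tuneGrid. The grid values here are illustrative
# only, and this chunk assumes the glmnet package is installed.
glmnetGrid <- expand.grid(alpha = c(0, 0.5, 1),
                          lambda = 10^seq(-3, 0, length.out = 10))
glmnet1 <- train(log(SalePrice) ~ sqrt(TotalBsmtSF) + sqrt(X1stFlrSF) + sqrt(GarageYrBlt) + sqrt(GarageArea),
                 data = train2,
                 method = "glmnet",
                 preProcess = pp,
                 trControl = control,
                 tuneGrid = glmnetGrid)
glmnet1$bestTune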
# Read up on plotting residuals against predicted (fitted) values;
# it gives some insight into how well your model is likely to perform.
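# Sketch (not in the original): residuals vs. fitted values for the caret glm fit above.
# glm1$finalModel is the underlying glm object, so the base extractors apply.
plot(fitted(glm1$finalModel), resid(glm1$finalModel),
     xlab = "Fitted log(SalePrice)", ylab = "Residual")
abline(h = 0, lty = 2)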
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
plot(test2)
# submission
prediction <- predict(glm1, test2)
# Don't submit the predictions as log(SalePrice): Kaggle's scoring already takes the log
# of what you submit, so logging twice will give a bad score.
predictionexp <- exp(prediction)
submission <- data.frame(Id = test2$Id, SalePrice = predictionexp)
write.csv(submission, file = "Mult_lm_plz_work.csv", row.names = FALSE)
library(rpart)
library(rpart.plot)
DTM1 <- rpart(SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea,
              data = train,
              method = "anova")
DTM1
plot(DTM1)
text(DTM1, cex=0.7) #cex is for scaling the text so it shows on the plot
summary(DTM1)
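# Sketch (not in the original): rpart.plot, loaded above but never called, draws a more readable tree.
rpart.plot(DTM1)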
plot(x=train$SalePrice, y= train$TotalBsmtSF)
plot(x=trainNoOutlier$SalePrice, y=trainNoOutlier$TotalBsmtSF)
first_lm <- lm(SalePrice ~ TotalBsmtSF, data = trainNoOutlier)
plot(SalePrice ~ TotalBsmtSF, data = trainNoOutlier)
abline(first_lm, col = "red")  # fitted line (intercept ~84279.55, slope ~79.31)
# SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea  (candidate formula for the models below)
set.seed(100)
train_numeric <- data.frame(trainNoOutlier$SalePrice, trainNoOutlier$TotalBsmtSF) #trainNoOutlier$X1stFlrSF
train_numeric
housing_km <- kmeans(train_numeric, centers=5, nstart=20)
housing_km
# moderately ok
plot(train_numeric, col=housing_km$cluster)
housing_km$centers
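# Sketch (not part of the original gist): an elbow plot of the total within-cluster sum of squares
# is one way to judge whether centers = 5 is a reasonable choice.
wss <- sapply(1:10, function(k) kmeans(train_numeric, centers = k, nstart = 20)$tot.withinss)
plot(1:10, wss, type = "b", xlab = "Number of clusters k", ylab = "Total within-cluster SS")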
##### multiple linear regression
# pre_processed_no_outlier <- preProcess(trainNoOutlier, "medianImpute")
# ^^ returned 0?!?!?
library(RANN)  # needed for caret's knnImpute option
# ppo <- preProcess(trainNoOutlier, c("knnImpute"))
ppo <- preProcess(trainNoOutlier, "medianImpute")
train2 <- predict(ppo, trainNoOutlier)
housing_lm <- lm(SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + YrSold + MoSold + LotArea + BedroomAbvGr, train2)
summary(housing_lm)
housing_lm
plot(housing_lm$fitted.values, housing_lm$residuals)
qqnorm(housing_lm$residuals, ylab = "Residual Quantiles")
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
housing_lm_predict <- predict(housing_lm, test2)
# housing_lm_exp <- exp(housing_lm_predict) #never took a log
housing_lm_predict
# submission
summary(housing_lm)
submission <- data.frame(Id = test$Id, SalePrice = housing_lm_predict)
write.csv(submission, file = "Mult_lm_plz_work4.csv", row.names = FALSE)
# random forest testing
library(hydroGOF)
train3 <- train
train3$GarageYrBlt[is.na(train$GarageYrBlt)] <- 0
train3$MasVnrArea[is.na(train$MasVnrArea)] <- 0
train3$LotFrontage[is.na(train$LotFrontage)] <- 0
# Interaction features based on correlation
train3$year_qual <- train$YearBuilt * train$OverallQual       # year built x overall quality
train3$year_r_qual <- train$YearRemodAdd * train$OverallQual  # remodel year x overall quality
train3$qual_bsmt <- train$OverallQual * train$TotalBsmtSF     # quality x basement size
train3$livarea_qual <- train$OverallQual * train$GrLivArea    # quality x living area
train3$qual_bath <- train$OverallQual * train$FullBath        # quality x baths
# There is no "exterior_cond" column; ExterCond is a factor, so coerce it to integer codes first
train3$qual_ext <- train$OverallQual * as.integer(train$ExterCond)  # quality x exterior condition
library(randomForest)
ppo <- preProcess(trainNoOutlier, "medianImpute")
train2 <- predict(ppo, trainNoOutlier)
model_1 <- randomForest(SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + YrSold + LotArea + BedroomAbvGr, data=train2)
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
prediction <- predict(model_1, test2)
model_output <- cbind(test2, prediction)
# Test with RMSE on the log scale. The Kaggle test set has no SalePrice column,
# so the check has to be done on the training data instead.
train_pred <- predict(model_1, train2)
rmse(sim = log(train_pred), obs = log(train2$SalePrice))
submission <- data.frame(Id = model_output$Id, SalePrice = model_output$prediction)
write.csv(submission, file = "Mult_lm_plz_work6.csv", row.names = FALSE)
# Some additional caret methods
# treebag (bagged trees)
model_2 <- train(SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + YrSold + LotArea + BedroomAbvGr,
                 data = train2,
                 method = "treebag",
                 preProcess = pp,
                 trControl = control)
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
plot(test2)
prediction <- predict(model_2, test2)
submission3 <- data.frame(Id = test2$Id, SalePrice = prediction)
write.csv(submission3, file = "Mult_lm_plz_work7.csv", row.names = FALSE)
# A different tree ensemble: blackboost (gradient boosting from the mboost package)
model_3 <- train(SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + LotArea,
                 data = train2,
                 method = "blackboost",
                 preProcess = pp,
                 trControl = control)
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
plot(test2)
prediction <- predict(model_3, test2)
# not working: the Kaggle test set has no SalePrice column, so test-set RMSE can't be computed locally
# RMSE <- sqrt(mean((test$SalePrice - prediction)^2))
submission3 <- data.frame(Id = test2$Id, SalePrice = prediction)
write.csv(submission3, file = "Mult_lm_plz_work8.csv", row.names = FALSE)
#using gbm
model_4 <- train(SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + YrSold,
                 data = train2,
                 method = "gbm",
                 preProcess = pp,
                 trControl = control)
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
plot(test2)
prediction <- predict(model_4, test2)
submission5 <- data.frame(Id = test2$Id, SalePrice = prediction)
write.csv(submission5, file = "Mult_lm_plz_work9.csv", row.names = FALSE)
### using log
ppo <- preProcess(trainNoOutlier, c("medianImpute"))
train2 <- predict(ppo, trainNoOutlier)
model_5 <- train(log(SalePrice) ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + YrSold + OverallQual,  # alternative: log(SalePrice) ~ OverallQual + YearBuilt + YearRemodAdd + TotalBsmtSF
                 data = train2,
                 method = "gbm",
                 preProcess = pp,
                 trControl = control)
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
plot(test2)
prediction <- predict(model_5, test2)
predictionexp <- exp(prediction)
submission5 <- data.frame(Id = test2$Id, SalePrice = predictionexp)
write.csv(submission5, file = "Mult_lm_plz_work14.csv", row.names = FALSE)
# using a tuning grid
ppo <- preProcess(trainNoOutlier, c("medianImpute"))
train2 <- predict(ppo, trainNoOutlier)
gbmGrid <- expand.grid(interaction.depth = c(1, 3, 5, 9, 15),
                       n.trees = 1:30,
                       shrinkage = 0.1,
                       n.minobsinnode = 10)
model_6 <- train(SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + YrSold + OverallQual,  # alternative: log(SalePrice) ~ OverallQual + YearBuilt + YearRemodAdd + TotalBsmtSF
                 data = train2,
                 method = "gbm",
                 preProcess = pp,
                 trControl = control,
                 tuneGrid = gbmGrid)
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
plot(test2)
prediction <- predict(model_6, test2)
#predictionexp <- exp(prediction)
submission5 <- data.frame(Id = test2$Id, SalePrice = prediction)
write.csv(submission5, file = "Mult_lm_plz_work17.csv", row.names = FALSE)