Skip to content

Instantly share code, notes, and snippets.

@ianjohns
Last active June 18, 2019 18:53
#raw data
# https://www.fueleconomy.gov/feg/epadata/19data.zip
library(gbm)
library(caret)
r <- function(data) {round(data, 1)}
set.seed(123)
indices <- sample(nrow(cars_19), size = .75 * nrow(cars_19))
train <- cars_19[indices, ]
test <- cars_19[-indices, ]
names <- cars[-indices, c(3, 4)]
title <- "Gradient Boosted Model"
trees <- 1200
m_boosted_reg_untuned <- gbm(
formula = fuel_economy_combined ~ .,
data = train,
n.trees = trees,
distribution = "gaussian"
)
pred_boosted_reg_untuned <- predict(m_boosted_reg_untuned,n.trees=trees, newdata = test)
mse_boosted_reg_untuned <- RMSE(pred = pred_boosted_reg_untuned, obs = test$fuel_economy_combined) ^2
boosted_stats_untuned<-postResample(pred_boosted_reg_untuned,test$fuel_economy_combined)
#create hyperparameter grid
hyper_grid <- expand.grid(
shrinkage = seq(.07, .12, .01),
interaction.depth = 1:7,
optimal_trees = 0,
min_RMSE = 0
)
#grid search
for(i in 1:nrow(hyper_grid)) {
set.seed(123)
gbm.tune <- gbm(
formula = fuel_economy_combined ~ .,
data = train_random,
distribution = "gaussian",
n.trees = 5000,
interaction.depth = hyper_grid$interaction.depth[i],
shrinkage = hyper_grid$shrinkage[i],
)
hyper_grid$optimal_trees[i] <- which.min(gbm.tune$train.error)
hyper_grid$min_RMSE[i] <- sqrt(min(gbm.tune$train.error))
cat(i,"\n")
}
m_boosted_reg <- gbm(
formula = fuel_economy_combined ~ .,
data = train,
n.trees = trees,
distribution = "gaussian",
shrinkage = .09,
cv.folds = 5,
interaction.depth = 5
)
best.iter <- gbm.perf(m_boosted_reg, method = "cv")
plot(1:trees,m_boosted_reg$train.error,type="l",ylab="mse",xlab = "number of trees", main = title)
which.min(m_boosted_reg$train.error)
which.min(m_boosted_reg$cv.error)
m_boosted_reg
summary(m_boosted_reg)
pred_boosted_reg <- predict(m_boosted_reg,n.trees=1183, newdata = test)
mse_boosted_reg <- RMSE(pred = pred_boosted_reg, obs = test$fuel_economy_combined) ^2
boosted_stats<-postResample(pred_boosted_reg,test$fuel_economy_combined)
tmp <- data.frame(names, test$fuel_economy_combined, r(pred_boosted_reg))
tmp <- tmp[order(tmp$Division, tmp$Carline),]
names(tmp)[3] <- "fuel_economy_combined"
names(tmp)[4] <- "pred_boosted_reg"
res <- r(tmp$fuel_economy_combined - tmp$pred_boosted_reg)
tmp[which(abs(res) > boosted_stats[1] * 3), ] #what cars are 3 se residuals
plot(res, ylab = "residuals", main = title)
plot(
test$fuel_economy_combined,
pred_boosted_reg_full,
main = title,
ylab = "predicted",
xlab = "actual"
)
abline(lm(pred_boosted_reg_full ~ test$fuel_economy_combined))
par(mar=c(5,9,5,1))
summary(m_boosted_reg,cBars=11,method=relative.influence,las=2,xlim=c(0,50))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment