ianjohns/gradient_boosting_machine.R

## gradient_boosting_machine.R
#raw data
# https://www.fueleconomy.gov/feg/epadata/19data.zip

library(gbm)
library(caret)

r <- function(data) {round(data, 1)}

set.seed(123)
indices <- sample(nrow(cars_19), size = .75 * nrow(cars_19))
train <- cars_19[indices, ]
test <- cars_19[-indices, ]
names <- cars[-indices, c(3, 4)]

title <- "Gradient Boosted Model"

trees <- 1200
m_boosted_reg_untuned <- gbm(
  formula = fuel_economy_combined ~ .,
  data    = train,
  n.trees = trees,
  distribution = "gaussian"
)

pred_boosted_reg_untuned <- predict(m_boosted_reg_untuned,n.trees=trees, newdata = test)
mse_boosted_reg_untuned <- RMSE(pred = pred_boosted_reg_untuned, obs = test$fuel_economy_combined) ^2
boosted_stats_untuned<-postResample(pred_boosted_reg_untuned,test$fuel_economy_combined)

#create hyperparameter grid
hyper_grid <- expand.grid(
  shrinkage = seq(.07, .12, .01),
  interaction.depth = 1:7,
  optimal_trees = 0,
  min_RMSE = 0
)

#grid search
for(i in 1:nrow(hyper_grid)) {
  set.seed(123)
  gbm.tune <- gbm(
    formula = fuel_economy_combined ~ .,
    data = train_random,
    distribution = "gaussian",
    n.trees = 5000,
    interaction.depth = hyper_grid$interaction.depth[i],
    shrinkage = hyper_grid$shrinkage[i],
  )

  hyper_grid$optimal_trees[i] <- which.min(gbm.tune$train.error)
  hyper_grid$min_RMSE[i] <- sqrt(min(gbm.tune$train.error))

  cat(i,"\n")
}

m_boosted_reg <- gbm(
  formula = fuel_economy_combined ~ .,
  data    = train,
  n.trees = trees,
  distribution = "gaussian",
  shrinkage = .09,
  cv.folds = 5,
  interaction.depth = 5
)

best.iter <- gbm.perf(m_boosted_reg, method = "cv")

plot(1:trees,m_boosted_reg$train.error,type="l",ylab="mse",xlab = "number of trees", main = title)
which.min(m_boosted_reg$train.error)
which.min(m_boosted_reg$cv.error)

m_boosted_reg
summary(m_boosted_reg)

pred_boosted_reg <- predict(m_boosted_reg,n.trees=1183, newdata = test)
mse_boosted_reg <- RMSE(pred = pred_boosted_reg, obs = test$fuel_economy_combined) ^2
boosted_stats<-postResample(pred_boosted_reg,test$fuel_economy_combined)

tmp <- data.frame(names, test$fuel_economy_combined, r(pred_boosted_reg))
tmp <- tmp[order(tmp$Division, tmp$Carline),]
names(tmp)[3] <- "fuel_economy_combined"
names(tmp)[4] <- "pred_boosted_reg"

res <- r(tmp$fuel_economy_combined - tmp$pred_boosted_reg)
tmp[which(abs(res) > boosted_stats[1] * 3), ] #what cars are 3 se residuals

plot(res, ylab = "residuals", main = title)
plot(
  test$fuel_economy_combined,
  pred_boosted_reg_full,
  main = title,
  ylab = "predicted",
  xlab = "actual"
)
abline(lm(pred_boosted_reg_full ~ test$fuel_economy_combined))

par(mar=c(5,9,5,1))
summary(m_boosted_reg,cBars=11,method=relative.influence,las=2,xlim=c(0,50))
	#raw data
	# https://www.fueleconomy.gov/feg/epadata/19data.zip

	library(gbm)
	library(caret)

	r <- function(data) {round(data, 1)}

	set.seed(123)
	indices <- sample(nrow(cars_19), size = .75 * nrow(cars_19))
	train <- cars_19[indices, ]
	test <- cars_19[-indices, ]
	names <- cars[-indices, c(3, 4)]

	title <- "Gradient Boosted Model"

	trees <- 1200
	m_boosted_reg_untuned <- gbm(
	formula = fuel_economy_combined ~ .,
	data = train,
	n.trees = trees,
	distribution = "gaussian"
	)

	pred_boosted_reg_untuned <- predict(m_boosted_reg_untuned,n.trees=trees, newdata = test)
	mse_boosted_reg_untuned <- RMSE(pred = pred_boosted_reg_untuned, obs = test$fuel_economy_combined) ^2
	boosted_stats_untuned<-postResample(pred_boosted_reg_untuned,test$fuel_economy_combined)

	#create hyperparameter grid
	hyper_grid <- expand.grid(
	shrinkage = seq(.07, .12, .01),
	interaction.depth = 1:7,
	optimal_trees = 0,
	min_RMSE = 0
	)

	#grid search
	for(i in 1:nrow(hyper_grid)) {
	set.seed(123)
	gbm.tune <- gbm(
	formula = fuel_economy_combined ~ .,
	data = train_random,
	distribution = "gaussian",
	n.trees = 5000,
	interaction.depth = hyper_grid$interaction.depth[i],
	shrinkage = hyper_grid$shrinkage[i],
	)

	hyper_grid$optimal_trees[i] <- which.min(gbm.tune$train.error)
	hyper_grid$min_RMSE[i] <- sqrt(min(gbm.tune$train.error))

	cat(i,"\n")
	}

	m_boosted_reg <- gbm(
	formula = fuel_economy_combined ~ .,
	data = train,
	n.trees = trees,
	distribution = "gaussian",
	shrinkage = .09,
	cv.folds = 5,
	interaction.depth = 5
	)

	best.iter <- gbm.perf(m_boosted_reg, method = "cv")

	plot(1:trees,m_boosted_reg$train.error,type="l",ylab="mse",xlab = "number of trees", main = title)
	which.min(m_boosted_reg$train.error)
	which.min(m_boosted_reg$cv.error)

	m_boosted_reg
	summary(m_boosted_reg)

	pred_boosted_reg <- predict(m_boosted_reg,n.trees=1183, newdata = test)
	mse_boosted_reg <- RMSE(pred = pred_boosted_reg, obs = test$fuel_economy_combined) ^2
	boosted_stats<-postResample(pred_boosted_reg,test$fuel_economy_combined)

	tmp <- data.frame(names, test$fuel_economy_combined, r(pred_boosted_reg))
	tmp <- tmp[order(tmp$Division, tmp$Carline),]
	names(tmp)[3] <- "fuel_economy_combined"
	names(tmp)[4] <- "pred_boosted_reg"

	res <- r(tmp$fuel_economy_combined - tmp$pred_boosted_reg)
	tmp[which(abs(res) > boosted_stats[1] * 3), ] #what cars are 3 se residuals

	plot(res, ylab = "residuals", main = title)
	plot(
	test$fuel_economy_combined,
	pred_boosted_reg_full,
	main = title,
	ylab = "predicted",
	xlab = "actual"
	)
	abline(lm(pred_boosted_reg_full ~ test$fuel_economy_combined))

	par(mar=c(5,9,5,1))
	summary(m_boosted_reg,cBars=11,method=relative.influence,las=2,xlim=c(0,50))