# lightGBM: Light Gradient Boosting Machine
# @ianjohns, June 30, 2022
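# Assumes the cars_19 data frame (the author's EPA 2019 fuel-economy data,
# loaded from the .Rdata file below) is available: column 1 is the numeric
# response, and the column indices used throughout match that layout.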
library(lightgbm)
library(caret)
library(fastDummies)

#load("~/R_Cars_19/Data/cars_19.Rdata")
title <- "light gbm"

# one-hot encode the seven factor columns and keep only the generated dummies
tmp <- cars_19[, c(4, 6, 7, 9:12)]
tmp1 <- dummy_cols(tmp)
tmp1 <- tmp1[, 8:36]

# recombine the response and numeric predictors with the dummies, then
# convert to the numeric matrix lightgbm expects
d <- data.frame(cars_19[, c(1:3, 5, 8)], tmp1)
m <- as.matrix(d)
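# Optional sanity check (not in the original gist): d should yield 34 numeric
# columns (response + 33 features); as.matrix() silently coerces the whole
# matrix to character if a factor column slips through, which would break
# lgb.Dataset() below.
stopifnot(is.numeric(m), ncol(m) == 34)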
# 75/25 train/test split; column 1 is the response
set.seed(123)
indices <- sample(1:nrow(cars_19), size = 0.75 * nrow(cars_19))
train <- m[indices, ]
test <- m[-indices, ]
y_train <- train[, 1]
y_test <- test[, 1]

# wrap the feature matrices in lightgbm's native dataset format
train_lgb <- lgb.Dataset(train[, 2:34], label = y_train)
test_lgb <- lgb.Dataset.create.valid(train_lgb, test[, 2:34], label = y_test)
# base untuned lightgbm: package defaults for everything except the objective
light_gbn_base <- lgb.train(
  params = list(
    objective = "regression",
    metric = "l2"
  ),
  data = train_lgb
)
yhat_fit_base <- predict(light_gbn_base, train[, 2:34])
yhat_predict_base <- predict(light_gbn_base, test[, 2:34])
rmse_fit_base <- RMSE(y_train, yhat_fit_base)
rmse_predict_base <- RMSE(y_test, yhat_predict_base)
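# Baseline reference (not in the original gist): print the untuned RMSEs so
# the grid-search results below have something to beat.
cat("base lightgbm RMSE - train:", rmse_fit_base, "test:", rmse_predict_base, "\n")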
#################
# grid search
# create the hyperparameter grid; max_depth and early_stopping_rounds are
# derived from num_leaves and num_iterations respectively
num_leaves <- seq(20, 28, 1)
max_depth <- round(log(num_leaves) / log(2), 0)
num_iterations <- seq(200, 400, 50)
early_stopping_rounds <- round(num_iterations * .1, 0)

hyper_grid <- expand.grid(
  max_depth = max_depth,
  num_leaves = num_leaves,
  num_iterations = num_iterations,
  early_stopping_rounds = early_stopping_rounds,
  learning_rate = seq(.45, .50, .005)
)
hyper_grid <- unique(hyper_grid)
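# Worth checking before the loop: log2(20:28) rounds to only 4 or 5, so the
# derived max_depth collapses to two distinct values and unique() shrinks the
# grid considerably. Each remaining row still costs one model fit.
nrow(hyper_grid)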
rmse_fit <- NULL
rmse_predict <- NULL

# fit one model per grid row, recording train and test RMSE for each
for (j in 1:nrow(hyper_grid)) {
  set.seed(123)
  light_gbn_tuned <- lgb.train(
    params = list(
      objective = "regression",
      metric = "l2",
      max_depth = hyper_grid$max_depth[j],
      num_leaves = hyper_grid$num_leaves[j],
      num_iterations = hyper_grid$num_iterations[j],
      early_stopping_rounds = hyper_grid$early_stopping_rounds[j],
      learning_rate = hyper_grid$learning_rate[j]
      #feature_fraction = .9
    ),
    valids = list(test = test_lgb),
    data = train_lgb
  )
  yhat_fit_tuned <- predict(light_gbn_tuned, train[, 2:34])
  yhat_predict_tuned <- predict(light_gbn_tuned, test[, 2:34])
  rmse_fit[j] <- RMSE(y_train, yhat_fit_tuned)
  rmse_predict[j] <- RMSE(y_test, yhat_predict_tuned)
  cat(j, "\n")   # progress indicator
}
# best scores and the grid rows that produced them
min(rmse_fit)
min(rmse_predict)
hyper_grid[which.min(rmse_fit), ]
hyper_grid[which.min(rmse_predict), ]

# train-minus-test RMSE gap as a rough overfitting measure
rmse_diff <- rmse_fit - rmse_predict
rmse_models <- data.frame(rmse_fit, rmse_predict, rmse_diff)
rmse_models_sort <- rmse_models[order(rmse_diff), ]
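# Inspect the sorted table (not in the original gist): rmse_diff is train
# minus test RMSE, so the most negative values (largest overfit gap) sort
# first; scanning it alongside the which.min() results above helps pick the
# final parameters.
head(rmse_models_sort)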
# final model, refit with the hyperparameters selected from the grid search
set.seed(123)
light_gbn_final <- lgb.train(
  params = list(
    objective = "regression",
    metric = "l2",
    max_depth = 4,
    num_leaves = 23,
    num_iterations = 400,
    early_stopping_rounds = 40,
    learning_rate = .48
    #feature_fraction = .8
  ),
  valids = list(test = test_lgb),
  data = train_lgb
)
yhat_fit_final <- predict(light_gbn_final, train[, 2:34])
yhat_predict_final <- predict(light_gbn_final, test[, 2:34])
rmse_fit_final <- RMSE(y_train, yhat_fit_final)
rmse_predict_final <- RMSE(y_test, yhat_predict_final)

# actual vs. predicted on the test set, with a fitted line for reference
plot(y_test, yhat_predict_final, main = title, xlab = "actual", ylab = "predicted")
abline(lm(yhat_predict_final ~ y_test))

# feature importance from the final booster
lgb_imp <- lgb.importance(light_gbn_final)
lgb.plot.importance(lgb_imp)
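# The importance object is a data.table with Gain, Cover, and Frequency per
# feature; printing the first rows shows which predictors dominate.
head(lgb_imp)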
# residuals: share of test observations within 1 and 2 RMSE of their prediction
r <- y_test - yhat_predict_final
sum(abs(r) <= rmse_predict_final) / length(y_test)      #[1] 0.7547771
sum(abs(r) <= 2 * rmse_predict_final) / length(y_test)  #[1] 0.9522293
summary(r)
#      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
# -11.21159  -0.96398   0.06337  -0.02708   0.96796   5.77861
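# Optional (not in the original gist): persist the tuned booster for reuse;
# lgb.save() writes a text model file that lgb.load() can restore later.
# lgb.save(light_gbn_final, "light_gbn_final.txt")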