xgboostr.r
# Training an XGBoost model in R - used in blog post:
# https://towardsdatascience.com/data-science-tutorials-training-an-xgboost-using-r-cf3c00b1425
library(dplyr)
library(xgboost)
library(Metrics)
library(ggplot2)
# Load the London bike sharing CSV
london_bike <- read.csv('./london_merged.csv')
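# Optional quick sanity check of the loaded data (base R only);
# the column names used below follow the london_merged.csv schema of this script
str(london_bike)
summary(london_bike$cnt)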
# Using a function to split into train and test
train_test_split <- function(data, percentage) {
  data_with_row_id <- data %>%
    mutate(id = row_number())
  set.seed(1234)
  training_data <- data_with_row_id %>%
    sample_frac(percentage)
  test_data <- anti_join(
    data_with_row_id,
    training_data,
    by = 'id'
  )
  training_data$id <- NULL
  test_data$id <- NULL
  return(list(training_data, test_data))
}
# Keeping 80% for the training set
split_data <- train_test_split(london_bike, 0.8)
training_data <- split_data[[1]]
test_data <- split_data[[2]]
# Subsetting only the features
X_train <- training_data[, c('t1', 't2', 'hum',
                             'wind_speed', 'weather_code',
                             'is_holiday', 'is_weekend',
                             'season')]
X_test <- test_data[, c('t1', 't2', 'hum',
                        'wind_speed', 'weather_code',
                        'is_holiday', 'is_weekend',
                        'season')]
# Defining target variable
y_train <- training_data$cnt
y_test <- test_data$cnt
# Fitting XGBoost Model
set.seed(1234)
xgb <- xgboost(data = as.matrix(X_train),
               label = y_train,
               nrounds = 10)
# Accessing hyperparameters
?xgboost
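# Common hyperparameters such as eta (learning rate), max_depth and subsample
# can also be passed through the params list - a minimal sketch, the values
# and the xgb_tuned name are illustrative only
xgb_tuned <- xgboost(data = as.matrix(X_train),
                     label = y_train,
                     nrounds = 10,
                     params = list(eta = 0.1,
                                   max_depth = 6,
                                   subsample = 0.8))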
# Timing the execution
system.time(
  xgboost(data = as.matrix(X_train),
          label = y_train,
          nrounds = 10))
# Assessing performance
rmse(
  y_test,
  predict(xgb, as.matrix(X_test))
)
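# The Metrics package also ships mae(); a sketch of an alternative error
# metric on the same test predictions
mae(
  y_test,
  predict(xgb, as.matrix(X_test))
)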
# Adding a new hyperparameter
set.seed(1234)
xgb_ext <- xgboost(data = as.matrix(X_train),
                   label = y_train,
                   nrounds = 50,
                   max_depth = 20)
# Plotting RMSE throughout iterations
ggplot(
  data = xgb_ext$evaluation_log,
  aes(x = iter, y = train_rmse)
) +
  geom_line(color = 'darkred') +
  geom_point() +
  ylab('RMSE') +
  xlab('Iteration Number') +
  theme_light()
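# The fitted booster can also be inspected with xgboost's importance helpers -
# a minimal sketch using the feature names from X_train
importance_matrix <- xgb.importance(feature_names = colnames(X_train),
                                    model = xgb_ext)
xgb.plot.importance(importance_matrix)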
# Assessing execution time
system.time(xgboost(data = as.matrix(X_train),
                    label = y_train,
                    nrounds = 50,
                    max_depth = 20)
)
# Assessing performance of the extended model
rmse(
  y_test,
  predict(xgb_ext, as.matrix(X_test))
)
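# A deep model like this can overfit; one way to guard against it is early
# stopping on a held-out set via xgb.train() - a minimal sketch, the
# hyperparameter values are illustrative only
dtrain <- xgb.DMatrix(data = as.matrix(X_train), label = y_train)
dtest <- xgb.DMatrix(data = as.matrix(X_test), label = y_test)
set.seed(1234)
xgb_es <- xgb.train(params = list(max_depth = 20),
                    data = dtrain,
                    nrounds = 50,
                    watchlist = list(train = dtrain, test = dtest),
                    early_stopping_rounds = 5)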