Skip to content

Instantly share code, notes, and snippets.

@ivopbernardo
Last active November 2, 2022 09:31
Show Gist options
  • Save ivopbernardo/7ffdc7a32ea98d47357b15985913ce7c to your computer and use it in GitHub Desktop.
Save ivopbernardo/7ffdc7a32ea98d47357b15985913ce7c to your computer and use it in GitHub Desktop.
# Training a Random Forest in R - used in blog post:
# https://towardsdatascience.com/data-science-tutorials-training-a-random-forest-in-r-a883cc1bacd1
library(dplyr)
library(randomForest)
library(ranger)
library(Metrics)
# Read the London bike-sharing CSV (path is relative to the working directory)
london_bike <- read.csv(file = './london_merged.csv')
# Using a function to split into train and test
#' Split a data frame into a training set and a test set.
#'
#' Each row is tagged with a temporary row identifier, a random fraction of
#' the rows is sampled as the training set, and every row that was not
#' sampled becomes the test set. The temporary identifier is removed from
#' both results before returning.
#'
#' @param data A data frame to split.
#' @param percentage Fraction of rows (a single number between 0 and 1)
#'   to place in the training set.
#' @param seed Value passed to `set.seed()` right before sampling so the
#'   split is reproducible. Defaults to 1234. Note that this resets the
#'   random number generator state of the session.
#' @return A list with two data frames: the training set first (also
#'   available as `train`) and the test set second (also available as
#'   `test`).
train_test_split <- function(data, percentage, seed = 1234) {
  stopifnot(
    is.data.frame(data),
    is.numeric(percentage),
    length(percentage) == 1,
    !is.na(percentage),
    percentage >= 0,
    percentage <= 1
  )

  # Use a dedicated helper column name so that a pre-existing `id` column
  # in `data` is neither overwritten nor dropped from the results.
  data_with_row_id <- data %>%
    mutate(.split_row_id = row_number())

  set.seed(seed)
  training_data <- data_with_row_id %>%
    sample_frac(percentage)

  # Everything that was not sampled for training goes to the test set.
  test_data <- anti_join(
    data_with_row_id,
    training_data,
    by = '.split_row_id'
  )

  training_data$.split_row_id <- NULL
  test_data$.split_row_id <- NULL

  # Named elements keep positional access ([[1]], [[2]]) working.
  list(train = training_data, test = test_data)
}
# Keeping 80% for the training set.
# The split is computed once and both halves are taken from the same result,
# instead of running the whole split a second time to fetch the test set.
split_data <- train_test_split(london_bike, 0.8)

# Subsetting only target and features.
# The column list is defined once so train and test cannot drift apart.
model_columns <- c('t1', 't2', 'hum',
                   'wind_speed', 'weather_code',
                   'is_holiday', 'is_weekend',
                   'season', 'cnt')
training_data <- split_data[[1]][, model_columns]
test_data <- split_data[[2]][, model_columns]
# Fitting Random Forest: 100 trees, cnt as target, all other columns as
# features
set.seed(1234)
rf <- randomForest(formula = cnt ~ .,
                   data = training_data,
                   ntree = 100)

# Timing the execution (this fit is only timed; its result is discarded)
system.time(
  randomForest(cnt ~ ., data = training_data,
               ntree = 100)
)

# Adding a new hyperparameter (nodesize).
# Re-seed first so this fit does not depend on the RNG state left behind
# by the timing run above.
set.seed(1234)
rf_2 <- randomForest(formula = cnt ~ .,
                     data = training_data,
                     ntree = 100,
                     nodesize = 10)

# Assessing performance: RMSE of the first model on the held-out test set
rmse(test_data$cnt, predict(rf, test_data))
# Implementation with Ranger: 100 trees, cnt as target, all other columns
# as features
set.seed(1234)
rf_ranger <- ranger(
  formula = cnt ~ .,
  data = training_data,
  num.trees = 100
)

# Let's check the execution time.
# The timed fit is not assigned, so it cannot overwrite the seeded
# rf_ranger model that is evaluated below (same pattern as the
# randomForest timing).
system.time(
  ranger(
    formula = cnt ~ .,
    data = training_data,
    num.trees = 100
  )
)

# Adding hyperparameters (min.node.size).
# Re-seed first so this fit does not depend on the RNG state left behind
# by the timing run above.
set.seed(1234)
rf_ranger_2 <- ranger(formula = cnt ~ .,
                      data = training_data,
                      num.trees = 100,
                      min.node.size = 10)

# Assessing performance: RMSE of the seeded ranger model on the test set
rmse(
  test_data$cnt,
  predict(rf_ranger, test_data)$predictions
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment