Skip to content

Instantly share code, notes, and snippets.

@ivopbernardo
Last active November 2, 2022 09:30
Show Gist options
  • Save ivopbernardo/2f3a860222df3b43d551fe2a969f99b6 to your computer and use it in GitHub Desktop.
Save ivopbernardo/2f3a860222df3b43d551fe2a969f99b6 to your computer and use it in GitHub Desktop.
Data Science Tutorials Blog Post Series: Training a Decision Tree using R
# Training a decision tree in R - used in blog post:
# https://medium.com/codex/data-science-tutorials-training-a-decision-tree-using-r-d6266936d86
library(dplyr)
library(rpart)
library(rpart.plot)
library(caret)
library(Metrics)
library(ggplot2)
# Read the London bike-sharing CSV (path is relative to the working directory)
london_bike <- utils::read.csv(file = "./london_merged.csv")
# Using a function to split into train and test
#
# Splits a data frame into a randomly sampled training partition and the
# complementary test partition.
#
# Args:
#   data:       data frame to split.
#   percentage: single number in [0, 1]; fraction of rows placed in the
#               training partition (the row count is rounded).
#   seed:       value passed to set.seed() before sampling so repeated calls
#               give the same split. Defaults to 1234. Pass NULL to leave the
#               random number generator untouched.
#
# Returns:
#   A list of two data frames: the training rows (in sampled order) followed
#   by the remaining rows (in their original order). The elements are also
#   named `training` and `test`; positional access with [[1]] / [[2]] works.
train_test_split <- function(data, percentage, seed = 1234) {
  stopifnot(
    is.data.frame(data),
    is.numeric(percentage),
    length(percentage) == 1L,
    !is.na(percentage),
    percentage >= 0,
    percentage <= 1
  )

  if (!is.null(seed)) {
    set.seed(seed)
  }

  # Sample row positions directly instead of adding a temporary `id` column,
  # so an `id` column already present in `data` is neither overwritten nor
  # dropped from the result.
  n_rows <- nrow(data)
  train_idx <- sample.int(n_rows, size = round(n_rows * percentage))

  # A logical mask (rather than negative indexing) stays correct when no rows
  # are sampled: every row then ends up in the test partition.
  in_training <- seq_len(n_rows) %in% train_idx

  training_data <- data[train_idx, , drop = FALSE]
  test_data <- data[!in_training, , drop = FALSE]
  rownames(training_data) <- NULL
  rownames(test_data) <- NULL

  list(training = training_data, test = test_data)
}
# Keeping 80% for the training set.
# The split is computed once and both partitions are read from the same
# result, so they are complementary by construction instead of depending on
# two separate calls reproducing the same random draw.
split_data <- train_test_split(london_bike, 0.8)

# Subsetting only the features and target.
# One shared column vector keeps the training and test sets aligned.
model_columns <- c(
  "t1", "t2", "hum",
  "wind_speed", "weather_code",
  "is_holiday", "is_weekend",
  "season", "cnt"
)
training_data <- split_data[[1]][, model_columns]
test_data <- split_data[[2]][, model_columns]
# Fitting Decision Tree Model using rpart.
# The tree is fitted exactly once, inside system.time(), so the object that
# is plotted, timed and evaluated below is one and the same seeded fit.
set.seed(1234)
dtree_1_time <- system.time(
  dtree_1 <- rpart(
    formula = cnt ~ .,
    data = training_data,
    control = list(minbucket = 120, maxdepth = 4),
    method = "anova"
  )
)
# Plotting the Tree
prp(dtree_1)
# Timing the execution (elapsed time of the fit above)
dtree_1_time
# Assessing performance using RMSE on the held-out test set
rmse(
  test_data$cnt,
  predict(dtree_1, test_data)
)
# Adding further hyperparameters (minsplit and cp) and relaxing
# minbucket / maxdepth.
# The model is fitted once, inside system.time(), so the timing refers to the
# same seeded fit that is evaluated and plotted below.
set.seed(1234)
dtree_2_time <- system.time(
  dtree_2 <- rpart(
    formula = cnt ~ .,
    data = training_data,
    control = list(
      minbucket = 10,
      maxdepth = 10,
      minsplit = 10,
      cp = 0.001
    ),
    method = "anova"
  )
)
# Checking parameters using help function
?rpart.control
# Assessing execution time
dtree_2_time
# Assessing performance
rmse(
  test_data$cnt,
  predict(dtree_2, test_data)
)
# Plotting the extended tree
prp(dtree_2)
# Using Caret
# Seeded like the rpart fits earlier in this script so this section gives the
# same result when it is rerun on its own. The model is trained once, inside
# system.time(), instead of being trained a second time only for timing.
set.seed(1234)
caret_time <- system.time(
  caret_tree <- train(
    form = cnt ~ .,
    data = training_data,
    method = "rpart",
    control = rpart.control(
      minsplit = 10,
      minbucket = 10,
      maxdepth = 10
    )
  )
)
# Assessing execution time
caret_time
# Drawing our Tree trained with Caret
prp(caret_tree$finalModel)
# Assessing performance of Caret Tree.
# Predict through the train object rather than caret_tree$finalModel so the
# new data goes through the same formula handling caret applied when fitting.
# na.action = na.pass keeps one prediction per test row, so the result stays
# aligned with test_data$cnt.
rmse(
  test_data$cnt,
  predict(caret_tree, newdata = test_data, na.action = na.pass)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment