Skip to content

Instantly share code, notes, and snippets.

@erikcs
Last active September 9, 2019 23:20
Show Gist options
  • Save erikcs/cd3490e3d59287b1ef805ea840d5d556 to your computer and use it in GitHub Desktop.
grf vs randomForest and ranger on UCI bike data
# randomForest and ranger with parameters tuned by caret vs grf on
# bike data from UCI ML database (n = 17,000, p = 12)
# randomForest and ranger parameters from caret output (subsample of full data)
# @mattschaelling https://gist.github.com/mattschaelling/b592883f507fbe9b89f39451c1c49fd3
library(grf)
library(ranger)
library(tuneRanger)
library(randomForest)
### Bike Sharing Data ###
#########################
# Documentation: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset
library(tidyverse)
# Download the zipped dataset into a temporary file
temp_bike <- tempfile()
download.file(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip",
  temp_bike
)
# Read the hourly data directly out of the zip archive
bike_df <- readr::read_csv(unz(temp_bike, "hour.csv"), na = character())
# Use the total rental count as the outcome; drop the row id, the date,
# and the two columns (registered, casual) that sum to the outcome
bike_df <- bike_df %>%
  rename(Y = cnt) %>%
  select(-instant, -dteday, -registered, -casual)
# Clean up the downloaded zip file
unlink(temp_bike)
## rmse on OOB predictions on a subset of the bike data
data <- bike_df
# set.seed(123)  # uncomment for a reproducible subsample
n <- 2500
# Draw n rows at random from the FULL data set.
# BUG FIX: the original `sample(1:n, n)` only permuted the first n row
# indices, so it took the chronologically first 2500 hours (in random
# order) rather than a random subset of all ~17k rows.
data <- data[sample(nrow(data), n), ]
# Sanity check: number of distinct values per column
data %>% summarise_all(.funs = n_distinct)
# randomForest with the one tunable parameter reported by caret
# (caretoutput$bestTune)
rf <- randomForest(Y ~ ., data = data, mtry = 12)
rmse.rf.tuned <- sqrt(mean((data$Y - predict(rf))^2))
print(rmse.rf.tuned)
# ranger, out of the box (avoid shadowing the ranger() function
# with the fitted object)
fit_ranger_default <- ranger(Y ~ ., data = data)
rmse.ranger.default <- sqrt(mean((data$Y - fit_ranger_default$predictions)^2))
print(rmse.ranger.default)
# ranger with the three caret-tuned parameters (caretoutput$bestTune)
fit_ranger_tuned <- ranger(Y ~ ., data = data,
                           splitrule = "extratrees", mtry = 12,
                           min.node.size = 4)
rmse.ranger.tuned <- sqrt(mean((data$Y - fit_ranger_tuned$predictions)^2))
print(rmse.ranger.tuned)
# ranger tuned with tuneRanger, https://arxiv.org/pdf/1804.03515.pdf
data.task <- makeRegrTask(data = data, target = "Y")
res <- tuneRanger(data.task, num.trees = 1000,
                  num.threads = 2, iters = 70, save.file.path = NULL)
res
# Refit with the parameters recommended by THIS tuning run, instead of
# hard-coding values copied from one earlier run (which would be stale
# whenever the subsample or tuning budget changes).
pars <- res$recommended.pars
ranger <- ranger(Y ~ ., data = data,
                 sample.fraction = pars$sample.fraction,
                 mtry = pars$mtry,
                 min.node.size = pars$min.node.size)
print(rmse.ranger.tuneRanger <- sqrt(mean((data$Y - ranger$predictions)^2)))
# grf: regression_forest expects a numeric X matrix and Y vector
datagrf <- as.matrix(data)
# Locate the outcome column by name instead of hard-coding index 13,
# so this keeps working if columns are added or reordered upstream.
Yi <- which(colnames(datagrf) == "Y")
# Default settings
grf <- regression_forest(X = datagrf[, -Yi], Y = datagrf[, Yi])
print(rmse.grf.default <- sqrt(mean((datagrf[, Yi] - predict(grf)$predictions)^2)))
# With grf's built-in parameter tuning
grf <- regression_forest(X = datagrf[, -Yi], Y = datagrf[, Yi], tune.parameters = TRUE)
print(rmse.grf.tuned <- sqrt(mean((datagrf[, Yi] - predict(grf)$predictions)^2)))
# Tuned, with a much larger ensemble
grf <- regression_forest(X = datagrf[, -Yi], Y = datagrf[, Yi], tune.parameters = TRUE, num.trees = 10000)
print(rmse.grf.tuned.10k.num.trees <- sqrt(mean((datagrf[, Yi] - predict(grf)$predictions)^2)))
# Tuned with a larger tuning budget (more/deeper pilot forests)
grf <- regression_forest(X = datagrf[, -Yi], Y = datagrf[, Yi],
                         tune.parameters = TRUE, num.fit.trees = 150,
                         num.fit.reps = 150)
print(rmse.grf.tuned.tweaked <- sqrt(mean((datagrf[, Yi] - predict(grf)$predictions)^2)))
# Honesty disabled: all data used for both splitting and estimation,
# which typically lowers OOB RMSE on small samples
grf <- regression_forest(X = datagrf[, -Yi], Y = datagrf[, Yi], honesty = FALSE)
print(rmse.grf.dishonest <- sqrt(mean((datagrf[, Yi] - predict(grf)$predictions)^2)))
# Honesty disabled + tuning
grf <- regression_forest(X = datagrf[, -Yi], Y = datagrf[, Yi], honesty = FALSE, tune.parameters = TRUE)
print(rmse.grf.dishonest.tuned <- sqrt(mean((datagrf[, Yi] - predict(grf)$predictions)^2)))
# Summary: transpose the one-row data frame so each model's OOB RMSE
# prints as its own labeled row
t(data.frame(
rmse.rf.tuned, rmse.ranger.default, rmse.ranger.tuned, rmse.ranger.tuneRanger,
rmse.grf.default, rmse.grf.tuned, rmse.grf.tuned.10k.num.trees, rmse.grf.tuned.tweaked,
rmse.grf.dishonest, rmse.grf.dishonest.tuned
))
# Recorded output from one run (no seed set, so results vary):
# rmse.rf.tuned 21.63832
# rmse.ranger.default 33.60643
# rmse.ranger.tuned 23.19322
# rmse.ranger.tuneRanger 21.97965
# rmse.grf.default 29.42733
# rmse.grf.tuned 28.18083
# rmse.grf.tuned.10k.num.trees 28.42887
# rmse.grf.tuned.tweaked 25.60646
# rmse.grf.dishonest 23.21979
# rmse.grf.dishonest.tuned 23.46529
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment