Last active
September 9, 2019 23:20
-
-
Save erikcs/cd3490e3d59287b1ef805ea840d5d556 to your computer and use it in GitHub Desktop.
grf vs randomForest and ranger on UCI bike data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compare randomForest and ranger (with parameters tuned by caret) against grf
# on the Bike Sharing data from the UCI ML repository (n ~ 17,000, p = 12).
# The randomForest and ranger parameters come from caret output (subsample of full data).
# @mattschaelling https://gist.github.com/mattschaelling/b592883f507fbe9b89f39451c1c49fd3
library(grf) | |
library(ranger) | |
library(tuneRanger) | |
library(randomForest) | |
### Bike Sharing Data ### | |
######################### | |
# See https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset for documentation | |
# download zipfile with data | |
library(tidyverse) | |
# Download the zipped data set into a throw-away temporary file
temp_bike <- tempfile()
download.file(
  url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip",
  destfile = temp_bike
)
# Read hour.csv straight out of the archive (na = character(): no string is
# treated as a missing value), rename the count column `cnt` to `Y`, and drop
# the columns that are not used as predictors.
bike_df <- unz(temp_bike, "hour.csv") %>%
  readr::read_csv(na = character()) %>%
  rename(Y = cnt) %>%
  select(-instant, -dteday, -registered, -casual)
# The archive is no longer needed once the data is in memory
unlink(temp_bike)
## rmse on OOB predictions on a subset of the bike data
data <- bike_df
# set.seed(123)  # uncomment to make the subsample (and the fits below) reproducible
# Subsample size, capped so it can never exceed the number of available rows
n <- min(2500, nrow(data))
# Draw n distinct rows from the *whole* data set. The earlier
# `sample(1:n, n)` only permuted the first n rows, so the "subset" was always
# the first n rows of the file rather than a random sample.
data <- data[sample(nrow(data), n), ]
# Number of distinct values in every column of the subsample
data %>% summarise_all(.funs = n_distinct)
# rf
# mtry is the one tunable param from caret (caretoutput$bestTune)
rf <- randomForest(Y ~ ., data = data, mtry = 12)
# predict() without new data is scored against the training outcome
# (the OOB RMSE this section reports)
rmse.rf.tuned <- sqrt(mean((data[["Y"]] - predict(rf))^2))
print(rmse.rf.tuned)
# ranger
# Baseline: package defaults
ranger <- ranger(Y ~ ., data = data)
rmse.ranger.default <- sqrt(mean((data[["Y"]] - ranger$predictions)^2))
print(rmse.ranger.default)

# Three tunable params from caret (caretoutput$bestTune)
ranger <- ranger(
  Y ~ .,
  data = data,
  splitrule = "extratrees",
  mtry = 12,
  min.node.size = 4
)
rmse.ranger.tuned <- sqrt(mean((data[["Y"]] - ranger$predictions)^2))
print(rmse.ranger.tuned)

# ranger with tuneRanger https://arxiv.org/pdf/1804.03515.pdf
data.task <- makeRegrTask(data = data, target = "Y")
res <- tuneRanger(
  data.task,
  num.trees = 1000,
  num.threads = 2,
  iters = 70,
  save.file.path = NULL
)
res
# NOTE(review): the values below are hard-coded rather than read from `res`;
# presumably they were copied from an earlier tuneRanger run - confirm.
ranger <- ranger(
  Y ~ .,
  data = data,
  sample.fraction = 0.705023,
  mtry = 12,
  min.node.size = 2
)
rmse.ranger.tuneRanger <- sqrt(mean((data[["Y"]] - ranger$predictions)^2))
print(rmse.ranger.tuneRanger)
# grf
# regression_forest() is given a covariate matrix X and an outcome vector Y,
# so convert the data frame to a matrix first.
datagrf <- as.matrix(data)
# Locate the outcome column by name instead of hard-coding its position, so
# the split stays correct if columns are added, removed, or reordered.
Yi <- match("Y", colnames(datagrf))
stopifnot(!is.na(Yi))
# Build the covariate matrix and outcome vector once and reuse them below;
# drop = FALSE keeps X a matrix even if only one covariate remains.
grf_X <- datagrf[, -Yi, drop = FALSE]
grf_Y <- datagrf[, Yi]

# Default settings
grf <- regression_forest(X = grf_X, Y = grf_Y)
rmse.grf.default <- sqrt(mean((grf_Y - predict(grf)$predictions)^2))
print(rmse.grf.default)

# Built-in parameter tuning
grf <- regression_forest(X = grf_X, Y = grf_Y, tune.parameters = TRUE)
rmse.grf.tuned <- sqrt(mean((grf_Y - predict(grf)$predictions)^2))
print(rmse.grf.tuned)

# Built-in tuning with 10 000 trees
grf <- regression_forest(
  X = grf_X, Y = grf_Y,
  tune.parameters = TRUE, num.trees = 10000
)
rmse.grf.tuned.10k.num.trees <- sqrt(mean((grf_Y - predict(grf)$predictions)^2))
print(rmse.grf.tuned.10k.num.trees)

# Built-in tuning with adjusted tuning settings.
# NOTE(review): logical `tune.parameters` together with `num.fit.trees` /
# `num.fit.reps` looks like the tuning interface of an older grf release -
# confirm against the installed version.
grf <- regression_forest(
  X = grf_X, Y = grf_Y,
  tune.parameters = TRUE, num.fit.trees = 150,
  num.fit.reps = 150
)
rmse.grf.tuned.tweaked <- sqrt(mean((grf_Y - predict(grf)$predictions)^2))
print(rmse.grf.tuned.tweaked)

# Honesty switched off
grf <- regression_forest(X = grf_X, Y = grf_Y, honesty = FALSE)
rmse.grf.dishonest <- sqrt(mean((grf_Y - predict(grf)$predictions)^2))
print(rmse.grf.dishonest)

# Honesty switched off, with built-in tuning
grf <- regression_forest(
  X = grf_X, Y = grf_Y,
  honesty = FALSE, tune.parameters = TRUE
)
rmse.grf.dishonest.tuned <- sqrt(mean((grf_Y - predict(grf)$predictions)^2))
print(rmse.grf.dishonest.tuned)
# Summary
# One row per model variant, holding the RMSE computed above
t(data.frame(
  rmse.rf.tuned = rmse.rf.tuned,
  rmse.ranger.default = rmse.ranger.default,
  rmse.ranger.tuned = rmse.ranger.tuned,
  rmse.ranger.tuneRanger = rmse.ranger.tuneRanger,
  rmse.grf.default = rmse.grf.default,
  rmse.grf.tuned = rmse.grf.tuned,
  rmse.grf.tuned.10k.num.trees = rmse.grf.tuned.10k.num.trees,
  rmse.grf.tuned.tweaked = rmse.grf.tuned.tweaked,
  rmse.grf.dishonest = rmse.grf.dishonest,
  rmse.grf.dishonest.tuned = rmse.grf.dishonest.tuned
))
# Recorded output of one earlier run (no seed is set in this script, so the
# exact numbers will differ from run to run):
# rmse.rf.tuned 21.63832
# rmse.ranger.default 33.60643
# rmse.ranger.tuned 23.19322
# rmse.ranger.tuneRanger 21.97965
# rmse.grf.default 29.42733
# rmse.grf.tuned 28.18083
# rmse.grf.tuned.10k.num.trees 28.42887
# rmse.grf.tuned.tweaked 25.60646
# rmse.grf.dishonest 23.21979
# rmse.grf.dishonest.tuned 23.46529
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment