Forked from dylanbeaudette/help-this-poor-RF-model.R
Last active
July 5, 2021 10:30
-
-
Save thengl/1bbf4eca7739769a9a555b3b2f00e28b to your computer and use it in GitHub Desktop.
Ensemble Machine Learning showing how to combine models with applicable structures and non-linear decision trees, without under-estimating errors / producing artifacts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## orignal script: https://gist.github.com/dylanbeaudette/b386f0008133167f518960a113283a0d#file-help-this-poor-rf-model-r | |
## discussion: https://twitter.com/DylanBeaudette/status/1410666900581851138 | |
library(randomForest) | |
library(ranger) | |
library(rpart) | |
library(rms) | |
library(mlr) | |
library(forestError) | |
rmse <- function(a, b) { | |
sqrt(mean((a - b)^2)) | |
} | |
## Synthetic data ---- | |
set.seed(200) | |
n = 100 | |
x <- 1:n | |
y <- x + rnorm(n = 50, mean = 15, sd = 15) | |
# y <- x * sin(x/10) + rnorm(n = 50, mean = 15, sd = 15) | |
# plot(y ~ x) | |
m <- lm(y ~ x) | |
m.rcs <- ols(y ~ rcs(x)) | |
#rf <- randomForest(y ~ x) | |
dat <- data.frame(x,y) | |
rf = randomForest::randomForest(data.frame(x=x), y, nodesize = 5, ntree = 100, keep.inbag = TRUE) | |
rp <- rpart(y ~ x) | |
## newdata | |
d <- data.frame( | |
x = -100:200 | |
) | |
d$lm <- predict(m, newdata = d) | |
#d$rf <- predict(rf, newdata = d) | |
## Fir Ensemble Model using mlr ---- | |
lrns <- lapply(c("regr.ranger", "regr.glm", "regr.nnet"), mlr::makeLearner) | |
tsk <- mlr::makeRegrTask(data = dat, target = "y") | |
init.m <- mlr::makeStackedLearner(lrns, method = "stack.cv", super.learner = "regr.lm", resampling=mlr::makeResampleDesc(method = "CV")) | |
eml = train(init.m, tsk) | |
summary(eml$learner.model$super.model$learner.model) | |
# Residuals: | |
# Min 1Q Median 3Q Max | |
# -27.9489 -10.6332 -0.3286 11.9130 29.3213 | |
# | |
# Coefficients: | |
# Estimate Std. Error t value Pr(>|t|) | |
# (Intercept) 0.25617 4.08073 0.063 0.950076 | |
# regr.ranger -0.75309 0.18587 -4.052 0.000103 *** | |
# regr.glm 1.73970 0.19991 8.703 9.08e-14 *** | |
# regr.nnet 0.01188 0.10312 0.115 0.908515 | |
# --- | |
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 | |
# | |
# Residual standard error: 13.84 on 96 degrees of freedom | |
# Multiple R-squared: 0.8141, Adjusted R-squared: 0.8083 | |
# F-statistic: 140.2 on 3 and 96 DF, p-value: < 2.2e-16 | |
## GLM is the best model, RF and NNet can be skipped probably | |
m.train = eml$learner.model$super.model$learner.model$model | |
m.terms = eml$learner.model$super.model$learner.model$terms | |
quantregModel <- ranger::ranger(m.terms, m.train, importance="impurity", quantreg=TRUE, keep.inbag = TRUE) | |
## Estimate EML prediction error ---- | |
#quantiles = c((1-.682)/2, 1-(1-.682)/2) | |
#pr.rf = forestError::quantForestError(rf, X.train=data.frame(x=x), X.test=data.frame(x = -100:200), Y.train=y, alpha = (1-(quantiles[2]-quantiles[1]))) | |
## correction factor: | |
eml.MSE0 = matrixStats::rowSds(as.matrix(m.train[,all.vars(m.terms)[-1]]), na.rm=TRUE)^2 | |
eml.MSE = deviance(eml$learner.model$super.model$learner.model)/df.residual(eml$learner.model$super.model$learner.model) | |
## mass-preservation of MSE | |
eml.cf = eml.MSE/mean(eml.MSE0, na.rm = TRUE) | |
pred = mlr::getStackedBaseLearnerPredictions(eml, newdata=data.frame(x = -100:200)) | |
rf.sd = sqrt(matrixStats::rowSds(as.matrix(as.data.frame(pred)), na.rm=TRUE)^2 * eml.cf) | |
#pr.rf = forestError::quantForestError(quantregModel, X.train=m.train[,all.vars(m.terms)[-1]], X.test=as.data.frame(pred), Y.train=m.train[,all.vars(m.terms)[1]], alpha = (1-(quantiles[2]-quantiles[1]))) | |
d$rp <- predict(rp, newdata = d) | |
d$rcs <- predict(m.rcs, newdata = d) | |
d$rf = predict(eml, newdata = d)$data$response | |
#d$rf = pr.rf$estimates$pred | |
## RMSE | |
rmse.lm <- round(rmse(y, predict(m)), 1) | |
#rmse.rf <- round(rmse(y, predict(rf)), 1) | |
#rmse.rf <- round(mean(sqrt(pr.rf$estimates$mspe)), 1) | |
rmse.rf <- round(sqrt(eml.MSE), 1) | |
rmse.rp <- round(rmse(y, predict(rp)), 1) | |
rmse.rcs <- round(rmse(y, predict(m.rcs)), 1) | |
#rf.all <- predict(rf, d, predict.all=TRUE) | |
#sds.rf <- apply(rf.all$individual, 1, sd) | |
# leg.txt <- sprintf("%s (%s)", c('lm', 'randomForest', 'rpart', 'lm + RCS'), c(rmse.lm, rmse.rf, rmse.rp, rmse.rcs)) | |
leg.txt <- sprintf("%s (%s)", c('lm', 'Ensemble ML'), c(rmse.lm, rmse.rf)) | |
par(mar = c(0, 0, 0, 0), fg = 'black', bg = 'white') | |
plot(y ~ x, xlim = c(-25, 125), ylim = c(-50, 150), type = 'n', axes = FALSE) | |
grid() | |
points(y ~ x, cex = 1, pch = 16, las = 1) | |
lines(lm ~ x, data = d, col = 2, lwd = 2) | |
lines(rf ~ x, data = d, col = 4, lwd = 2) | |
#lines(d$x,d$rf+sds.rf,lty=2,col=4) | |
#lines(d$x,d$rf-sds.rf,lty=2,col=4) | |
#lines(d$x, pr.rf$estimates$lower_0.318, lty=2,col=4) | |
#lines(d$x, pr.rf$estimates$upper_0.318, lty=2,col=4) | |
lines(d$x, d$rf+rmse.rf+rf.sd, lty=2, col=4) | |
lines(d$x, d$rf-(rmse.rf+rf.sd), lty=2, col=4) | |
# lines(rp ~ x, data = d, col = 3, lwd = 2) | |
# lines(rcs ~ x, data = d, col = 6, lwd = 2) | |
legend('bottom', legend = leg.txt, lwd = 2, lty = 1, col = c(2, 4, 3), horiz = TRUE, title = 'RMSE') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment