## Ensemble Machine Learning: how to combine models with different applicable structures (linear and non-linear / tree-based) without under-estimating errors or producing artifacts
## original script: https://gist.github.com/dylanbeaudette/b386f0008133167f518960a113283a0d#file-help-this-poor-rf-model-r
## discussion: https://twitter.com/DylanBeaudette/status/1410666900581851138
library(randomForest)
library(ranger)
library(rpart)
library(rms)
library(mlr)
library(forestError)
rmse <- function(a, b) {
  sqrt(mean((a - b)^2))
}
## Synthetic data ----
set.seed(200)
n = 100
x <- 1:n
y <- x + rnorm(n = 50, mean = 15, sd = 15)  ## note: the 50 noise values are silently recycled to length 100
# y <- x * sin(x/10) + rnorm(n = 50, mean = 15, sd = 15)
# plot(y ~ x)
m <- lm(y ~ x)
m.rcs <- ols(y ~ rcs(x))
#rf <- randomForest(y ~ x)
dat <- data.frame(x,y)
rf = randomForest::randomForest(data.frame(x=x), y, nodesize = 5, ntree = 100, keep.inbag = TRUE)
rp <- rpart(y ~ x)
## New data: prediction grid extending well beyond the training range ----
d <- data.frame(x = -100:200)
d$lm <- predict(m, newdata = d)
#d$rf <- predict(rf, newdata = d)
## Fit Ensemble Model using mlr ----
lrns <- lapply(c("regr.ranger", "regr.glm", "regr.nnet"), mlr::makeLearner)
tsk <- mlr::makeRegrTask(data = dat, target = "y")
init.m <- mlr::makeStackedLearner(lrns, method = "stack.cv", super.learner = "regr.lm",
                                  resampling = mlr::makeResampleDesc(method = "CV"))
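## "stack.cv" fits the super learner on cross-validated (out-of-fold)
## predictions of the base learners, which protects the meta-model from
## over-fitting to the base learners' training error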
eml <- mlr::train(init.m, tsk)
summary(eml$learner.model$super.model$learner.model)
# Residuals:
# Min 1Q Median 3Q Max
# -27.9489 -10.6332 -0.3286 11.9130 29.3213
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.25617 4.08073 0.063 0.950076
# regr.ranger -0.75309 0.18587 -4.052 0.000103 ***
# regr.glm 1.73970 0.19991 8.703 9.08e-14 ***
# regr.nnet 0.01188 0.10312 0.115 0.908515
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 13.84 on 96 degrees of freedom
# Multiple R-squared: 0.8141, Adjusted R-squared: 0.8083
# F-statistic: 140.2 on 3 and 96 DF, p-value: < 2.2e-16
## The GLM receives by far the largest weight; ranger and nnet contribute little and could probably be dropped
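## Optional sanity check (a minimal sketch): the stacked prediction is just the
## super learner's linear combination of the base-learner predictions, so it
## can be reproduced by hand from the fitted coefficients:
# b <- coef(eml$learner.model$super.model$learner.model)
# pr.b <- as.data.frame(mlr::getStackedBaseLearnerPredictions(eml, newdata = dat))
# yhat <- b[1] + as.matrix(pr.b[, names(b)[-1]]) %*% b[-1]
# all.equal(as.numeric(yhat), predict(eml, newdata = dat)$data$response)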
m.train = eml$learner.model$super.model$learner.model$model
m.terms = eml$learner.model$super.model$learner.model$terms
quantregModel <- ranger::ranger(m.terms, m.train, importance="impurity", quantreg=TRUE, keep.inbag = TRUE)
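## The quantreg forest refit above offers an alternative, asymmetric interval;
## a sketch using ranger's quantile prediction (same +/-1 SD quantiles as the
## commented-out forestError approach below):
# pr.q <- predict(quantregModel, m.train, type = "quantiles",
#                 quantiles = c((1 - .682)/2, 1 - (1 - .682)/2))
# str(pr.q$predictions)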
## Estimate EML prediction error ----
#quantiles = c((1-.682)/2, 1-(1-.682)/2)
#pr.rf = forestError::quantForestError(rf, X.train=data.frame(x=x), X.test=data.frame(x = -100:200), Y.train=y, alpha = (1-(quantiles[2]-quantiles[1])))
## correction factor:
eml.MSE0 = matrixStats::rowSds(as.matrix(m.train[,all.vars(m.terms)[-1]]), na.rm=TRUE)^2
eml.MSE = deviance(eml$learner.model$super.model$learner.model)/df.residual(eml$learner.model$super.model$learner.model)
## mass-preservation of MSE
eml.cf = eml.MSE/mean(eml.MSE0, na.rm = TRUE)
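## eml.cf rescales the per-point variance among base-learner predictions so
## that, averaged over the training data, it matches the super learner's
## residual MSE ("mass preservation"); by construction:
# all.equal(mean(eml.MSE0 * eml.cf, na.rm = TRUE), eml.MSE)  # TRUE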
pred = mlr::getStackedBaseLearnerPredictions(eml, newdata=data.frame(x = -100:200))
rf.sd = sqrt(matrixStats::rowSds(as.matrix(as.data.frame(pred)), na.rm=TRUE)^2 * eml.cf)
#pr.rf = forestError::quantForestError(quantregModel, X.train=m.train[,all.vars(m.terms)[-1]], X.test=as.data.frame(pred), Y.train=m.train[,all.vars(m.terms)[1]], alpha = (1-(quantiles[2]-quantiles[1])))
d$rp <- predict(rp, newdata = d)
d$rcs <- predict(m.rcs, newdata = d)
d$rf = predict(eml, newdata = d)$data$response
#d$rf = pr.rf$estimates$pred
## RMSE
rmse.lm <- round(rmse(y, predict(m)), 1)
#rmse.rf <- round(rmse(y, predict(rf)), 1)
#rmse.rf <- round(mean(sqrt(pr.rf$estimates$mspe)), 1)
rmse.rf <- round(sqrt(eml.MSE), 1)
rmse.rp <- round(rmse(y, predict(rp)), 1)
rmse.rcs <- round(rmse(y, predict(m.rcs)), 1)
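## quick side-by-side of the error estimates (note: rmse.lm / rmse.rp / rmse.rcs
## are in-sample RMSEs, while rmse.rf comes from the super learner's CV residuals):
# data.frame(lm = rmse.lm, eml = rmse.rf, rpart = rmse.rp, rcs = rmse.rcs)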
#rf.all <- predict(rf, d, predict.all=TRUE)
#sds.rf <- apply(rf.all$individual, 1, sd)
# leg.txt <- sprintf("%s (%s)", c('lm', 'randomForest', 'rpart', 'lm + RCS'), c(rmse.lm, rmse.rf, rmse.rp, rmse.rcs))
leg.txt <- sprintf("%s (%s)", c('lm', 'Ensemble ML'), c(rmse.lm, rmse.rf))
par(mar = c(0, 0, 0, 0), fg = 'black', bg = 'white')
plot(y ~ x, xlim = c(-25, 125), ylim = c(-50, 150), type = 'n', axes = FALSE)
grid()
points(y ~ x, cex = 1, pch = 16, las = 1)
lines(lm ~ x, data = d, col = 2, lwd = 2)
lines(rf ~ x, data = d, col = 4, lwd = 2)
#lines(d$x,d$rf+sds.rf,lty=2,col=4)
#lines(d$x,d$rf-sds.rf,lty=2,col=4)
#lines(d$x, pr.rf$estimates$lower_0.318, lty=2,col=4)
#lines(d$x, pr.rf$estimates$upper_0.318, lty=2,col=4)
lines(d$x, d$rf+rmse.rf+rf.sd, lty=2, col=4)
lines(d$x, d$rf-(rmse.rf+rf.sd), lty=2, col=4)
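## the band is symmetric: prediction +/- (rmse.rf + rf.sd), i.e. the CV error
## of the super learner plus the (rescaled) spread among base learners, so it
## widens where the base learners disagree (the extrapolation areas)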
# lines(rp ~ x, data = d, col = 3, lwd = 2)
# lines(rcs ~ x, data = d, col = 6, lwd = 2)
legend('bottom', legend = leg.txt, lwd = 2, lty = 1, col = c(2, 4), horiz = TRUE, title = 'RMSE')