## Using Machine Learning to help decrease over-fitting and extrapolation problems in regression modeling
## original script: https://gist.github.com/dylanbeaudette/b386f0008133167f518960a113283a0d#file-help-this-poor-rf-model-r
## discussion: https://twitter.com/DylanBeaudette/status/1410666900581851138
## by: tom.hengl@opengeohub.org
library(rpart)
library(rms)
library(ranger)
library(kernlab)
library(mboost)
library(landmap)
library(mlr)
library(forestError)
rmse <- function(a, b) {
  sqrt(mean((a - b)^2))
}
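## worked example (illustrative): rmse(c(0, 0), c(3, 4)) = sqrt((9 + 16)/2) ~ 3.54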
## Synthetic data ----
set.seed(200)
n = 100
x <- 1:n
## note: rnorm() returns only 50 draws here, which R silently recycles to
## length 100, so the noise pattern in the second half of y repeats the first
y <- x + rnorm(n = 50, mean = 15, sd = 15)
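## sanity check: because the noise vector is recycled, the second half of y
## duplicates the first half's residual pattern
# all.equal(y[1:50] - x[1:50], y[51:100] - x[51:100])  ## TRUE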
## Synthetic linear model ----
m0 <- lm(y ~ x)
summary(m0)
# Residuals:
# Min 1Q Median 3Q Max
# -27.6093 -6.4396 0.6437 6.7742 26.7192
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 14.84655 2.37953 6.239 1.12e-08 ***
# x 0.97910 0.04091 23.934 < 2e-16 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 11.81 on 98 degrees of freedom
# Multiple R-squared: 0.8539, Adjusted R-squared: 0.8524
# F-statistic: 572.9 on 1 and 98 DF, p-value: < 2.2e-16
dat <- data.frame(x,y)
## newdata: prediction locations extending well beyond the training range (1:100)
newdata <- data.frame(x = -100:200)
newdata$y.lm <- predict(m0, newdata = newdata)
## Synthetic RF ----
rf = randomForest::randomForest(data.frame(x=x), y, nodesize = 5, ntree = 100, keep.inbag = TRUE)
quantiles = c((1-.682)/2, 1-(1-.682)/2)  ## ~68.2% interval, i.e. +/- 1 SD under Gaussian errors
## prediction error from forestError:
pr.rf = forestError::quantForestError(rf, X.train = data.frame(x = x),
  X.test = data.frame(x = -100:200), Y.train = y,
  alpha = (1 - (quantiles[2] - quantiles[1])))
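## the returned list carries a data frame of point predictions and interval
## bounds (plus error estimates such as MSPE); worth inspecting before plotting:
# str(pr.rf$estimates)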
newdata$y.rf <- predict(rf, newdata = newdata)
## RMSE
rmse.lm <- round(rmse(y, predict(m0)), 1)
rmse.rf <- round(rmse(y, predict(rf)), 1)
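## note: predict(rf) without newdata returns out-of-bag predictions, so
## rmse.rf above is an honest error estimate; re-predicting on the training
## data shows how much RF over-fits in-sample (illustrative check):
# rmse(y, predict(rf, newdata = data.frame(x = x)))  ## much lower than rmse.rf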
leg.txt <- sprintf("%s (%s)", c('lm', 'RF'), c(rmse.lm, rmse.rf))
par(mar = c(0, 0, 0, 0), fg = 'black', bg = 'white')
plot(y ~ x, xlim = c(-25, 125), ylim = c(-50, 150), type = 'n', axes = FALSE)
grid()
points(y ~ x, cex = 1, pch = 16, las = 1)
lines(y.lm ~ x, data = newdata, col = 2, lwd = 2)
lines(y.rf ~ x, data = newdata, col = 4, lwd = 2)
lines(newdata$x, pr.rf$estimates$lower_0.318, lty=2,col=4)
lines(newdata$x, pr.rf$estimates$upper_0.318, lty=2,col=4)
legend('bottom', legend = leg.txt, lwd = 2, lty = 1, col = c(2, 4), horiz = TRUE, title = 'RMSE')
## Fit Ensemble ML using mlr ----
SL.library = c("regr.ranger", "regr.glm", "regr.gamboost", "regr.ksvm")
lrns <- lapply(SL.library, mlr::makeLearner)
tsk <- mlr::makeRegrTask(data = dat, target = "y")
init.m <- mlr::makeStackedLearner(lrns, method = "stack.cv", super.learner = "regr.lm",
  resampling = mlr::makeResampleDesc(method = "CV"))
eml = train(init.m, tsk)
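## with method = "stack.cv", the lm super learner is fitted on the base
## learners' out-of-fold (cross-validated) predictions, which keeps the
## meta-model from inheriting the base learners' over-fitting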
summary(eml$learner.model$super.model$learner.model)
# Residuals:
# Min 1Q Median 3Q Max
# -25.8665 -7.1403 0.0125 6.9210 27.7502
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -2.3088 2.8341 -0.815 0.4173
# regr.ranger -0.4184 0.2201 -1.901 0.0603 .
# regr.glm 4.7697 0.9230 5.168 1.31e-06 ***
# regr.gamboost -4.1556 1.0181 -4.082 9.31e-05 ***
# regr.ksvm 0.8295 0.3419 2.426 0.0171 *
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 10.89 on 95 degrees of freedom
# Multiple R-squared: 0.8796, Adjusted R-squared: 0.8745
# F-statistic: 173.5 on 4 and 95 DF, p-value: < 2.2e-16
## GLM carries by far the largest positive weight in the meta-learner;
## ranger and ksvm contribute only marginally
## Synthetic prediction error ----
m.train = eml$learner.model$super.model$learner.model
m.terms = eml$learner.model$super.model$learner.model$terms
newdata$y.eml = predict(eml, newdata = newdata)$data$response
## per-point variance of the base-learner predictions on the training data:
eml.MSE0 = matrixStats::rowSds(as.matrix(m.train$model[,all.vars(m.terms)[-1]]), na.rm=TRUE)^2
## residual MSE of the meta-learner:
eml.MSE = deviance(m.train)/df.residual(m.train)
## correction factor (mass preservation): rescale the base-learner variances
## so their mean matches the meta-learner MSE
eml.cf = eml.MSE/mean(eml.MSE0, na.rm = TRUE)
eml.cf
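## mass-preservation check (illustrative): after scaling by eml.cf, the mean
## per-point base-learner variance equals the meta-learner MSE
# mean(eml.MSE0 * eml.cf)  ## == eml.MSE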
pred = mlr::getStackedBaseLearnerPredictions(eml, newdata=data.frame(x = -100:200))
## model-disagreement SD at the prediction locations, rescaled by the correction factor:
rf.sd = sqrt(matrixStats::rowSds(as.matrix(as.data.frame(pred)), na.rm=TRUE)^2 * eml.cf)
rmse.eml <- round(sqrt(eml.MSE), 1)
## Plot prediction interval:
leg.txt <- sprintf("%s (%s)", c('lm', 'EML'), c(rmse.lm, rmse.eml))
par(mar = c(0, 0, 0, 0), fg = 'black', bg = 'white')
plot(y ~ x, xlim = c(-25, 125), ylim = c(-50, 150), type = 'n', axes = FALSE)
grid()
points(y ~ x, cex = 1, pch = 16, las = 1)
lines(y.lm ~ x, data = newdata, col = 2, lwd = 2)
lines(y.eml ~ x, data = newdata, col = 4, lwd = 2)
lines(newdata$x, newdata$y.eml+rmse.eml+rf.sd, lty=2, col=4)
lines(newdata$x, newdata$y.eml-(rmse.eml+rf.sd), lty=2, col=4)
legend('bottom', legend = leg.txt, lwd = 2, lty = 1, col = c(2, 4), horiz = TRUE, title = 'RMSE')
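## illustrative check: the model-disagreement term rf.sd widens outside the
## training range (x in 1:100), which is what inflates the error band under
## extrapolation
# range(rf.sd[newdata$x >= 1 & newdata$x <= 100])
# range(rf.sd[newdata$x < 1 | newdata$x > 100])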
## Meuse data set ----
library(rgdal)  ## also loads sp; demo(meuse) creates the meuse points and meuse.grid
demo(meuse, echo = FALSE)
plot(meuse.grid["dist"])
points(meuse, pch=20, col="white")
dev.off()
# select learners
SL2.library = c("regr.ranger", "regr.cubist", "regr.gamboost")
meuse.lrns <- lapply(SL2.library, mlr::makeLearner)
meuse.dat = meuse@data[,c("zinc","dist")]
meuse.tsk <- mlr::makeRegrTask(data = meuse.dat, target = "zinc")
meuse.init.m <- mlr::makeStackedLearner(meuse.lrns, method = "stack.cv", super.learner = "regr.lm",
  resampling = mlr::makeResampleDesc(method = "CV"))
eml.m <- train(meuse.init.m, meuse.tsk)
summary(eml.m$learner.model$super.model$learner.model)
## compare with linear model:
lm.m0 <- lm(zinc ~ dist, meuse@data)
summary(lm.m0)
## R-squared: 0.41
rmse2.lm <- round(rmse(meuse$zinc, predict(lm.m0)), 1)
## Meuse prediction errors ----
meuse.train = eml.m$learner.model$super.model$learner.model
meuse.terms = eml.m$learner.model$super.model$learner.model$terms
summary(meuse$dist)
## predict slightly beyond the observed range of dist (max ~0.88) to show extrapolation:
m.newdata = data.frame(dist = (0:120)/100)
m.newdata$y.eml = predict(eml.m, newdata = m.newdata)$data$response
m.eml.MSE0 = matrixStats::rowSds(as.matrix(meuse.train$model[,all.vars(meuse.terms)[-1]]), na.rm=TRUE)^2
m.eml.MSE = deviance(meuse.train)/df.residual(meuse.train)
## correction factor / mass-preservation of MSE
m.eml.cf = m.eml.MSE/mean(m.eml.MSE0, na.rm = TRUE)
m.eml.cf
m.pred = mlr::getStackedBaseLearnerPredictions(eml.m, newdata=m.newdata)
m.rf.sd = sqrt(matrixStats::rowSds(as.matrix(as.data.frame(m.pred)), na.rm=TRUE)^2 * m.eml.cf)
m.rmse.eml <- round(sqrt(m.eml.MSE), 1)
## Plot prediction interval:
leg.txt <- sprintf("%s (%s)", c('lm', 'EML'), c(rmse2.lm, m.rmse.eml))
par(fg = 'black', bg = 'white') ## mar = c(0, 0, 0, 0)
plot(zinc ~ dist, meuse@data, xlim = c(0, 1.2), ylim = c(100, 1950)) #type = 'n'
grid()
points(zinc ~ dist, meuse@data, cex = 1, pch = 16, las = 1)
#lines(y.lm ~ dist, data = m.newdata, col = 2, lwd = 2)
lines(y.eml ~ dist, data = m.newdata, col = 4, lwd = 2)
lines(m.newdata$dist, m.newdata$y.eml+m.rmse.eml+m.rf.sd, lty=2, col=4)
lines(m.newdata$dist, m.newdata$y.eml-(m.rmse.eml+m.rf.sd), lty=2, col=4)
legend('top', legend = leg.txt, lwd = 2, lty = 1, col = c(2, 4), horiz = TRUE, title = 'RMSE')