thengl/help-this-poor-RF-model.R

## help-this-poor-RF-model.R
## orignal script: https://gist.github.com/dylanbeaudette/b386f0008133167f518960a113283a0d#file-help-this-poor-rf-model-r
## discussion: https://twitter.com/DylanBeaudette/status/1410666900581851138

library(randomForest)
library(ranger)
library(rpart)
library(rms)
library(mlr)
library(forestError)
rmse <- function(a, b) {
  sqrt(mean((a - b)^2))
}

## Synthetic data ----
set.seed(200)
n = 100
x <- 1:n
y <- x + rnorm(n = 50, mean = 15, sd = 15)
# y <- x * sin(x/10) + rnorm(n = 50, mean = 15, sd = 15)
# plot(y ~ x)
m <- lm(y ~ x)
m.rcs <- ols(y ~ rcs(x))
#rf <- randomForest(y ~ x)
dat <- data.frame(x,y)
rf = randomForest::randomForest(data.frame(x=x), y, nodesize = 5, ntree = 100, keep.inbag = TRUE)
rp <- rpart(y ~ x)
## newdata
d <- data.frame(
  x = -100:200
)
d$lm <- predict(m, newdata = d)
#d$rf <- predict(rf, newdata = d)

## Fir Ensemble Model using mlr ----
lrns <- lapply(c("regr.ranger", "regr.glm", "regr.nnet"), mlr::makeLearner)
tsk <- mlr::makeRegrTask(data = dat, target = "y")
init.m <- mlr::makeStackedLearner(lrns, method = "stack.cv", super.learner = "regr.lm", resampling=mlr::makeResampleDesc(method = "CV"))
eml = train(init.m, tsk)
summary(eml$learner.model$super.model$learner.model)
# Residuals:
#   Min       1Q   Median       3Q      Max
# -27.9489 -10.6332  -0.3286  11.9130  29.3213
#
# Coefficients:
#   Estimate Std. Error t value Pr(>|t|)
# (Intercept)  0.25617    4.08073   0.063 0.950076
# regr.ranger -0.75309    0.18587  -4.052 0.000103 ***
#   regr.glm     1.73970    0.19991   8.703 9.08e-14 ***
#   regr.nnet    0.01188    0.10312   0.115 0.908515
# ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 13.84 on 96 degrees of freedom
# Multiple R-squared:  0.8141,	Adjusted R-squared:  0.8083
# F-statistic: 140.2 on 3 and 96 DF,  p-value: < 2.2e-16
## GLM is the best model, RF and NNet can be skipped probably
m.train = eml$learner.model$super.model$learner.model$model
m.terms = eml$learner.model$super.model$learner.model$terms
quantregModel <- ranger::ranger(m.terms, m.train, importance="impurity", quantreg=TRUE, keep.inbag = TRUE)

## Estimate EML prediction error ----
#quantiles = c((1-.682)/2, 1-(1-.682)/2)
#pr.rf = forestError::quantForestError(rf, X.train=data.frame(x=x), X.test=data.frame(x = -100:200), Y.train=y, alpha = (1-(quantiles[2]-quantiles[1])))

## correction factor:
eml.MSE0 = matrixStats::rowSds(as.matrix(m.train[,all.vars(m.terms)[-1]]), na.rm=TRUE)^2
eml.MSE = deviance(eml$learner.model$super.model$learner.model)/df.residual(eml$learner.model$super.model$learner.model)
## mass-preservation of MSE
eml.cf = eml.MSE/mean(eml.MSE0, na.rm = TRUE)
pred = mlr::getStackedBaseLearnerPredictions(eml, newdata=data.frame(x = -100:200))
rf.sd = sqrt(matrixStats::rowSds(as.matrix(as.data.frame(pred)), na.rm=TRUE)^2 * eml.cf)
#pr.rf = forestError::quantForestError(quantregModel, X.train=m.train[,all.vars(m.terms)[-1]], X.test=as.data.frame(pred), Y.train=m.train[,all.vars(m.terms)[1]], alpha = (1-(quantiles[2]-quantiles[1])))
d$rp <- predict(rp, newdata = d)
d$rcs <- predict(m.rcs, newdata = d)
d$rf = predict(eml, newdata = d)$data$response
#d$rf = pr.rf$estimates$pred

## RMSE
rmse.lm <- round(rmse(y, predict(m)), 1)
#rmse.rf <- round(rmse(y, predict(rf)), 1)
#rmse.rf <- round(mean(sqrt(pr.rf$estimates$mspe)), 1)
rmse.rf <- round(sqrt(eml.MSE), 1)
rmse.rp <- round(rmse(y, predict(rp)), 1)
rmse.rcs <- round(rmse(y, predict(m.rcs)), 1)

#rf.all <- predict(rf, d, predict.all=TRUE)
#sds.rf <-  apply(rf.all$individual, 1, sd)
# leg.txt <- sprintf("%s (%s)", c('lm', 'randomForest', 'rpart', 'lm + RCS'), c(rmse.lm, rmse.rf, rmse.rp, rmse.rcs))
leg.txt <- sprintf("%s (%s)", c('lm', 'Ensemble ML'), c(rmse.lm, rmse.rf))
par(mar = c(0, 0, 0, 0), fg = 'black', bg = 'white')
plot(y ~ x, xlim = c(-25, 125), ylim = c(-50, 150), type = 'n', axes = FALSE)
grid()
points(y ~ x, cex = 1, pch = 16, las = 1)
lines(lm ~ x, data = d, col = 2, lwd = 2)
lines(rf ~ x, data = d, col = 4, lwd = 2)
#lines(d$x,d$rf+sds.rf,lty=2,col=4)
#lines(d$x,d$rf-sds.rf,lty=2,col=4)
#lines(d$x, pr.rf$estimates$lower_0.318, lty=2,col=4)
#lines(d$x, pr.rf$estimates$upper_0.318, lty=2,col=4)
lines(d$x, d$rf+rmse.rf+rf.sd, lty=2, col=4)
lines(d$x, d$rf-(rmse.rf+rf.sd), lty=2, col=4)
# lines(rp ~ x, data = d, col = 3, lwd = 2)
# lines(rcs ~ x, data = d, col = 6, lwd = 2)
legend('bottom', legend = leg.txt, lwd = 2, lty = 1, col = c(2, 4, 3), horiz = TRUE, title = 'RMSE')
	## orignal script: https://gist.github.com/dylanbeaudette/b386f0008133167f518960a113283a0d#file-help-this-poor-rf-model-r
	## discussion: https://twitter.com/DylanBeaudette/status/1410666900581851138

	library(randomForest)
	library(ranger)
	library(rpart)
	library(rms)
	library(mlr)
	library(forestError)
	rmse <- function(a, b) {
	sqrt(mean((a - b)^2))
	}

	## Synthetic data ----
	set.seed(200)
	n = 100
	x <- 1:n
	y <- x + rnorm(n = 50, mean = 15, sd = 15)
	# y <- x * sin(x/10) + rnorm(n = 50, mean = 15, sd = 15)
	# plot(y ~ x)
	m <- lm(y ~ x)
	m.rcs <- ols(y ~ rcs(x))
	#rf <- randomForest(y ~ x)
	dat <- data.frame(x,y)
	rf = randomForest::randomForest(data.frame(x=x), y, nodesize = 5, ntree = 100, keep.inbag = TRUE)
	rp <- rpart(y ~ x)
	## newdata
	d <- data.frame(
	x = -100:200
	)
	d$lm <- predict(m, newdata = d)
	#d$rf <- predict(rf, newdata = d)

	## Fir Ensemble Model using mlr ----
	lrns <- lapply(c("regr.ranger", "regr.glm", "regr.nnet"), mlr::makeLearner)
	tsk <- mlr::makeRegrTask(data = dat, target = "y")
	init.m <- mlr::makeStackedLearner(lrns, method = "stack.cv", super.learner = "regr.lm", resampling=mlr::makeResampleDesc(method = "CV"))
	eml = train(init.m, tsk)
	summary(eml$learner.model$super.model$learner.model)
	# Residuals:
	# Min 1Q Median 3Q Max
	# -27.9489 -10.6332 -0.3286 11.9130 29.3213
	#
	# Coefficients:
	# Estimate Std. Error t value Pr(>\|t\|)
	# (Intercept) 0.25617 4.08073 0.063 0.950076
	# regr.ranger -0.75309 0.18587 -4.052 0.000103 ***
	# regr.glm 1.73970 0.19991 8.703 9.08e-14 ***
	# regr.nnet 0.01188 0.10312 0.115 0.908515
	# ---
	# Signif. codes: 0 ‘*’ 0.001 ‘’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
	#
	# Residual standard error: 13.84 on 96 degrees of freedom
	# Multiple R-squared: 0.8141, Adjusted R-squared: 0.8083
	# F-statistic: 140.2 on 3 and 96 DF, p-value: < 2.2e-16
	## GLM is the best model, RF and NNet can be skipped probably
	m.train = eml$learner.model$super.model$learner.model$model
	m.terms = eml$learner.model$super.model$learner.model$terms
	quantregModel <- ranger::ranger(m.terms, m.train, importance="impurity", quantreg=TRUE, keep.inbag = TRUE)

	## Estimate EML prediction error ----
	#quantiles = c((1-.682)/2, 1-(1-.682)/2)
	#pr.rf = forestError::quantForestError(rf, X.train=data.frame(x=x), X.test=data.frame(x = -100:200), Y.train=y, alpha = (1-(quantiles[2]-quantiles[1])))

	## correction factor:
	eml.MSE0 = matrixStats::rowSds(as.matrix(m.train[,all.vars(m.terms)[-1]]), na.rm=TRUE)^2
	eml.MSE = deviance(eml$learner.model$super.model$learner.model)/df.residual(eml$learner.model$super.model$learner.model)
	## mass-preservation of MSE
	eml.cf = eml.MSE/mean(eml.MSE0, na.rm = TRUE)
	pred = mlr::getStackedBaseLearnerPredictions(eml, newdata=data.frame(x = -100:200))
	rf.sd = sqrt(matrixStats::rowSds(as.matrix(as.data.frame(pred)), na.rm=TRUE)^2 * eml.cf)
	#pr.rf = forestError::quantForestError(quantregModel, X.train=m.train[,all.vars(m.terms)[-1]], X.test=as.data.frame(pred), Y.train=m.train[,all.vars(m.terms)[1]], alpha = (1-(quantiles[2]-quantiles[1])))
	d$rp <- predict(rp, newdata = d)
	d$rcs <- predict(m.rcs, newdata = d)
	d$rf = predict(eml, newdata = d)$data$response
	#d$rf = pr.rf$estimates$pred

	## RMSE
	rmse.lm <- round(rmse(y, predict(m)), 1)
	#rmse.rf <- round(rmse(y, predict(rf)), 1)
	#rmse.rf <- round(mean(sqrt(pr.rf$estimates$mspe)), 1)
	rmse.rf <- round(sqrt(eml.MSE), 1)
	rmse.rp <- round(rmse(y, predict(rp)), 1)
	rmse.rcs <- round(rmse(y, predict(m.rcs)), 1)

	#rf.all <- predict(rf, d, predict.all=TRUE)
	#sds.rf <- apply(rf.all$individual, 1, sd)
	# leg.txt <- sprintf("%s (%s)", c('lm', 'randomForest', 'rpart', 'lm + RCS'), c(rmse.lm, rmse.rf, rmse.rp, rmse.rcs))
	leg.txt <- sprintf("%s (%s)", c('lm', 'Ensemble ML'), c(rmse.lm, rmse.rf))
	par(mar = c(0, 0, 0, 0), fg = 'black', bg = 'white')
	plot(y ~ x, xlim = c(-25, 125), ylim = c(-50, 150), type = 'n', axes = FALSE)
	grid()
	points(y ~ x, cex = 1, pch = 16, las = 1)
	lines(lm ~ x, data = d, col = 2, lwd = 2)
	lines(rf ~ x, data = d, col = 4, lwd = 2)
	#lines(d$x,d$rf+sds.rf,lty=2,col=4)
	#lines(d$x,d$rf-sds.rf,lty=2,col=4)
	#lines(d$x, pr.rf$estimates$lower_0.318, lty=2,col=4)
	#lines(d$x, pr.rf$estimates$upper_0.318, lty=2,col=4)
	lines(d$x, d$rf+rmse.rf+rf.sd, lty=2, col=4)
	lines(d$x, d$rf-(rmse.rf+rf.sd), lty=2, col=4)
	# lines(rp ~ x, data = d, col = 3, lwd = 2)
	# lines(rcs ~ x, data = d, col = 6, lwd = 2)
	legend('bottom', legend = leg.txt, lwd = 2, lty = 1, col = c(2, 4, 3), horiz = TRUE, title = 'RMSE')