@vankesteren
Last active November 9, 2016 14:56
Small boosting demonstration for simple linear regression
# Little boosting algorithm demonstration
# Base learner: simple OLS regression with 1 predictor
# Preamble ----------------------------------------------------------------
rm(list = ls())
set.seed(3665364)
# create loss function: l2 loss (sum of squared errors)
lossfunc <- function(y, yhat) {
  (y - yhat) %*% (y - yhat)
}
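# (Not in the original gist:) a quick sanity check of the loss function with
# made-up numbers: only one residual is nonzero (2 - 1 = 1), so the l2 loss
# should equal 1
stopifnot(lossfunc(c(1, 2, 3), c(1, 1, 3)) == 1)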
# Generate dataset --------------------------------------------------------
# predictors: an intercept column plus 49 slightly (0.1) correlated variables
# true coefficients: a random ordering of -0.3, 0.5, 0.1, 0.9 and 0 (10 each)
# error: normally distributed with mean 0 and sd 3
library(mvtnorm)
sigma <- matrix(data = 0.1, nrow = 49, ncol = 49)
diag(sigma) <- 1
X <- cbind(rep(1, 100), rmvnorm(100, sigma = sigma))
truecoef <- sample(rep(c(-0.3, 0.5, 0.1, 0.9, 0), 10))
y <- as.vector(truecoef %*% t(X) + rnorm(100, sd = 3))
# GOAL: predict y with X
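# (Not in the original gist:) optional dimension checks on the generated data:
# X should be 100 x 50 (intercept plus 49 predictors), with one true
# coefficient per column
stopifnot(dim(X) == c(100, 50), length(truecoef) == ncol(X), length(y) == 100)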
# Boosting ----------------------------------------------------------------
# Algorithm hyperparameters
rate <- 0.1 # learning rate. Protects against overfitting
tolerance <- 0.0001 # stop when the loss decreases by less than this per iteration
maxiter <- 1000 # maximum number of boosting iterations
# Initialisation stage (first iteration of algorithm with only intercept)
coef <- numeric(ncol(X)) # the 50 coefficients to estimate, all starting at 0
lossplot <- numeric(0) # total loss at each iteration, for plotting later
fit <- lm(y~1) # fit intercept-only model
coef[1] <- unname(coef(fit)) # save coefficient
yres <- y - predict(fit) # calculate residuals (proportional to the negative
                         # gradient of the l2 loss w.r.t. the predictions)
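# (Not in the original gist:) a quick numerical illustration that the residuals
# are proportional to the negative gradient of the l2 loss: a finite-difference
# approximation of the derivative w.r.t. the first fitted value should be close
# to -2 * yres[1]. The names eps and num_grad1 are introduced only for this check.
eps <- 1e-6
num_grad1 <- (lossfunc(y, predict(fit) + c(eps, rep(0, 99))) -
                lossfunc(y, predict(fit))) / eps
stopifnot(abs(num_grad1 - (-2 * yres[1])) < 1e-3)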
loss_tot <- as.numeric(lossfunc(y, predict(fit))) # loss of the whole model
delta <- tolerance + 1 # initialise delta to something larger than tolerance
j <- 1 # iteration counter
# Start the algorithm
while (delta > tolerance && j < maxiter) {
  # Fit each of the 50 candidate base learners to the current residuals and
  # calculate its loss
  loss <- numeric(ncol(X))
  for (i in 1:ncol(X)) {
    loss[i] <- lossfunc(yres, predict(lm(yres ~ 0 + X[, i])))
  }
  # Select the base learner with the lowest loss
  best <- which.min(loss)
  # Add this model to the overall model (with learning-rate multiplication)
  coef[best] <- coef[best] + rate * unname(coef(lm(yres ~ 0 + X[, best])))
  # Calculate the new "gradient" (residuals of the total model)
  yhat <- as.vector(coef %*% t(X))
  yres <- y - yhat
  # Calculate delta (previous total loss minus current total loss)
  delta <- loss_tot - as.numeric(lossfunc(y, yhat))
  loss_tot <- as.numeric(lossfunc(y, yhat))
  lossplot[j] <- loss_tot # for plotting
  cat("Iteration:", j, " loss:", loss_tot, "\n") # verbose
  # End iteration
  j <- j + 1
}
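# (Not in the original gist:) report how many boosting iterations were actually
# run before the stopping rule kicked in (j is incremented once per iteration)
cat("Stopped after", j - 1, "iterations\n")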
plot(lossplot, type = "l") # visualise the loss per boosting iteration
# Is boosting better than OLS regression? ---------------------------------
# Squared deviation of the linmod coefficients from the true coefficients
linmod <- lm(y ~ X[, -1]) # OLS regression on all 49 predictors (plus intercept)
lossfunc(truecoef, coef(linmod))
# absolute deviation
sum(abs(truecoef-coef(linmod)))
# Squared deviation of the boosting coefficients from the true coefficients
lossfunc(truecoef, coef)
# absolute deviation
sum(abs(truecoef-coef))
# Boosting recovers the true coefficients better on this particular dataset.
# Also interesting: the boosting coefficient vector is sparse (some entries
# remain exactly 0), so boosting performs implicit variable selection.
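# (Not in the original gist:) inspect the sparsity claim directly by counting
# how many of the 50 boosting coefficients were never selected and remain 0
sum(coef == 0)
which(coef == 0)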