boosting.r
Small boosting demonstration for simple linear regression
# Little boosting algorithm demonstration
# Base learner: simple OLS regression with 1 predictor

# Preamble ----------------------------------------------------------------
rm(list = ls())
set.seed(3665364)

# Loss function: l2 loss (sum of squared errors)
lossfunc <- function(y, yhat){
  sum((y - yhat)^2)
}
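# Example: lossfunc(c(1, 2, 3), c(1, 2, 4)) gives (3 - 4)^2 = 1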
# Generate dataset --------------------------------------------------------
# Predictors: 100 observations of 49 slightly (0.1) correlated variables,
# plus an intercept column. True coefficients: a random ordering of ten
# copies each of -0.3, 0.5, 0.1, 0.9 and 0 (50 in total). Error: normal
# with mean 0 and sd 3.
library(mvtnorm)
sigma <- matrix(data = 0.1, nrow = 49, ncol = 49)
diag(sigma) <- 1
X <- cbind(rep(1, 100), rmvnorm(100, sigma = sigma))
truecoef <- sample(rep(c(-0.3, 0.5, 0.1, 0.9, 0), 10))
y <- as.vector(truecoef %*% t(X) + rnorm(100, sd = 3))

# GOAL: predict y with X
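# Optional sanity checks on the simulated data:
# dim(X)           # 100 x 50: intercept column plus 49 predictors
# table(truecoef)  # ten copies each of -0.3, 0, 0.1, 0.5, 0.9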
# Boosting ----------------------------------------------------------------
# Algorithm hyperparameters
rate <- 0.1         # learning rate; protects against overfitting
tolerance <- 0.0001 # sequential loss delta required to stop the algorithm
maxiter <- 1000     # upper bound on the number of iterations
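# Note: a smaller learning rate generally needs more iterations to reach the
# same training loss, so rate and maxiter trade off against each other.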
# Initialisation stage (first iteration of algorithm with only intercept)
coef <- numeric(ncol(X))  # the 50 coefficients we estimate, initialised at 0
lossplot <- numeric(0)    # stores the loss per iteration for plotting
fit <- lm(y ~ 1)          # fit intercept-only model
coef[1] <- unname(coef(fit))  # save coefficient
yres <- y - predict(fit)  # residuals (equivalent to the negative gradient
                          # associated with l2 loss)
loss_tot <- lossfunc(y, predict(fit))  # loss of the whole model
delta <- tolerance + 1  # initialise delta to something larger than tolerance
j <- 1                  # iteration counter
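# At this point yres is simply y - mean(y): the intercept-only OLS fit is
# the sample mean of y.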
# Start the algorithm
while (delta > tolerance && j < maxiter){
  # Fit each of the 50 candidate simple models to the current residuals and
  # calculate its loss
  loss <- numeric(ncol(X))
  for (i in 1:ncol(X)){
    loss[i] <- lossfunc(yres, predict(lm(yres ~ 0 + X[,i])))
  }
  # Select the model with the lowest loss
  best <- which.min(loss)
  # Add this model to the overall model (with learning-rate multiplication)
  coef[best] <- coef[best] + rate*unname(coef(lm(yres ~ 0 + X[,best])))
  # Calculate the new "gradient" (residuals of the total model)
  yhat <- as.vector(coef %*% t(X))
  yres <- y - yhat
  # Calculate delta (previous total loss minus current total loss)
  delta <- loss_tot - lossfunc(y, yhat)
  loss_tot <- lossfunc(y, yhat)
  lossplot[j] <- loss_tot  # for plotting
  # End iteration
  cat("Iteration:", j, " loss:", loss_tot, "\n")  # verbose
  j <- j + 1
}
plot(lossplot, type = "l", xlab = "Iteration", ylab = "l2 loss")
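# Optional: which(coef == 0) shows the predictors the boosting run never
# selected.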
# Is boosting better than ols regression? ---------------------------------
# Squared deviation of the OLS coefficients from the true coefficients
linmod <- lm(y ~ X[,-1])  # OLS regression on all 49 predictors
lossfunc(truecoef, coef(linmod))
# Absolute deviation
sum(abs(truecoef - coef(linmod)))
# Squared deviation of the boosting coefficients from the true coefficients
lossfunc(truecoef, coef)
# Absolute deviation
sum(abs(truecoef - coef))
# Boosting performs better on this particular dataset. Interesting: the
# boosting coefficients are sparse (some are exactly 0), so boosting also
# performs a kind of variable selection.
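# A possible extension (sketch): compare out-of-sample prediction error on a
# fresh test set drawn from the same data-generating process. Assumes sigma,
# truecoef, coef and linmod from above are still in the workspace.
Xtest <- cbind(rep(1, 100), rmvnorm(100, sigma = sigma))
ytest <- as.vector(Xtest %*% truecoef) + rnorm(100, sd = 3)
lossfunc(ytest, as.vector(Xtest %*% coef))          # test-set loss, boosting
lossfunc(ytest, as.vector(Xtest %*% coef(linmod)))  # test-set loss, OLS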