Skip to content

Instantly share code, notes, and snippets.

@oliviergimenez
Created December 29, 2020 08:41
Show Gist options
  • Save oliviergimenez/cbece2f0d598fc51cbe00d35f31a085a to your computer and use it in GitHub Desktop.
Save oliviergimenez/cbece2f0d598fc51cbe00d35f31a085a to your computer and use it in GitHub Desktop.
Grouped lasso with both discrete/continuous predictors and normal/non-normal response
# illustrates grouped lasso with simulations
# with normal and non-normal responses
# and a mix and discrete/continuous predictors
#----- normal response
library(gglasso)
create_factor <- function(nb_lvl, n = 100 ){
factor(sample(letters[1:nb_lvl],n, replace = TRUE))}
df <- data.frame(var1 = create_factor(5),
var2 = create_factor(5),
var3 = create_factor(5),
var4 = create_factor(5),
var5 = rnorm(100),
y = rnorm(100))
# no effect
y <- df$y
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)
# effect of continuous var5
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
y <- 1 + x[,'var5'] + df$y
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)
# effect of discrete var1
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
y <- 1 + x[,1:4] %*% c(20,20,20,20) + df$y
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)
# effect of both var1 and var5
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
y <- 1 + x[,1:4] %*% c(2,2,2,2) + x[,'var5'] + df$y
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)
# pick optimal lambda
opt <- cv.gglasso(x = x, y = y, group = groups, nfolds = 10)
plot(opt)
plot(fit)
abline(v = log(opt$lambda.min), lty = 2)
abline(v = log(opt$lambda.1se), lty = 2)
legend("bottomright",
lwd = 1,
col = rainbow(5 + 1, start = 0.7, end = 0.95)[1:5],
legend = paste0("var", 1:5),
cex = .7)
# R2
y_hat <- cbind(1, x) %*% c(1, gglasso(x = x, y = y, group = groups, lambda = opt$lambda.min)$beta)
(rsq <- cor(y, y_hat)^2)
#--- idem for Poisson regression
create_factor <- function(nb_lvl, n = 100 ){
factor(sample(letters[1:nb_lvl],n, replace = TRUE))}
df <- data.frame(var1 = create_factor(5),
var2 = create_factor(5),
var3 = create_factor(5),
var4 = create_factor(5),
var5 = rnorm(mean = 1, sd = 1, n = 100),
y = rpois(100, 1))
# no effect
y <- df$y
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)
# effect of continuous var5
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
y <- rpois(100, exp(1 + x[,'var5']))
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)
# effect of discrete var1
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
rate <- 1 + x[,1:4] %*% c(20,20,20,20)
y <- rpois(100, exp(rate))
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)
# effect of both var1 and var5
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
rate <- 1 + x[,1:4] %*% c(2,2,2,2) + x[,'var5']
y <- rpois(100, exp(rate))
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)
# pick optimal lambda
opt <- cv.gglasso(x = x,
y = y,
group = groups,
nfolds = 20)
plot(opt)
fit <- gglasso(x = x,
y = y,
group = groups)
plot(fit)
abline(v = log(opt$lambda.min), lty = 2)
abline(v = log(opt$lambda.1se), lty = 2)
legend("bottomright",
lwd = 1,
col = rainbow(5 + 1, start = 0.7, end = 0.95)[1:5],
legend = paste0("var", 1:5),
cex = .7)
# seems to work, despite assuming response variable is normal
# use appropriate poisson family for response
library(grpreg)
opt <- cv.grpreg(X = x,
y = y,
group = groups,
penalty = "grLasso",
nfolds = 20,
se = "bootstrap")
opt$lambda.min
fit <- grpreg(X = x,
y = y,
group = groups,
penalty = "grLasso")
# by group
plot(x = fit,
log.l = TRUE, # lambda on log scale if TRUE
alpha = 1, # transparency
norm = TRUE, # by group if TRUE, ind coefficients otherwise
legend.loc = "topleft")
abline(v = log(opt$lambda.min), lty = 2)
# by individual coefficients
plot(x = fit,
log.l = TRUE, # lambda on log scale if TRUE
alpha = 1, # transparency
norm = FALSE, # by group if TRUE, ind coefficients otherwise
legend.loc = "topleft")
abline(v = log(opt$lambda.min), lty = 2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment