Created
December 29, 2020 08:41
-
-
Save oliviergimenez/cbece2f0d598fc51cbe00d35f31a085a to your computer and use it in GitHub Desktop.
Grouped lasso with both discrete/continuous predictors and normal/non-normal response
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# illustrates grouped lasso with simulations | |
# with normal and non-normal responses | |
# and a mix and discrete/continuous predictors | |
#----- normal response | |
library(gglasso) | |
create_factor <- function(nb_lvl, n = 100 ){ | |
factor(sample(letters[1:nb_lvl],n, replace = TRUE))} | |
df <- data.frame(var1 = create_factor(5), | |
var2 = create_factor(5), | |
var3 = create_factor(5), | |
var4 = create_factor(5), | |
var5 = rnorm(100), | |
y = rnorm(100)) | |
# no effect | |
y <- df$y | |
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1] | |
groups <- c(rep(1:4, each = 4), 5) | |
fit <- gglasso(x = x, y = y, group = groups) | |
plot(fit) | |
# effect of continuous var5 | |
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1] | |
y <- 1 + x[,'var5'] + df$y | |
groups <- c(rep(1:4, each = 4), 5) | |
fit <- gglasso(x = x, y = y, group = groups) | |
plot(fit) | |
# effect of discrete var1 | |
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1] | |
y <- 1 + x[,1:4] %*% c(20,20,20,20) + df$y | |
groups <- c(rep(1:4, each = 4), 5) | |
fit <- gglasso(x = x, y = y, group = groups) | |
plot(fit) | |
# effect of both var1 and var5 | |
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1] | |
y <- 1 + x[,1:4] %*% c(2,2,2,2) + x[,'var5'] + df$y | |
groups <- c(rep(1:4, each = 4), 5) | |
fit <- gglasso(x = x, y = y, group = groups) | |
plot(fit) | |
# pick optimal lambda | |
opt <- cv.gglasso(x = x, y = y, group = groups, nfolds = 10) | |
plot(opt) | |
plot(fit) | |
abline(v = log(opt$lambda.min), lty = 2) | |
abline(v = log(opt$lambda.1se), lty = 2) | |
legend("bottomright", | |
lwd = 1, | |
col = rainbow(5 + 1, start = 0.7, end = 0.95)[1:5], | |
legend = paste0("var", 1:5), | |
cex = .7) | |
# R2 | |
y_hat <- cbind(1, x) %*% c(1, gglasso(x = x, y = y, group = groups, lambda = opt$lambda.min)$beta) | |
(rsq <- cor(y, y_hat)^2) | |
#--- idem for Poisson regression | |
create_factor <- function(nb_lvl, n = 100 ){ | |
factor(sample(letters[1:nb_lvl],n, replace = TRUE))} | |
df <- data.frame(var1 = create_factor(5), | |
var2 = create_factor(5), | |
var3 = create_factor(5), | |
var4 = create_factor(5), | |
var5 = rnorm(mean = 1, sd = 1, n = 100), | |
y = rpois(100, 1)) | |
# no effect | |
y <- df$y | |
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1] | |
groups <- c(rep(1:4, each = 4), 5) | |
fit <- gglasso(x = x, y = y, group = groups) | |
plot(fit) | |
# effect of continuous var5 | |
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1] | |
y <- rpois(100, exp(1 + x[,'var5'])) | |
groups <- c(rep(1:4, each = 4), 5) | |
fit <- gglasso(x = x, y = y, group = groups) | |
plot(fit) | |
# effect of discrete var1 | |
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1] | |
rate <- 1 + x[,1:4] %*% c(20,20,20,20) | |
y <- rpois(100, exp(rate)) | |
groups <- c(rep(1:4, each = 4), 5) | |
fit <- gglasso(x = x, y = y, group = groups) | |
plot(fit) | |
# effect of both var1 and var5 | |
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1] | |
rate <- 1 + x[,1:4] %*% c(2,2,2,2) + x[,'var5'] | |
y <- rpois(100, exp(rate)) | |
groups <- c(rep(1:4, each = 4), 5) | |
fit <- gglasso(x = x, y = y, group = groups) | |
plot(fit) | |
# pick optimal lambda | |
opt <- cv.gglasso(x = x, | |
y = y, | |
group = groups, | |
nfolds = 20) | |
plot(opt) | |
fit <- gglasso(x = x, | |
y = y, | |
group = groups) | |
plot(fit) | |
abline(v = log(opt$lambda.min), lty = 2) | |
abline(v = log(opt$lambda.1se), lty = 2) | |
legend("bottomright", | |
lwd = 1, | |
col = rainbow(5 + 1, start = 0.7, end = 0.95)[1:5], | |
legend = paste0("var", 1:5), | |
cex = .7) | |
# seems to work, despite assuming response variable is normal | |
# use appropriate poisson family for response | |
library(grpreg) | |
opt <- cv.grpreg(X = x, | |
y = y, | |
group = groups, | |
penalty = "grLasso", | |
nfolds = 20, | |
se = "bootstrap") | |
opt$lambda.min | |
fit <- grpreg(X = x, | |
y = y, | |
group = groups, | |
penalty = "grLasso") | |
# by group | |
plot(x = fit, | |
log.l = TRUE, # lambda on log scale if TRUE | |
alpha = 1, # transparency | |
norm = TRUE, # by group if TRUE, ind coefficients otherwise | |
legend.loc = "topleft") | |
abline(v = log(opt$lambda.min), lty = 2) | |
# by individual coefficients | |
plot(x = fit, | |
log.l = TRUE, # lambda on log scale if TRUE | |
alpha = 1, # transparency | |
norm = FALSE, # by group if TRUE, ind coefficients otherwise | |
legend.loc = "topleft") | |
abline(v = log(opt$lambda.min), lty = 2) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment