oliviergimenez/grouped_lasso.R

## grouped_lasso.R
# illustrates grouped lasso with simulations
# with normal and non-normal responses
# and a mix and discrete/continuous predictors

#----- normal response

library(gglasso)
create_factor <- function(nb_lvl, n = 100 ){
  factor(sample(letters[1:nb_lvl],n, replace = TRUE))}

df <- data.frame(var1 = create_factor(5),
                 var2 = create_factor(5),
                 var3 = create_factor(5),
                 var4 = create_factor(5),
                 var5 = rnorm(100),
                 y = rnorm(100))

# no effect
y <- df$y
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)

# effect of continuous var5
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
y <- 1 + x[,'var5'] + df$y
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)

# effect of discrete var1
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
y <- 1 + x[,1:4] %*% c(20,20,20,20) + df$y
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)

# effect of both var1 and var5
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
y <- 1 + x[,1:4] %*% c(2,2,2,2) + x[,'var5'] + df$y
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)

# pick optimal lambda
opt <- cv.gglasso(x = x, y = y, group = groups, nfolds = 10)
plot(opt)
plot(fit)
abline(v = log(opt$lambda.min), lty = 2)
abline(v = log(opt$lambda.1se), lty = 2)
legend("bottomright",
       lwd = 1,
       col = rainbow(5 + 1, start = 0.7, end = 0.95)[1:5],
       legend = paste0("var", 1:5),
       cex = .7)

# R2
y_hat <- cbind(1, x) %*% c(1, gglasso(x = x, y = y, group = groups, lambda = opt$lambda.min)$beta)
(rsq <- cor(y, y_hat)^2)


#--- idem for Poisson regression


create_factor <- function(nb_lvl, n = 100 ){
  factor(sample(letters[1:nb_lvl],n, replace = TRUE))}

df <- data.frame(var1 = create_factor(5),
                 var2 = create_factor(5),
                 var3 = create_factor(5),
                 var4 = create_factor(5),
                 var5 = rnorm(mean = 1, sd = 1, n = 100),
                 y = rpois(100, 1))

# no effect
y <- df$y
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)

# effect of continuous var5
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
y <- rpois(100, exp(1 + x[,'var5']))
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)

# effect of discrete var1
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
rate <- 1 + x[,1:4] %*% c(20,20,20,20)
y <- rpois(100, exp(rate))
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)

# effect of both var1 and var5
x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
rate <- 1 + x[,1:4] %*% c(2,2,2,2) + x[,'var5']
y <- rpois(100, exp(rate))
groups <- c(rep(1:4, each = 4), 5)
fit <- gglasso(x = x, y = y, group = groups)
plot(fit)

# pick optimal lambda
opt <- cv.gglasso(x = x,
                  y = y,
                  group = groups,
                  nfolds = 20)
plot(opt)
fit <- gglasso(x = x,
               y = y,
               group = groups)
plot(fit)
abline(v = log(opt$lambda.min), lty = 2)
abline(v = log(opt$lambda.1se), lty = 2)
legend("bottomright",
       lwd = 1,
       col = rainbow(5 + 1, start = 0.7, end = 0.95)[1:5],
       legend = paste0("var", 1:5),
       cex = .7)

# seems to work, despite assuming response variable is normal

# use appropriate poisson family for response

library(grpreg)
opt <- cv.grpreg(X = x,
              y = y,
              group = groups,
              penalty = "grLasso",
              nfolds = 20,
              se = "bootstrap")
opt$lambda.min

fit <- grpreg(X = x,
              y = y,
              group = groups,
              penalty = "grLasso")

# by group
plot(x = fit,
     log.l = TRUE, # lambda on log scale if TRUE
     alpha = 1, # transparency
     norm = TRUE, # by group if TRUE, ind coefficients otherwise
     legend.loc = "topleft")
abline(v = log(opt$lambda.min), lty = 2)

# by individual coefficients
plot(x = fit,
     log.l = TRUE, # lambda on log scale if TRUE
     alpha = 1, # transparency
     norm = FALSE, # by group if TRUE, ind coefficients otherwise
     legend.loc = "topleft")
abline(v = log(opt$lambda.min), lty = 2)
	# illustrates grouped lasso with simulations
	# with normal and non-normal responses
	# and a mix and discrete/continuous predictors

	#----- normal response

	library(gglasso)
	create_factor <- function(nb_lvl, n = 100 ){
	factor(sample(letters[1:nb_lvl],n, replace = TRUE))}

	df <- data.frame(var1 = create_factor(5),
	var2 = create_factor(5),
	var3 = create_factor(5),
	var4 = create_factor(5),
	var5 = rnorm(100),
	y = rnorm(100))

	# no effect
	y <- df$y
	x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
	groups <- c(rep(1:4, each = 4), 5)
	fit <- gglasso(x = x, y = y, group = groups)
	plot(fit)

	# effect of continuous var5
	x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
	y <- 1 + x[,'var5'] + df$y
	groups <- c(rep(1:4, each = 4), 5)
	fit <- gglasso(x = x, y = y, group = groups)
	plot(fit)

	# effect of discrete var1
	x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
	y <- 1 + x[,1:4] %*% c(20,20,20,20) + df$y
	groups <- c(rep(1:4, each = 4), 5)
	fit <- gglasso(x = x, y = y, group = groups)
	plot(fit)

	# effect of both var1 and var5
	x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
	y <- 1 + x[,1:4] %*% c(2,2,2,2) + x[,'var5'] + df$y
	groups <- c(rep(1:4, each = 4), 5)
	fit <- gglasso(x = x, y = y, group = groups)
	plot(fit)

	# pick optimal lambda
	opt <- cv.gglasso(x = x, y = y, group = groups, nfolds = 10)
	plot(opt)
	plot(fit)
	abline(v = log(opt$lambda.min), lty = 2)
	abline(v = log(opt$lambda.1se), lty = 2)
	legend("bottomright",
	lwd = 1,
	col = rainbow(5 + 1, start = 0.7, end = 0.95)[1:5],
	legend = paste0("var", 1:5),
	cex = .7)

	# R2
	y_hat <- cbind(1, x) %*% c(1, gglasso(x = x, y = y, group = groups, lambda = opt$lambda.min)$beta)
	(rsq <- cor(y, y_hat)^2)



	#--- idem for Poisson regression


	create_factor <- function(nb_lvl, n = 100 ){
	factor(sample(letters[1:nb_lvl],n, replace = TRUE))}

	df <- data.frame(var1 = create_factor(5),
	var2 = create_factor(5),
	var3 = create_factor(5),
	var4 = create_factor(5),
	var5 = rnorm(mean = 1, sd = 1, n = 100),
	y = rpois(100, 1))

	# no effect
	y <- df$y
	x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
	groups <- c(rep(1:4, each = 4), 5)
	fit <- gglasso(x = x, y = y, group = groups)
	plot(fit)

	# effect of continuous var5
	x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
	y <- rpois(100, exp(1 + x[,'var5']))
	groups <- c(rep(1:4, each = 4), 5)
	fit <- gglasso(x = x, y = y, group = groups)
	plot(fit)

	# effect of discrete var1
	x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
	rate <- 1 + x[,1:4] %*% c(20,20,20,20)
	y <- rpois(100, exp(rate))
	groups <- c(rep(1:4, each = 4), 5)
	fit <- gglasso(x = x, y = y, group = groups)
	plot(fit)

	# effect of both var1 and var5
	x <- model.matrix( ~ ., dplyr::select(df, -y))[, -1]
	rate <- 1 + x[,1:4] %*% c(2,2,2,2) + x[,'var5']
	y <- rpois(100, exp(rate))
	groups <- c(rep(1:4, each = 4), 5)
	fit <- gglasso(x = x, y = y, group = groups)
	plot(fit)

	# pick optimal lambda
	opt <- cv.gglasso(x = x,
	y = y,
	group = groups,
	nfolds = 20)
	plot(opt)
	fit <- gglasso(x = x,
	y = y,
	group = groups)
	plot(fit)
	abline(v = log(opt$lambda.min), lty = 2)
	abline(v = log(opt$lambda.1se), lty = 2)
	legend("bottomright",
	lwd = 1,
	col = rainbow(5 + 1, start = 0.7, end = 0.95)[1:5],
	legend = paste0("var", 1:5),
	cex = .7)

	# seems to work, despite assuming response variable is normal

	# use appropriate poisson family for response

	library(grpreg)
	opt <- cv.grpreg(X = x,
	y = y,
	group = groups,
	penalty = "grLasso",
	nfolds = 20,
	se = "bootstrap")
	opt$lambda.min

	fit <- grpreg(X = x,
	y = y,
	group = groups,
	penalty = "grLasso")

	# by group
	plot(x = fit,
	log.l = TRUE, # lambda on log scale if TRUE
	alpha = 1, # transparency
	norm = TRUE, # by group if TRUE, ind coefficients otherwise
	legend.loc = "topleft")
	abline(v = log(opt$lambda.min), lty = 2)

	# by individual coefficients
	plot(x = fit,
	log.l = TRUE, # lambda on log scale if TRUE
	alpha = 1, # transparency
	norm = FALSE, # by group if TRUE, ind coefficients otherwise
	legend.loc = "topleft")
	abline(v = log(opt$lambda.min), lty = 2)