# Gist by @DexGroves, created July 7, 2015
contrast_matrix <- function(object, data, ...) {
  # Build a model matrix with one column per factor level, rather than the
  # default treatment contrasts that fold a reference level into the intercept
  object_vars <- all.vars(object)
  factor_vars <- colnames(data)[sapply(data, is.factor) &
                                  colnames(data) %in% object_vars]
  model.matrix(object = object,
               ...,
               data = data,
               # drop = FALSE keeps a data.frame even if only one factor matches
               contrasts.arg = lapply(data[, factor_vars, drop = FALSE],
                                      contrasts,
                                      contrasts = FALSE))
}
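# Illustrative sketch (added, not in the original gist): on a toy data.frame,
# model.matrix() drops a reference level per factor, while contrast_matrix()
# keeps a column for every level, letting glmnet penalize all levels symmetrically.
# toy <- data.frame(y = rnorm(6), f = factor(rep(c("a", "b", "c"), 2)))
# colnames(model.matrix(y ~ f, toy))     # "(Intercept)" "fb" "fc"
# colnames(contrast_matrix(y ~ f, toy))  # "(Intercept)" "fa" "fb" "fc"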
generate_data <- function(N) {
  # Make some fake, noisy data with a few factors
  dt <- data.table(factor_a = as.factor(sample(letters[1:4], N, TRUE)),
                   factor_b = as.factor(sample(LETTERS[1:4], N, TRUE)),
                   num_a = runif(N),
                   num_b = runif(N),
                   noise_a = rnorm(N),
                   noise_b = rnorm(N),
                   noise_factor = as.factor(sample(letters[20:26], N, TRUE)))

  # Make the "a"/"A" levels sparser: reassign roughly 90% of them to "d"/"D"
  dt[runif(N) > 0.1 & factor_a == "a", factor_a := "d"]
  dt[runif(N) > 0.1 & factor_b == "A", factor_b := "D"]

  dt[, response := 1 * as.numeric(factor_a) +
       2 * as.numeric(factor_b) +
       0.25 * num_a +
       0.5 * num_b +
       rnorm(N) * 7]

  data.frame(dt)
}
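# Quick look (added sketch, not in the original gist; needs data.table loaded):
# d <- generate_data(1000)
# str(d)
# table(d$factor_a)  # "a" should be much rarer than "b"/"c" after the reassignment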
mse <- function(y, u) {
  # Root mean squared error (RMSE), despite the name
  sqrt(sum((y - u)^2) / length(y))
}
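# Worked check (added): for y = c(0, 0, 0) and u = c(3, 0, 0) the squared errors
# sum to 9, so mse() returns sqrt(9 / 3) = sqrt(3), about 1.73.
# mse(c(0, 0, 0), c(3, 0, 0))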
library("glmnet")
library("data.table")
library("doMC")
library("foreach")
seeds <- sample(10000:99999, 10000)
registerDoMC(20)
results <- foreach(seed = seeds, .combine = rbind) %dopar% {
  # Seed each replicate so runs are distinct and reproducible across workers
  set.seed(seed)

  # Data synthesis =============================================================
  N <- 5000
  nfolds <- 10

  training <- generate_data(N)
  holdout <- generate_data(50000)
  folds <- sample(1:nfolds, N, TRUE)

  m_formula <- response ~ factor_a + factor_b +
    num_a + num_b +
    noise_a + noise_b +
    noise_factor

  train_mm <- model.matrix(m_formula, training)
  hold_mm <- model.matrix(m_formula, holdout)
  train_cm <- contrast_matrix(m_formula, training)
  hold_cm <- contrast_matrix(m_formula, holdout)

  # Models =====================================================================
  # "Rolled" version: default treatment contrasts, reference level in the intercept
  g_mm <- cv.glmnet(train_mm, training$response, foldid = folds,
                    parallel = FALSE)

  # "Unrolled" version: one column per factor level
  g_cm <- cv.glmnet(train_cm, training$response, foldid = folds,
                    parallel = FALSE)

  # Predict on the holdout set at the CV-optimal lambda
  p_mm <- predict(g_mm, newx = hold_mm, s = "lambda.min")
  p_cm <- predict(g_cm, newx = hold_cm, s = "lambda.min")

  c(mm = mse(holdout$response, p_mm), cm = mse(holdout$response, p_cm))
}
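# Added summary sketch (not part of the original gist): average holdout RMSE per
# encoding across all seeds, and the share of seeds where the unrolled
# (one-column-per-level) matrix does better. The "mm"/"cm" column names come
# from the loop above.
colMeans(results)
mean(results[, "cm"] < results[, "mm"])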