Created
July 7, 2015 16:38
-
-
Save DexGroves/7defa58c90d48b3c1168 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
contrast_matrix <- function(object, data, ...) { | |
# Make a model matrix with one column per factor level, rather than base hacks | |
object_vars <- all.vars(object) | |
factor_vars <- colnames(data)[sapply(data, is.factor) & | |
colnames(data) %in% object_vars] | |
model.matrix(object = object, | |
..., | |
data = data, | |
contrasts.arg = lapply(data[, factor_vars], | |
contrasts, | |
contrasts = FALSE)) | |
} | |
generate_data <- function(N) { | |
# Make some fake, noisy data with a few factors | |
dt <- data.table(factor_a = as.factor(sample(letters[1:4], N, TRUE)), | |
factor_b = as.factor(sample(LETTERS[1:4], N, TRUE)), | |
num_a = runif(N), | |
num_b = runif(N), | |
noise_a = rnorm(N), | |
noise_b = rnorm(N), | |
noise_factor = as.factor(sample(letters[20:26], N, TRUE))) | |
# Make a/A levels sparser | |
dt[runif(N) > 0.1 & factor_a == "a", factor_a := "d"] | |
dt[runif(N) > 0.1 & factor_b == "A", factor_b := "D"] | |
dt[, response := 1 * as.numeric(factor_a) + | |
2 * as.numeric(factor_b) + | |
0.25 * num_a + | |
0.5 * num_b + | |
rnorm(N) * 7] | |
data.frame(dt) | |
} | |
mse <- function(y, u) { | |
# Mean squared error | |
sqrt(sum((y - u)^2) / length(y)) | |
} | |
library("glmnet") | |
library("data.table") | |
library("doMC") | |
library("foreach") | |
seeds <- sample(10000:99999, 10000) | |
registerDoMC(20) | |
results <- foreach(seed = seeds, .combine = rbind) %dopar% { | |
# Data synthesis ============================================================= | |
N <- 5000 | |
nfolds <- 10 | |
training <- generate_data(N) | |
holdout <- generate_data(50000) | |
folds <- sample(1:nfolds, N, TRUE) | |
m_formula <- response ~ factor_a + factor_b + | |
num_a + num_b + | |
noise_a + noise_b + | |
noise_factor | |
train_mm <- model.matrix(m_formula, training) | |
hold_mm <- model.matrix(m_formula, holdout) | |
train_cm <- contrast_matrix(m_formula, training) | |
hold_cm <- contrast_matrix(m_formula, holdout) | |
# Models ===================================================================== | |
# Rolled version | |
g_mm <- cv.glmnet(train_mm, training$response, foldid = folds, | |
parallel = FALSE) | |
# Unrolled version | |
g_cm <- cv.glmnet(train_cm, training$response, foldid = folds, | |
parallel = FALSE) | |
# Predict | |
p_mm <- predict(g_mm, newx = hold_mm, s = "lambda.min") | |
p_cm <- predict(g_cm, newx = hold_cm, s = "lambda.min") | |
c(mm = mse(holdout$response, p_mm), cm = mse(holdout$response, p_cm)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment