library(lme4)
library(recipes)
library(tidyverse)
library(rsample)
library(caret)
library(tidyposterior)
library(nnet)
soup_llhencode <- function(
  recipe, ...,
  role = NA,
  trained = FALSE,
  use_partial_pooling = FALSE,
  skip = FALSE
) {
  terms <- rlang::quos(...)
  if (length(terms) == 0)
    stop("Please supply at least one variable specification. See ?selections.")
  add_step(
    recipe,
    soup_llhencode_new(
      terms = terms,
      trained = trained,
      role = role,
      use_partial_pooling = use_partial_pooling,
      skip = skip))
}
soup_llhencode_new <- function(
  terms = NULL,
  role = NA,
  trained = FALSE,
  use_partial_pooling = FALSE,
  skip = FALSE
) {
  step(
    subclass = "llhencode",
    terms = terms,
    role = role,
    trained = trained,
    use_partial_pooling = use_partial_pooling,
    skip = skip
  )
}
# brittle! assumes outcome and predictor are single bare column names
vars_to_formula <- function(outcome, predictor, random_intercept = FALSE) {
  stopifnot(is.character(outcome))
  stopifnot(is.character(predictor))
  if (random_intercept) predictor <- paste0("(1|", predictor, ")")
  as.formula(paste(outcome, "~", predictor))
}

vars_to_formula("Sepal.Length", "Species", TRUE)
# kludge together K - 1 binomial mixed regressions: a poor man's multinomial GLMM
multinomer <- function(outcome_name, predictor_name, data) {
  outcome_form <- reformulate(termlabels = outcome_name)
  outcome_matrix <- model.matrix(outcome_form, data)[, -1] # drop reference level
  data <- cbind(data, outcome_matrix)
  models <- list()
  for (out_name in colnames(outcome_matrix)) {
    # regress each dummy-coded outcome column on the predictor
    formula <- vars_to_formula(out_name, predictor_name, random_intercept = TRUE)
    models[[out_name]] <- glmer(formula, data, family = binomial)
  }
  class(models) <- "multinomer"
  models
}
predict.multinomer <- function(object, newdata, ...) {
  # note: glmer predictions come back on the link (log-odds) scale
  codes <- map_dfc(object, predict, newdata, allow.new.levels = TRUE)
  colnames(codes) <- names(object)
  as_tibble(codes)
}
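# minimal usage sketch on simulated data ("toy", "group", "y" are made-up
# names; expect near-zero random-effect variances on pure noise like this)
set.seed(1)
toy <- data.frame(
  group = factor(sample(letters[1:5], 200, replace = TRUE)),
  y = factor(sample(c("low", "mid", "high"), 200, replace = TRUE))
)
toy_fits <- multinomer("y", "group", toy)
predict(toy_fits, toy) # one log-odds column per non-reference level of y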
# no data leakage protection yet!
prep.step_llhencode <- function(x, training, info = NULL, ...) {
  x$predictor_names <- terms_select(terms = x$terms, info = info)
  # is there a better way to do this?
  outcome_name <- filter(info, role == "outcome")$variable
  # only deal with univariate outcomes
  stopifnot(length(outcome_name) == 1)
  outcome <- training[[outcome_name]]
  regression <- is.numeric(outcome)
  pool <- x$use_partial_pooling
  # grand mean is the fallback encoding for unseen levels (regression only)
  if (regression) x$mean_outcome <- mean(outcome)
  for (pred_name in x$predictor_names) {
    predictor <- training[[pred_name]]
    # store observed levels per predictor to prevent unobserved factor errors
    # at bake time: think levels(head(iris$Species))
    x$observed_levels[[pred_name]] <- levels(droplevels(predictor))
    formula <- vars_to_formula(outcome_name, pred_name, random_intercept = pool)
    if (regression && pool) {
      x$encoders[[pred_name]] <- lmer(formula, training)
    } else if (regression) {
      x$encoders[[pred_name]] <- lm(formula, training)
    } else if (!pool) {
      # classification without pooling: multinomial logistic regression
      x$encoders[[pred_name]] <- multinom(formula, training)
    } else {
      # classification with pooling: K - 1 binomial mixed models
      x$encoders[[pred_name]] <- multinomer(outcome_name, pred_name, training)
    }
  }
  x$trained <- TRUE
  x
}
# bake doesn't handle categorical outcomes yet!
bake.step_llhencode <- function(object, newdata, ...) {
  for (pred_name in object$predictor_names) {
    encoder <- object$encoders[[pred_name]]
    if (object$use_partial_pooling) {
      codes <- predict(encoder, newdata, allow.new.levels = TRUE)
    } else {
      # safe prediction when there are unseen categories in the test data:
      # new categories are encoded to the outcome grand mean, which is
      # consistent with the lmer behavior above
      observed <- object$observed_levels[[pred_name]]
      new_level_idx <- which(!(newdata[[pred_name]] %in% observed))
      newdata[[pred_name]][new_level_idx] <- NA
      codes <- predict(encoder, newdata)
      codes[is.na(codes)] <- object$mean_outcome
    }
    newdata[[pred_name]] <- codes
  }
  as_tibble(newdata)
}
print.step_llhencode <- function(x, ...) {
  cat("Likelihood encoding. Better print method on the way.\n")
  invisible(x)
}
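# a fuller print method might look something like this (a sketch using plain
# S3 conventions, not modeled on any recipes internals):
# print.step_llhencode <- function(x, ...) {
#   vars <- if (x$trained) names(x$encoders) else purrr::map_chr(x$terms, rlang::quo_text)
#   cat("Likelihood encoding for", paste(vars, collapse = ", "))
#   if (x$trained) cat(" [trained]")
#   cat("\n")
#   invisible(x)
# }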
## sanity check that it works
train <- iris[1:100, ]
test <- iris[101:150, ]

llhencode <- recipe(Sepal.Length ~ Species + Sepal.Width, train) %>%
  soup_llhencode(Species, use_partial_pooling = FALSE)

prepped <- prep(llhencode, train)
baked <- bake(prepped, test)

llhencode_partial <- recipe(Sepal.Length ~ Species + Sepal.Width, train) %>%
  soup_llhencode(Species, use_partial_pooling = TRUE)

prepped_partial <- prep(llhencode_partial, train)
baked_partial <- bake(prepped_partial, test)
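# quick eyeball: Species should now be a numeric column of encoded values;
# since virginica never appears in train, every test row here falls back to
# the population-level / grand-mean encoding
head(baked)
head(baked_partial)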
# tests to write (see the testthat sketch below for the first one):
# - pooled and unpooled encodings give different results
# - missing input data is handled the same way in both paths
# - input data is of an appropriate type
####### START TO COMPARE PARTIAL POOLING VS NO PARTIAL POOLING

### SET UP THE DATA
set.seed(27)

small <- okc %>%
  na.omit() %>%
  sample_frac(0.01) %>%
  mutate(diet = as.factor(diet))

base <- recipe(age ~ diet + height, small) %>%
  step_intercept()

dummy <- base %>%
  step_dummy(diet)

no_pooling <- base %>%
  soup_llhencode(diet)

partial <- base %>%
  soup_llhencode(diet, use_partial_pooling = TRUE)
dummy_model <- train(dummy, small, method = "glmnet")
no_pooling_model <- train(no_pooling, small, method = "glmnet")
partial_model <- train(partial, small, method = "glmnet")

# comparing the performance of the various caret models with tidyposterior:
# no real difference on this data, but the cardinality of diet is low
resample_obj <- resamples(list(dummy = dummy_model,
                               no_pooling = no_pooling_model,
                               partial = partial_model))

rmse_model <- perf_mod(resample_obj, seed = 27, verbose = FALSE)
rmse_post <- tidy(rmse_model)
ggplot(rmse_post) +
  theme_bw()

all_contrasts <- contrast_models(rmse_model)
ggplot(all_contrasts, size = 0.5)
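# a numeric summary of the contrasts might also be useful; size sets a
# practical-equivalence bound on the RMSE differences (0.5 is a guess)
summary(all_contrasts, size = 0.5)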
#### a new attempt, but this time with the large MovieLens data
# https://grouplens.org/datasets/movielens/
ratings <- data.table::fread("ml-20m/ratings.csv") # WOW THIS IS FAST

set.seed(27)

rat <- ratings %>%
  sample_frac(0.00001) %>% # my computer is not great
  na.omit() %>%
  mutate_at(vars(userId, movieId), as.factor)

# both movieId and userId have thousands of levels
base <- recipe(rating ~ movieId + userId, rat) %>%
  step_intercept()

dummy <- base %>%
  step_dummy(movieId, userId)

no_pooling <- base %>%
  soup_llhencode(movieId, userId)

partial <- base %>%
  soup_llhencode(movieId, userId, use_partial_pooling = TRUE)
# TODO: with this many levels, the dummy recipe really wants step_other and
# step_zv; see the sketch below
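# a sketch of that variant (the threshold here is a guess, not tuned):
# dummy <- base %>%
#   step_other(movieId, userId, threshold = 0.01) %>%
#   step_dummy(movieId, userId) %>%
#   step_zv(all_predictors())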
dummy_model <- train(dummy, rat, method = "glmnet")
no_pooling_model <- train(no_pooling, rat, method = "glmnet")
partial_model <- train(partial, rat, method = "glmnet")

# other souschef methods to consider:
# - rescale_gelman, rescale_natural_units_y
# - Harrell's oblique variable transformation
# comparing the performance of the various caret models with tidyposterior
resample_obj <- resamples(list(dummy = dummy_model,
                               no_pooling = no_pooling_model,
                               partial = partial_model))

rmse_model <- perf_mod(resample_obj, seed = 27, verbose = FALSE)
rmse_post <- tidy(rmse_model)

## now there's a difference
ggplot(rmse_post) +
  theme_bw() +
  labs(title = "Posterior distributions of model RMSE",
       subtitle = "rating ~ movieId + userId",
       caption = "Data (subsample) from https://grouplens.org/datasets/movielens/")

ggsave("llh_encoding_comparisons.png")
summary(rmse_post)

all_contrasts <- contrast_models(rmse_model)

ggplot(all_contrasts) +
  theme_bw() +
  labs(title = "Differences between various encodings",
       subtitle = "rating ~ movieId + userId",
       caption = "Data (subsample) from https://grouplens.org/datasets/movielens/")
Comment: Pretty cool. So is your base recipe using a data.table as input?

Reply: Oops, just saw this. I thought it wasn't, but apparently it is:

readr::write_csv(iris, "iris.csv")
df <- data.table::fread("iris.csv")
class(df)
#> [1] "data.table" "data.frame"

fread totally blows my mind. Three times faster than read_csv for me.