@alexpghayes
Created May 19, 2018 19:11
library(lme4)
library(recipes)
library(tidyverse)
library(rsample)
library(caret)
library(tidyposterior)
library(nnet)
soup_llhencode <- function(
  recipe, ...,
  role = NA,
  trained = FALSE,
  use_partial_pooling = FALSE,
  skip = FALSE
) {
  terms <- rlang::quos(...)
  if (length(terms) == 0)
    stop("Please supply at least one variable specification. See ?selections.")
  add_step(
    recipe,
    soup_llhencode_new(
      terms = terms,
      trained = trained,
      role = role,
      use_partial_pooling = use_partial_pooling,
      skip = skip))
}
soup_llhencode_new <- function(
  terms = NULL,
  role = NA,
  trained = FALSE,
  use_partial_pooling = FALSE,
  skip = FALSE
) {
  step(
    subclass = "llhencode",
    terms = terms,
    role = role,
    trained = trained,
    use_partial_pooling = use_partial_pooling,
    skip = skip
  )
}
# brittle!
vars_to_formula <- function(outcome, predictor, random_intercept = FALSE) {
  stopifnot(is.character(outcome))
  stopifnot(is.character(predictor))
  if (random_intercept) predictor <- paste0("(1|", predictor, ")")
  as.formula(paste(outcome, "~", predictor))
}
vars_to_formula("Sepal.Length", "Species", T)
# kludge together K - 1 mixed binomial regressions as a stand-in multinomial model
multinomer <- function(outcome_name, predictor_name, data) {
  outcome_form <- reformulate(termlabels = outcome_name)
  outcome_matrix <- model.matrix(outcome_form, data)[, -1, drop = FALSE] # drop the reference level
  models <- list()
  for (out_name in colnames(outcome_matrix)) {
    # fit one binomial mixed model per non-reference outcome level
    data[[out_name]] <- outcome_matrix[, out_name]
    formula <- vars_to_formula(out_name, predictor_name, random_intercept = TRUE)
    models[[out_name]] <- glmer(formula, data, family = binomial)
  }
  class(models) <- "multinomer"
  models
}
predict.multinomer <- function(object, newdata, ...) {
  codes <- map_dfc(object, predict, newdata, allow.new.levels = TRUE)
  colnames(codes) <- names(object)
  as_tibble(codes)
}
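# hypothetical usage sketch (toy data made up for illustration): each
# non-reference outcome level gets its own binomial glmer, and predict()
# returns one column of log-odds codes per fitted model
toy <- data.frame(
  fruit = factor(rep(c("apple", "pear", "plum"), each = 30)),
  store = factor(sample(paste0("store_", 1:6), 90, replace = TRUE))
)
toy_fit <- multinomer("fruit", "store", toy)
predict(toy_fit, toy)  # tibble with columns fruitpear, fruitplum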
# no data leakage protection yet!
prep.step_llhencode <- function(x, training, info = NULL, ...) {
  x$predictor_names <- terms_select(terms = x$terms, info = info)
  # is there a better way to do this?
  outcome_name <- filter(info, role == "outcome")$variable
  # only deal with univariate outcomes
  stopifnot(length(outcome_name) == 1)
  outcome <- training[[outcome_name]]
  regression <- is.numeric(outcome)
  pool <- x$use_partial_pooling
  x$mean_outcome <- if (regression) mean(outcome) else NA  # grand-mean fallback for unseen levels
  x$observed_levels <- list()
  x$encoders <- list()
  for (pred_name in x$predictor_names) {
    predictor <- training[[pred_name]]
    # prevent unobserved factor errors: think levels(head(iris$Species))
    x$observed_levels[[pred_name]] <- levels(droplevels(predictor))
    formula <- vars_to_formula(outcome_name, pred_name, random_intercept = pool)
    if (regression && pool) {
      x$encoders[[pred_name]] <- lmer(formula, training)
    } else if (regression) {
      x$encoders[[pred_name]] <- lm(formula, training)
    } else if (!pool) {
      x$encoders[[pred_name]] <- multinom(formula, training)
    } else {
      x$encoders[[pred_name]] <- multinomer(outcome_name, pred_name, training)
    }
  }
  x$trained <- TRUE
  x
}
# doesn't do categorical outcomes yet!
bake.step_llhencode <- function(object, newdata, ...) {
  for (pred_name in object$predictor_names) {
    encoder <- object$encoders[[pred_name]]
    if (object$use_partial_pooling) {
      codes <- predict(encoder, newdata, allow.new.levels = TRUE)
    } else {
      # safe prediction if there are unseen categories in the test data:
      # new categories are encoded to the outcome grand mean,
      # which is consistent with the lmer behavior
      new_level_idx <- which(!(newdata[[pred_name]] %in% object$observed_levels[[pred_name]]))
      newdata[[pred_name]][new_level_idx] <- NA
      codes <- predict(encoder, newdata)
      codes[is.na(codes)] <- object$mean_outcome
    }
    newdata[[pred_name]] <- codes
  }
  as_tibble(newdata)
}
print.step_llhencode <- function(x, ...) {
  cat("Likelihood encoding. Better print method on the way.")
}
## sanity check that it works
train <- iris[1:100, ]
test <- iris[101:150, ]
llhencode <- recipe(Sepal.Length ~ Species + Sepal.Width, train) %>%
  soup_llhencode(Species, use_partial_pooling = FALSE)
prepped <- prep(llhencode, train)
baked <- bake(prepped, test)
llhencode_partial <- recipe(Sepal.Length ~ Species + Sepal.Width, train) %>%
  soup_llhencode(Species, use_partial_pooling = TRUE)
prepped_partial <- prep(llhencode_partial, train)
baked_partial <- bake(prepped_partial, test)
# tests to write (a rough sketch of the first one follows below):
# - partial pooling and no pooling give different encodings
# - handle missing input data the same way
# - input data is of an appropriate type
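# a rough, hypothetical sketch of the first test above, assuming testthat is
# available; it only checks that the two encodings are not numerically equal
library(testthat)
test_that("partial pooling changes the encoded values", {
  with_pool    <- bake(prepped_partial, train)
  without_pool <- bake(prepped, train)
  expect_false(isTRUE(all.equal(with_pool$Species, without_pool$Species)))
})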
####### START TO COMPARE PARTIAL POOLING VS NO PARTIAL POOLING
### SET UP THE DATA
set.seed(27)
small <- okc %>%
  na.omit() %>%
  sample_frac(0.01) %>%
  mutate(diet = as.factor(diet))
base <- recipe(age ~ diet + height, small) %>%
  step_intercept()
dummy <- base %>%
  step_dummy(diet)
no_pooling <- base %>%
  soup_llhencode(diet)
partial <- base %>%
  soup_llhencode(diet, use_partial_pooling = TRUE)
dummy_model <- train(dummy, small, method = "glmnet")
no_pooling_model <- train(no_pooling, small, method = "glmnet")
partial_model <- train(partial, small, method = "glmnet")
# comparing the performance of various caret models with tidyposterior
# no real difference on this data, but cardinality of category is low
resample_obj <- resamples(list(dummy = dummy_model,
                               no_pooling = no_pooling_model,
                               partial = partial_model))
rmse_model <- perf_mod(resample_obj, seed = 27, verbose = FALSE)
rmse_post <- tidy(rmse_model)
ggplot(rmse_post) +
  theme_bw()
all_contrasts <- contrast_models(rmse_model)
ggplot(all_contrasts, size = 0.5)
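# (optional, hypothetical follow-up) numeric check of the "no real difference"
# claim: posterior summaries plus pairwise contrast summaries at the same
# practical-effect size used in the plot above
summary(rmse_post)
summary(all_contrasts, size = 0.5)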
#### a new attempt, but this time with large movie data
# https://grouplens.org/datasets/movielens/
ratings <- data.table::fread("ml-20m/ratings.csv") # WOW THIS IS FAST
set.seed(27)
rat <- ratings %>%
  sample_frac(0.00001) %>% # my computer is not great
  na.omit() %>%
  mutate_at(vars(userId, movieId), as.factor)
base <- recipe(rating ~ movieId + userId, rat) %>% # both movieId and userId have thousands of levels
  step_intercept()
dummy <- base %>%
  step_dummy(movieId, userId)
no_pooling <- base %>%
  soup_llhencode(movieId, userId)
partial <- base %>%
  soup_llhencode(movieId, userId, use_partial_pooling = TRUE)
# could also try step_other, step_zv here
dummy_model <- train(dummy, rat, method = "glmnet")
no_pooling_model <- train(no_pooling, rat, method = "glmnet")
partial_model <- train(partial, rat, method = "glmnet")
# other souschef methods:
# - rescale_gelman, rescale_natural_units_y
# - Harrell's oblique variable transformation
# comparing the performance of various caret models with tidyposterior
resample_obj <- resamples(list(dummy = dummy_model,
                               no_pooling = no_pooling_model,
                               partial = partial_model))
rmse_model <- perf_mod(resample_obj, seed = 27, verbose = FALSE)
rmse_post <- tidy(rmse_model)
## now there's a difference
ggplot(rmse_post) +
  theme_bw() +
  labs(title = "Posterior distributions of model RMSE",
       subtitle = "rating ~ movieId + userId",
       caption = "Data (subsample) from https://grouplens.org/datasets/movielens/")
ggsave("llh_encoding_comparisons.png")
summary(rmse_post)
all_contrasts <- contrast_models(rmse_model)
ggplot(all_contrasts) +
  theme_bw() +
  labs(title = "Differences between various encodings",
       subtitle = "rating ~ movieId + userId",
       caption = "Data (subsample) from https://grouplens.org/datasets/movielens/")
@topepo

topepo commented May 23, 2018

Pretty cool. So is your base recipe using a data.table as input?

@alexpghayes
Author

Oops, just saw this. I thought it wasn't, but apparently it is:

readr::write_csv(iris, "iris.csv")
df <- data.table::fread("iris.csv")
class(df)
#> [1] "data.table" "data.frame"

fread totally blows my mind. Three times faster than read_csv for me.
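A quick way to check the timing difference on your own machine (results will vary; ggplot2::diamonds is used here only because it's a larger built-in table than iris):

readr::write_csv(ggplot2::diamonds, "diamonds.csv")
system.time(readr::read_csv("diamonds.csv"))
system.time(data.table::fread("diamonds.csv"))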
