@alexpghayes, created May 19, 2018

library(lme4)
library(recipes)
library(tidyverse)
library(rsample)
library(caret)
library(tidyposterior)
library(nnet)

# user-facing step function: checks the selectors and adds the step to the recipe
soup_llhencode <- function(
  recipe, ...,
  role = NA,
  trained = FALSE,
  use_partial_pooling = FALSE,
  skip = FALSE
) {
  terms <- rlang::quos(...)
  if (length(terms) == 0)
    stop("Please supply at least one variable specification. See ?selections.")
  add_step(
    recipe,
    soup_llhencode_new(
      terms = terms,
      trained = trained,
      role = role,
      use_partial_pooling = use_partial_pooling,
      skip = skip))
}

# bare constructor that builds the step object itself
soup_llhencode_new <- function(
  terms = NULL,
  role = NA,
  trained = FALSE,
  use_partial_pooling = FALSE,
  skip = FALSE
) {
  step(
    subclass = "llhencode",
    terms = terms,
    role = role,
    trained = trained,
    use_partial_pooling = use_partial_pooling,
    skip = skip
  )
}

# brittle!
vars_to_formula <- function(outcome, predictor, random_intercept = FALSE) {
  stopifnot(is.character(outcome))
  stopifnot(is.character(predictor))
  if (random_intercept) predictor <- paste0("(1|", predictor, ")")
  as.formula(paste(outcome, "~", predictor))
}
vars_to_formula("Sepal.Length", "Species", T)

# kludge together K - 1 mixed logistic regressions, one per non-reference level
multinomer <- function(outcome_name, predictor_name, data) {
  outcome_form <- reformulate(termlabels = outcome_name)
  outcome_matrix <- model.matrix(outcome_form, data)[, -1] # drop the reference level
  models <- list()
  for (out_name in colnames(outcome_matrix)) {
    # regress the 0/1 indicator for this level on the predictor
    data[[out_name]] <- outcome_matrix[, out_name]
    formula <- vars_to_formula(out_name, predictor_name, random_intercept = TRUE)
    models[[out_name]] <- glmer(formula, data, family = binomial)
  }
  class(models) <- "multinomer"
  models
}

predict.multinomer <- function(object, newdata, ...) {
  codes <- map_dfc(object, predict, newdata, allow.new.levels = TRUE)
  colnames(codes) <- names(object)
  as_tibble(codes)
}
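
# a rough usage sketch on made-up toy data (hypothetical example; expect
# singular-fit / convergence warnings since there is no real signal):
toy <- data.frame(
  color = factor(sample(c("red", "green", "blue"), 200, replace = TRUE)),
  group = factor(sample(letters[1:5], 200, replace = TRUE))
)
toy_fits <- multinomer("color", "group", toy)
predict(toy_fits, toy) # one column of log-odds codes per non-reference level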

# no data leakage protection yet!
prep.step_llhencode <- function(x, training, info = NULL, ...) {
  x$predictor_names <- terms_select(terms = x$terms, info = info)
  # is there a better way to do this?
  outcome_name <- filter(info, role == "outcome")$variable
  # only deal with univariate outcomes
  stopifnot(length(outcome_name) == 1)
  outcome <- training[[outcome_name]]
  regression <- is.numeric(outcome)
  if (regression) x$mean_outcome <- mean(outcome)
  pool <- x$use_partial_pooling
  x$encoders <- list()
  x$observed_levels <- list()
  for (pred_name in x$predictor_names) {
    predictor <- training[[pred_name]]
    # prevent unobserved factor errors: think levels(head(iris$Species))
    x$observed_levels[[pred_name]] <- levels(droplevels(predictor))
    formula <- vars_to_formula(outcome_name, pred_name, random_intercept = pool)
    if (regression && pool) {
      x$encoders[[pred_name]] <- lmer(formula, training)
    } else if (regression) {
      x$encoders[[pred_name]] <- lm(formula, training)
    } else if (!regression && !pool) {
      x$encoders[[pred_name]] <- multinom(formula, training)
    } else {
      x$encoders[[pred_name]] <- multinomer(outcome_name, pred_name, training)
    }
  }
  x$trained <- TRUE
  x
}

# doesn't do categorical outcomes yet!
bake.step_llhencode <- function(object, newdata, ...) {
  for (pred_name in object$predictor_names) {
    encoder <- object$encoders[[pred_name]]
    if (object$use_partial_pooling) {
      codes <- predict(encoder, newdata, allow.new.levels = TRUE)
    } else {
      # safe prediction if there are unseen categories in the test data:
      # new categories are encoded to the outcome grand mean, which is
      # consistent with the lmer behavior
      new_level_idx <- which(!(newdata[[pred_name]] %in% object$observed_levels[[pred_name]]))
      newdata[[pred_name]][new_level_idx] <- NA
      codes <- predict(encoder, newdata)
      codes[is.na(codes)] <- object$mean_outcome
    }
    newdata[[pred_name]] <- codes
  }
  as_tibble(newdata)
}

print.step_llhencode <- function(x, ...) {
  cat("Likelihood encoding. Better print method on the way.")
}

## sanity check that it works
train <- iris[1:100, ]
test <- iris[101:150, ]

llhencode <- recipe(Sepal.Length ~ Species + Sepal.Width, train) %>%
  soup_llhencode(Species, use_partial_pooling = FALSE)

prepped <- prep(llhencode, train)
baked <- bake(prepped, test)

llhencode_partial <- recipe(Sepal.Length ~ Species + Sepal.Width, train) %>%
  soup_llhencode(Species, use_partial_pooling = TRUE)

prepped_partial <- prep(llhencode_partial, train)
baked_partial <- bake(prepped_partial, test)
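
# rough spot check of the new-level handling: virginica never appears in the
# training rows, so without pooling every test row should be encoded to the
# training grand mean
all(baked$Species == mean(train$Sepal.Length))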

# tests to write:
# - give different results when using partial pooling (see the sketch below)
# - try missing input data the same way
# - input data is an appropriate type
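
# a rough sketch of the first test above, assuming testthat:
library(testthat)
test_that("pooled and unpooled encodings differ on the training data", {
  unpooled <- unname(bake(prepped, train)$Species)
  pooled <- unname(bake(prepped_partial, train)$Species)
  expect_false(isTRUE(all.equal(unpooled, pooled)))
})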

####### START TO COMPARE PARTIAL POOLING VS NO PARTIAL POOLING

### SET UP THE DATA
set.seed(27)
small <- okc %>%
  na.omit() %>%
  sample_frac(0.01) %>%
  mutate(diet = as.factor(diet))

base <- recipe(age ~ diet + height, small) %>%
  step_intercept()

dummy <- base %>%
  step_dummy(diet)

no_pooling <- base %>%
  soup_llhencode(diet)

partial <- base %>%
  soup_llhencode(diet, use_partial_pooling = TRUE)

dummy_model <- train(dummy, small, method = "glmnet")
no_pooling_model <- train(no_pooling, small, method = "glmnet")
partial_model <- train(partial, small, method = "glmnet")

# comparing the performance of the various caret models with tidyposterior
# no real difference on this data, but the cardinality of the category is low
resample_obj <- resamples(list(dummy = dummy_model,
                               no_pooling = no_pooling_model,
                               partial = partial_model))

rmse_model <- perf_mod(resample_obj, seed = 27, verbose = FALSE)
rmse_post <- tidy(rmse_model)

ggplot(rmse_post) +
  theme_bw()

all_contrasts <- contrast_models(rmse_model)
ggplot(all_contrasts, size = 0.5)
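
# a rough way to quantify "no real difference": summarize the posterior
# contrasts with a practical-equivalence band (size) of 0.5 RMSE units
summary(all_contrasts, size = 0.5)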

#### a new attempt, but this time with large movie data
# https://grouplens.org/datasets/movielens/
ratings <- data.table::fread("ml-20m/ratings.csv") # WOW THIS IS FAST

set.seed(27)
rat <- ratings %>%
  sample_frac(0.00001) %>% # my computer is not great
  na.omit() %>%
  mutate_at(vars(userId, movieId), as.factor)

base <- recipe(rating ~ movieId + userId, rat) %>% # both movieId and userId have thousands of levels
  step_intercept()

dummy <- base %>%
  step_dummy(movieId, userId)

no_pooling <- base %>%
  soup_llhencode(movieId, userId)

partial <- base %>%
  soup_llhencode(movieId, userId, use_partial_pooling = TRUE)
# step_other, step_zv
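# a rough sketch of how the step_other / step_zv idea above could slot into the
# dummy recipe (threshold chosen arbitrarily):
dummy_other <- base %>%
  step_other(movieId, userId, threshold = 0.01) %>%
  step_dummy(movieId, userId) %>%
  step_zv(all_predictors())
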
dummy_model <- train(dummy, rat, method = "glmnet")
no_pooling_model <- train(no_pooling, rat, method = "glmnet")
partial_model <- train(partial, rat, method = "glmnet")

# other souschef methods:
# - rescale_gelman, rescale_natural_units_y
# - harrell's oblique variable transformation

# comparing the performance of the various caret models with tidyposterior
resample_obj <- resamples(list(dummy = dummy_model,
                               no_pooling = no_pooling_model,
                               partial = partial_model))

rmse_model <- perf_mod(resample_obj, seed = 27, verbose = FALSE)
rmse_post <- tidy(rmse_model)

## now there's a difference
ggplot(rmse_post) +
  theme_bw() +
  labs(title = "Posterior distributions of model RMSE",
       subtitle = "rating ~ movieId + userId",
       caption = "Data (subsample) from https://grouplens.org/datasets/movielens/")

ggsave("llh_encoding_comparisons.png")
summary(rmse_post)

all_contrasts <- contrast_models(rmse_model)
ggplot(all_contrasts) +
  theme_bw() +
  labs(title = "Differences between various encodings",
       subtitle = "rating ~ movieId + userId",
       caption = "Data (subsample) from https://grouplens.org/datasets/movielens/")
@alexpghayes (Author) commented:
Oops, just saw this. I thought it wasn't, but apparently it is:

readr::write_csv(iris, "iris.csv")
df <- data.table::fread("iris.csv")
class(df)
#> [1] "data.table" "data.frame"

fread totally blows my mind. Three times faster than read_csv for me.
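
A rough way to check the speedup locally (timings will vary by machine):

system.time(readr::read_csv("ml-20m/ratings.csv"))
system.time(data.table::fread("ml-20m/ratings.csv"))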
