Skip to content

Instantly share code, notes, and snippets.

@topepo
Last active July 19, 2019 17:27
Show Gist options
  • Save topepo/e428a24e7bce3fd6f65bf524519c700b to your computer and use it in GitHub Desktop.
Save topepo/e428a24e7bce3fd6f65bf524519c700b to your computer and use it in GitHub Desktop.
prototype for processing grids in terms of sub-models
# _Prototype_ code to find the minimum grid that should be fit for models. This
# exploits the fact that some models can evaluate extra sub-models from the same
# object.
# devtools::install_github("tidymodels/parsnip")
# devtools::install_github("tidymodels/dials")
library(tidymodels)
#> ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.0.2 ──
#> ✔ broom 0.5.1 ✔ purrr 0.3.2
#> ✔ dials 0.0.2.9000 ✔ recipes 0.1.6
#> ✔ dplyr 0.8.3 ✔ rsample 0.0.5
#> ✔ ggplot2 3.2.0 ✔ tibble 2.1.3
#> ✔ infer 0.4.0 ✔ yardstick 0.0.3.9000
#> ✔ parsnip 0.0.2.9000
#> ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
#> ✖ purrr::discard() masks scales::discard()
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ✖ ggplot2::margin() masks dials::margin()
#> ✖ recipes::step() masks stats::step()
min_grid <- function(x, grid, ...) {
# x is a `model_spec` object from parsnip
# grid is a tibble of tuning parameer values with names
# matching the parameter names.
UseMethod("min_grid")
}
# As an example, if we fit a boosted tree model and tune over
# trees = 1:20 and min_n = c(20, 30)
# we should only have to fit two models:
#
# trees = 20 & min_n = 20
# trees = 20 & min_n = 30
#
# The logic related to how this "mini grid" gets made is model-specific.
#
# To get the full set of predictions, we need to know, for each of these two
# models, what values of num_terms to give to the multi_predict() function.
#
# The current idea is to have a list column of the extra models for prediction.
# For the example above:
#
# # A tibble: 2 x 3
# trees min_n .submodels
# <dbl> <dbl> <list>
# 1 20 20 <named list [1]>
# 2 20 30 <named list [1]>
#
# and the .submodels would both be
#
# list(trees = 1:19)
#
# There are a lot of other things to consider in future versions like grids
# where there are multiple columns with the same name (maybe the results of
# a recipe) and so on.
# helper functions
# Template for model results that do no have the sub-model feature
blank_submodels <- function(grid) {
grid %>%
dplyr::mutate(.submodels = map(1:nrow(grid), ~ list()))
}
get_fixed_args <- function(info) {
# Get non-sub-model columns to iterate over
fixed_args <- info$name[!info$has_submodel]
}
get_submodel_info <- function(spec, grid) {
param_info <-
get_from_env(paste0(class(spec)[1], "_args")) %>%
dplyr::filter(engine == spec$engine) %>%
dplyr::select(name = parsnip, has_submodel)
# In case a recipe or other activity has grid parameter columns,
# add those to the results
grid_names <- names(grid)
is_mod_param <- grid_names %in% param_info$name
if (any(!is_mod_param)) {
param_info <-
param_info %>%
bind_rows(
tibble(name = grid_names[!is_mod_param],
has_submodel = FALSE)
)
}
param_info %>% dplyr::filter(name %in% grid_names)
}
min_grid.boost_tree <- function(x, grid, ...) {
grid_names <- names(grid)
param_info <- get_submodel_info(x, grid)
# No ability to do submodels? Finish here:
if (!any(param_info$has_submodel)) {
return(blank_submodels(grid))
}
fixed_args <- get_fixed_args(param_info)
# For boosted trees, fit the model with the most trees (conditional on the
# other parameters) so that you can do predictions on the smaller models.
fit_only <-
grid %>%
dplyr::group_by(!!!syms(fixed_args)) %>%
dplyr::summarize(trees = max(trees, na.rm = TRUE)) %>%
dplyr::ungroup()
# Add a column .submodels that is a list with what should be predicted
# by `multi_predict()` (assuming `predict()` has already been executed
# on the original value of 'trees')
min_grid_df <-
dplyr::full_join(fit_only %>% rename(max_tree = trees), grid, by = fixed_args) %>%
dplyr::filter(trees != max_tree) %>%
dplyr::group_by(!!!syms(fixed_args)) %>%
dplyr::summarize(.submodels = list(list(trees = trees))) %>%
dplyr::ungroup() %>%
dplyr::full_join(fit_only, grid, by = fixed_args)
min_grid_df %>% dplyr::select(one_of(grid_names), .submodels)
}
# Examples
# simple grids
boosting_spec <- boost_tree() %>% set_engine("xgboost")
basic_grid <- grid_regular(trees(), min_n(c(10, 20)), tree_depth(), levels = 3)
smaller_grid <- min_grid(boosting_spec, basic_grid)
smaller_grid
#> # A tibble: 9 x 4
#> trees min_n tree_depth .submodels
#> <int> <int> <int> <list>
#> 1 2000 10 2 <named list [1]>
#> 2 2000 15 2 <named list [1]>
#> 3 2000 20 2 <named list [1]>
#> 4 2000 10 8 <named list [1]>
#> 5 2000 15 8 <named list [1]>
#> 6 2000 20 8 <named list [1]>
#> 7 2000 10 15 <named list [1]>
#> 8 2000 15 15 <named list [1]>
#> 9 2000 20 15 <named list [1]>
smaller_grid$.submodels[[1]]
#> $trees
#> [1] 1 1000
# non-regular grids
set.seed(35)
filler_grid <- grid_max_entropy(trees(), min_n(c(10, 20)), tree_depth(), size = 5)
min_grid(boosting_spec, filler_grid)
#> # A tibble: 4 x 4
#> trees min_n tree_depth .submodels
#> <int> <int> <int> <list>
#> 1 1259 12 3 <named list [1]>
#> 2 330 11 11 <NULL>
#> 3 1831 19 12 <NULL>
#> 4 1342 11 14 <NULL>
# an engine that does not support submodels:
spark_spec <- boost_tree() %>% set_engine("spark")
min_grid(spark_spec, basic_grid)
#> # A tibble: 27 x 4
#> trees min_n tree_depth .submodels
#> * <int> <int> <int> <list>
#> 1 1 10 2 <list [0]>
#> 2 1000 10 2 <list [0]>
#> 3 2000 10 2 <list [0]>
#> 4 1 15 2 <list [0]>
#> 5 1000 15 2 <list [0]>
#> 6 2000 15 2 <list [0]>
#> 7 1 20 2 <list [0]>
#> 8 1000 20 2 <list [0]>
#> 9 2000 20 2 <list [0]>
#> 10 1 10 8 <list [0]>
#> # … with 17 more rows
min_grid.nearest_neighbor_kknn <- function(x, grid, ...) {
arg_info <-
get_from_env(paste0(class(x)[1], "_args")) %>%
dplyr::filter(engine == x$engine & parsnip %in% names(grid))
all_param <- arg_info$parsnip
# If no submodels, just give the grid back with an extra column
if (!any(arg_info$has_submodel)) {
return(blank_submodels(grid))
}
# Get non-sub-model columns to iterate over
fixed_args <- arg_info$parsnip[!arg_info$has_submodel]
fixed_syms <- rlang::syms(fixed_args)
fit_only <-
grid %>%
dplyr::group_by(!!!fixed_syms) %>%
# could do any value of k but we'll go with the max
dplyr::summarize(neighbors = max(neighbors, na.rm = TRUE)) %>%
dplyr::ungroup()
min_grid_df <-
dplyr::full_join(fit_only %>% rename(max_neighbors = neighbors), grid, by = fixed_args) %>%
dplyr::filter(neighbors != max_neighbors) %>%
dplyr::group_by(!!!fixed_syms) %>%
dplyr::summarize(.submodels = list(list(neighbors = neighbors))) %>%
dplyr::ungroup() %>%
dplyr::full_join(fit_only, grid, by = fixed_args)
min_grid_df %>% dplyr::select(!!!all_param, .submodels)
}
knn_grid <-
grid_regular(neighbors(c(1, 10)), dist_power(), weight_func(), levels = 3) %>%
# just for kicks:
slice(-4)
min_grid.nearest_neighbor_kknn(nearest_neighbor() %>% set_engine("kknn"), knn_grid)
#> # A tibble: 9 x 4
#> neighbors weight_func dist_power .submodels
#> <int> <chr> <dbl> <list>
#> 1 10 epanechnikov 1 <named list [1]>
#> 2 10 epanechnikov 1.5 <named list [1]>
#> 3 10 epanechnikov 2 <named list [1]>
#> 4 10 rectangular 1 <named list [1]>
#> 5 10 rectangular 1.5 <named list [1]>
#> 6 10 rectangular 2 <named list [1]>
#> 7 10 triangular 1 <named list [1]>
#> 8 10 triangular 1.5 <named list [1]>
#> 9 10 triangular 2 <named list [1]>
knn_grid2 <- grid_max_entropy(neighbors(c(1, 10)), dist_power(), weight_func(), size = 10)
min_grid.nearest_neighbor_kknn(nearest_neighbor() %>% set_engine("kknn"), knn_grid2)
#> # A tibble: 10 x 4
#> neighbors weight_func dist_power .submodels
#> <int> <chr> <dbl> <list>
#> 1 2 biweight 1.04 <NULL>
#> 2 9 epanechnikov 1.10 <NULL>
#> 3 9 gaussian 1.06 <NULL>
#> 4 9 gaussian 1.78 <NULL>
#> 5 3 rank 1.12 <NULL>
#> 6 4 rank 1.81 <NULL>
#> 7 9 rectangular 1.51 <NULL>
#> 8 1 rectangular 1.89 <NULL>
#> 9 9 rectangular 1.95 <NULL>
#> 10 6 triweight 1.44 <NULL>
min_grid.mars_earth <- function(x, grid, ...) {
arg_info <-
get_from_env(paste0(class(x)[1], "_args")) %>%
dplyr::filter(engine == x$engine & parsnip %in% names(grid))
all_param <- arg_info$parsnip
# If no submodels, just give the grid back with an extra column
if (!any(arg_info$has_submodel)) {
return(blank_submodels(grid))
}
# Get non-sub-model columns to iterate over
fixed_args <- arg_info$parsnip[!arg_info$has_submodel]
fixed_syms <- rlang::syms(fixed_args)
fit_only <-
grid %>%
dplyr::group_by(!!!fixed_syms) %>%
dplyr::summarize(num_terms = max(num_terms, na.rm = TRUE)) %>%
dplyr::ungroup()
min_grid_df <-
dplyr::full_join(fit_only %>% rename(max_terms = num_terms), grid, by = fixed_args) %>%
dplyr::filter(num_terms != max_terms) %>%
dplyr::group_by(!!!fixed_syms) %>%
dplyr::summarize(.submodels = list(list(num_terms = num_terms))) %>%
dplyr::ungroup() %>%
dplyr::full_join(fit_only, grid, by = fixed_args)
min_grid_df %>% dplyr::select(!!!all_param, .submodels)
}
min_grid.linear_reg_glmnet <- function(x, grid, ...) {
arg_info <-
get_from_env(paste0(class(x)[1], "_args")) %>%
dplyr::filter(engine == x$engine & parsnip %in% names(grid))
all_param <- arg_info$parsnip
# If no submodels, just give the grid back with an extra column
if (!any(arg_info$has_submodel)) {
return(blank_submodels(grid))
}
fit_only <-
grid %>%
distinct(mixture)
# To get entire regularization path, we will set lambda = NULL. Since you
# can't have a column of NULLS in a data frame, we set to NA.
min_grid_df <-
grid %>%
group_by(mixture) %>%
nest(-mixture) %>%
ungroup() %>%
mutate(
penalty = NA_real_, # should really be set to NULL; fix in translate function
.submodels = map(data, pull, 1)
) %>%
dplyr::select(-data)
min_grid_df %>% dplyr::select(!!!all_param, .submodels)
}
min_grid.logistic_reg_glmnet <- min_grid.linear_reg_glmnet
xgb_full_grid <-
crossing(tree_depth = 10:11, min_n = 5:7, trees = 1:3)
xgb_full_exp <-
crossing(tree_depth = 10:11, min_n = 5:7, trees = 3) %>%
mutate(.submodels = map(trees, ~ 1:2)) %>%
dplyr::select(tree_depth, trees, min_n, .submodels)
xgb_full_res <-
min_grid.boost_tree_xgboost(
boost_tree() %>% set_engine("xgboost"),
xgb_full_grid
)
#> Error in min_grid.boost_tree_xgboost(boost_tree() %>% set_engine("xgboost"), : could not find function "min_grid.boost_tree_xgboost"
# Need to use vctrs:
# Error: Can't join on '.submodels' x '.submodels' because of incompatible types (list / list)
# all_equal(xgb_full_res, xgb_full_exp)
# re-use test_by_col() here
xgb_smol_grid <-
crossing(tree_depth = 10:11, min_n = 5:7)
xgb_full_exp <-
crossing(tree_depth = 10:11, min_n = 5:7) %>%
mutate(.submodels = map(tree_depth, ~ list()))
min_grid.boost_tree_xgboost(
boost_tree() %>% set_engine("xgboost"),
xgb_smol_grid
)
#> Error in min_grid.boost_tree_xgboost(boost_tree() %>% set_engine("xgboost"), : could not find function "min_grid.boost_tree_xgboost"
c5_full_grid <-
crossing(min_n = 5:7, trees = 1:3)
c5_full_exp <-
crossing(min_n = 5:7, trees = 3) %>%
mutate(.submodels = map(trees, ~ 1:2)) %>%
dplyr::select(trees, min_n, .submodels)
c5_full_res <-
min_grid.boost_tree_xgboost(
boost_tree() %>% set_engine("C5.0"),
c5_full_grid
)
#> Error in min_grid.boost_tree_xgboost(boost_tree() %>% set_engine("C5.0"), : could not find function "min_grid.boost_tree_xgboost"
c5_smol_grid <-
crossing(min_n = 5:7)
c5_full_exp <-
crossing(min_n = 5:7) %>%
mutate(.submodels = map(min_n, ~ list()))
min_grid.boost_tree_xgboost(
boost_tree() %>% set_engine("C5.0"),
c5_smol_grid
)
#> Error in min_grid.boost_tree_xgboost(boost_tree() %>% set_engine("C5.0"), : could not find function "min_grid.boost_tree_xgboost"
earth_full_grid <-
crossing(num_terms = 10:11, prod_degree = 1:2, prune_method = c("none", "backward"))
earth_full_exp <-
crossing(num_terms = 11, prod_degree = 1:2, prune_method = c("none", "backward")) %>%
mutate(.submodels = map(prod_degree, ~ 10)) %>%
dplyr::select(num_terms, prod_degree, prune_method, .submodels)
earth_full_res <-
min_grid.mars_earth(
mars() %>% set_engine("earth"),
earth_full_grid
)
glmn_full_grid <-
crossing(mixture = (0:3)/3, penalty = c(0, .1, 1))
glmn_full_exp <-
crossing(penalty = NA_real_, mixture = (0:3)/3) %>%
mutate(.submodels = map(mixture, ~ c(0, .1, 1)))
glmn_full_res <-
min_grid.linear_reg_glmnet(
linear_reg() %>% set_engine("glmnet"),
glmn_full_grid
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment