An issue was recently filed in the stacks package repository about the in-memory object size of a model stack increasing dramatically after saving it to disk and reloading it.
Starting out with a quick reprex:
# Load the modeling framework (tidymodels), example data (modeldata),
# serialization (readr), object-size measurement (lobstr), model stacking
# (stacks), and model trimming (butcher) packages used throughout.
library(tidymodels)
library(modeldata)
library(readr)
#>
#> Attaching package: 'readr'
#> The following object is masked from 'package:yardstick':
#>
#> spec
#> The following object is masked from 'package:scales':
#>
#> col_factor
library(lobstr)
library(stacks)
library(butcher)
# Subsample lending_club to keep the example fast; fix the seed so the
# reprex is reproducible.
data("lending_club")
set.seed(1)
lending_club <- sample_n(lending_club, 1000)
folds <- vfold_cv(lending_club, v = 5)
# Tune a glmnet linear regression over a small grid. control_stack_grid()
# retains the out-of-sample predictions and workflows that stacks needs
# later for add_candidates().
lr_mod <-
linear_reg(penalty = tune(), mixture = tune()) %>%
set_engine("glmnet") %>%
workflow(
preprocessor = funded_amnt ~ int_rate + total_bal_il,
spec = .
) %>%
tune_grid(
resamples = folds,
control = control_stack_grid(),
grid = 4
)
# Build and fit the model stack from the tuning results.
lr_stack <- stacks() %>%
add_candidates(lr_mod) %>%
blend_predictions() %>%
fit_members()
# Round-trip the stack through RDS and compare in-memory sizes: the
# reloaded object is roughly 9.5x larger than the original.
write_rds(lr_stack, file = "saved_mod.Rds")
saved_lr_stack <- read_rds("saved_mod.Rds")
obj_sizes(lr_stack, saved_lr_stack)
#> * 2,775,256 B
#> * 26,344,704 B
obj_size(saved_lr_stack) / obj_size(lr_stack)
#> 9.529476 B
Looking a bit further, it’s just the `coefs` object that’s increasing in size. In a model stack, `$coefs` is a parsnip model fit of a `linear_reg()` using the glmnet engine.
# Per-component sizes (MB) of the in-memory stack before saving. The
# heaviest entries all hang off coefs (terms, call, eng_args) at ~2.6 MB.
weigh(lr_stack)
#> # A tibble: 374 × 2
#> object size
#> <chr> <dbl>
#> 1 coefs.preproc.terms 2.64
#> 2 coefs.fit.call 2.64
#> 3 coefs.spec.eng_args.lower.limits 2.64
#> 4 coefs.spec.method.fit.args.lower.limits 2.64
#> 5 coefs.spec.method.pred.numeric.post 1.76
#> 6 member_fits.lr_mod_1_3.fit.fit.spec.method.pred.numeric.post 1.76
#> 7 member_fits.lr_mod_1_1.fit.fit.spec.method.pred.numeric.post 1.76
#> 8 member_fits.lr_mod_1_3.pre.actions.formula.blueprint.mold.process 0.0172
#> 9 member_fits.lr_mod_1_3.pre.mold.blueprint.mold.process 0.0172
#> 10 member_fits.lr_mod_1_1.pre.actions.formula.blueprint.mold.process 0.0172
#> # … with 364 more rows
# After save/reload, those same coefs components balloon from ~2.64 MB to
# ~24.7 MB each; every other component is essentially unchanged.
weigh(saved_lr_stack)
#> # A tibble: 374 × 2
#> object size
#> <chr> <dbl>
#> 1 coefs.preproc.terms 24.7
#> 2 coefs.fit.call 24.7
#> 3 coefs.spec.eng_args.lower.limits 24.7
#> 4 coefs.spec.method.fit.args.lower.limits 24.7
#> 5 coefs.spec.method.pred.numeric.post 1.79
#> 6 member_fits.lr_mod_1_3.fit.fit.spec.method.pred.numeric.post 1.79
#> 7 member_fits.lr_mod_1_1.fit.fit.spec.method.pred.numeric.post 1.79
#> 8 member_fits.lr_mod_1_3.pre.actions.formula.blueprint.mold.process 0.0172
#> 9 member_fits.lr_mod_1_3.pre.mold.blueprint.mold.process 0.0172
#> 10 member_fits.lr_mod_1_1.pre.actions.formula.blueprint.mold.process 0.0172
#> # … with 364 more rows
Does the same thing happen if we just pull out the coefs
and save it independently?
# Isolate the $coefs element (a parsnip model_fit wrapping a glmnet
# "_elnet" object) and round-trip it by itself: the ~9.4x size increase
# reproduces, so the inflation lives entirely inside coefs.
lr_stack_coefs <- lr_stack$coefs
class(lr_stack_coefs)
#> [1] "_elnet" "model_fit"
write_rds(lr_stack_coefs, file = "saved_mod.Rds")
saved_lr_stack_coefs <- read_rds("saved_mod.Rds")
obj_sizes(lr_stack_coefs, saved_lr_stack_coefs)
#> * 2,635,264 B
#> * 24,601,432 B
obj_size(saved_lr_stack_coefs) / obj_size(lr_stack_coefs)
#> 9.37316 B
The same size increase happens, and we see that most of it is attributed to the `call` component:
# Component weights of the isolated coefs fit before saving: the fitted
# glmnet pieces (beta, a0, lambda, ...) are tiny; `call` dominates at
# ~2.6 MB.
weigh(lr_stack_coefs)
#> # A tibble: 12 × 2
#> object size
#> <chr> <dbl>
#> 1 call 2.64
#> 2 beta 0.0111
#> 3 a0 0.00741
#> 4 lambda 0.000848
#> 5 dev.ratio 0.000848
#> 6 df 0.000448
#> 7 dim 0.000056
#> 8 nulldev 0.000056
#> 9 npasses 0.000056
#> 10 jerr 0.000056
#> 11 offset 0.000056
#> 12 nobs 0.000056
# After reload, `call` alone grows to ~24.7 MB; all other components are
# byte-for-byte the same size.
weigh(saved_lr_stack_coefs)
#> # A tibble: 12 × 2
#> object size
#> <chr> <dbl>
#> 1 call 24.7
#> 2 beta 0.0111
#> 3 a0 0.00741
#> 4 lambda 0.000848
#> 5 dev.ratio 0.000848
#> 6 df 0.000448
#> 7 dim 0.000056
#> 8 nulldev 0.000056
#> 9 npasses 0.000056
#> 10 jerr 0.000056
#> 11 offset 0.000056
#> 12 nobs 0.000056
Is this just a parsnip + glmnet thing?
# Control experiment: a plain parsnip + glmnet fit (no stacking involved)
# does NOT grow on save/reload — sizes before and after are comparable —
# so the problem is specific to how stacks builds its coefs fit.
mod_fit <-
linear_reg(penalty = tune()) %>%
set_engine("glmnet") %>%
fit(funded_amnt ~ int_rate + total_bal_il, data = lending_club)
write_rds(mod_fit, file = "saved_mod.Rds")
saved_mod_fit <- read_rds("saved_mod.Rds")
obj_sizes(mod_fit, saved_mod_fit)
#> * 1,782,240 B
#> * 1,743,872 B
Nope. Hmmm
Does the object continue growing in size with each save?
# Round-trip the coefs object twice: the size jumps once on the first
# save/reload and then stabilizes — a one-time inflation, not unbounded
# growth with each save.
write_rds(lr_stack_coefs, file = "saved_mod_1.Rds")
lr_stack_coefs_1 <- read_rds("saved_mod_1.Rds")
write_rds(lr_stack_coefs_1, file = "saved_mod_2.Rds")
lr_stack_coefs_2 <- read_rds("saved_mod_2.Rds")
obj_sizes(lr_stack_coefs, lr_stack_coefs_1, lr_stack_coefs_2)
#> * 2,635,264 B
#> * 24,601,432 B
#> * 24,601,432 B
What if we just use butcher to axe the `call` object?
# Butcher the fit (which, per the weigh() output below, shrinks `call` to
# near zero), then round-trip it: the reloaded object still inflates
# ~9.4x, so axing `call` alone does not fix the problem.
lr_stack_coefs_b <- butcher(lr_stack_coefs)
write_rds(lr_stack_coefs_b, file = "saved_mod.Rds")
saved_lr_stack_coefs_b <- read_rds("saved_mod.Rds")
obj_sizes(lr_stack_coefs_b, saved_lr_stack_coefs_b)
#> * 2,636,472 B
#> * 24,599,936 B
obj_size(saved_lr_stack_coefs_b) / obj_size(lr_stack_coefs_b)
#> 9.368441 B
The `butcher::weigh()` results for the butchered object are identical before and after saving, even though `obj_size()` reports very different totals — so the extra bytes are not attributed to any individual component.
# After butchering, `call` weighs almost nothing (0.000112 MB) and the
# per-component breakdown is identical before and after save/reload —
# even though obj_size() above says the totals differ by ~9x. The extra
# bytes must therefore live somewhere weigh() does not attribute to a
# component (the conclusion below points at call's environment).
weigh(lr_stack_coefs_b)
#> # A tibble: 12 × 2
#> object size
#> <chr> <dbl>
#> 1 beta 0.0111
#> 2 a0 0.00741
#> 3 lambda 0.000848
#> 4 dev.ratio 0.000848
#> 5 df 0.000448
#> 6 call 0.000112
#> 7 dim 0.000056
#> 8 nulldev 0.000056
#> 9 npasses 0.000056
#> 10 jerr 0.000056
#> 11 offset 0.000056
#> 12 nobs 0.000056
weigh(saved_lr_stack_coefs_b)
#> # A tibble: 12 × 2
#> object size
#> <chr> <dbl>
#> 1 beta 0.0111
#> 2 a0 0.00741
#> 3 lambda 0.000848
#> 4 dev.ratio 0.000848
#> 5 df 0.000448
#> 6 call 0.000112
#> 7 dim 0.000056
#> 8 nulldev 0.000056
#> 9 npasses 0.000056
#> 10 jerr 0.000056
#> 11 offset 0.000056
#> 12 nobs 0.000056
Is this just a particularity of how `lobstr::obj_size()` measures objects? Try base R's `utils::object.size()`:
# Base object.size() reports identical sizes before and after reload.
# Unlike lobstr::obj_size(), object.size() does not measure the contents
# of environments reachable from the object, which is consistent with the
# growth living in an attached environment rather than in the object's
# own components.
object.size(lr_stack_coefs_b)
#> 150360 bytes
object.size(saved_lr_stack_coefs_b)
#> 150360 bytes
Oh—got it. There’s something about `call`’s environment in `coefs` that persists even after axing `call` itself. Can we get away with dropping whatever environment that is at `blend_predictions()` time and moving on?
Created on 2022-04-21 by the reprex package (v2.0.1)