Skip to content

Instantly share code, notes, and snippets.

@simonpcouch
Last active April 21, 2022 18:23
Show Gist options
  • Save simonpcouch/e6c3aea7eccc68d101a1ee40c27ce622 to your computer and use it in GitHub Desktop.
Save simonpcouch/e6c3aea7eccc68d101a1ee40c27ce622 to your computer and use it in GitHub Desktop.

An issue was recently filed in stacks about the object size of a stack increasing on save and reload.

Starting out with a quick reprex:

library(tidymodels)
library(modeldata)
library(readr)
#> 
#> Attaching package: 'readr'
#> The following object is masked from 'package:yardstick':
#> 
#>     spec
#> The following object is masked from 'package:scales':
#> 
#>     col_factor
library(lobstr)
library(stacks)
library(butcher)

# Load the lending_club data set into the session.
data("lending_club")

# Seed the RNG so the random steps that follow (starting with the subsample
# below) are reproducible, then keep 1,000 rows to make the example quick.
set.seed(1)
lending_club <- sample_n(lending_club, 1000)

# 5-fold cross-validation resamples of the subsample, used for tuning below.
folds <- vfold_cv(lending_club, v = 5)

# Tune a glmnet linear regression (penalty and mixture both marked with
# tune()) over the folds, using grid = 4.
# control_stack_grid() comes from stacks; presumably it keeps whatever
# add_candidates() needs from the tuning results -- confirm in the stacks docs.
lr_mod <- 
  linear_reg(penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet") %>%
  workflow(
    preprocessor = funded_amnt ~ int_rate + total_bal_il,
    # `.` is the magrittr placeholder: the piped model spec is supplied here
    # rather than as the first argument of workflow().
    spec = .
  ) %>%
  tune_grid(
    resamples = folds,
    control = control_stack_grid(),
    grid = 4
  )

# Build the model stack from the tuning results: add them as candidates,
# blend their predictions, then fit the members.
# blend_predictions() is the step that creates the stack's $coefs element
# examined in the rest of this reprex.
lr_stack <- stacks() %>%
  add_candidates(lr_mod) %>%
  blend_predictions() %>%
  fit_members()

# Round-trip the stack through an .Rds file in the working directory ...
write_rds(lr_stack, file = "saved_mod.Rds")

saved_lr_stack <- read_rds("saved_mod.Rds")

# ... and compare the in-memory sizes: the reloaded copy is reported as
# roughly ten times larger than the original.
obj_sizes(lr_stack, saved_lr_stack)
#> *  2,775,256 B
#> * 26,344,704 B

# Size ratio of the reloaded stack to the original. obj_size() returns a
# byte-classed value and the quotient keeps that class, so it would print with
# a misleading "B" unit; as.numeric() drops the class so the unitless ratio
# prints as a plain number.
as.numeric(obj_size(saved_lr_stack) / obj_size(lr_stack))
#> [1] 9.529476

Looking a bit further, it’s just the coefs object that’s increasing in size. In a model stack, $coefs is a parsnip model fit of a linear_reg using the glmnet engine.

# butcher::weigh() lists an object's components with their sizes, largest
# first (sizes appear to be in MB -- confirm against the butcher docs).
weigh(lr_stack)
#> # A tibble: 374 × 2
#>    object                                                              size
#>    <chr>                                                              <dbl>
#>  1 coefs.preproc.terms                                               2.64  
#>  2 coefs.fit.call                                                    2.64  
#>  3 coefs.spec.eng_args.lower.limits                                  2.64  
#>  4 coefs.spec.method.fit.args.lower.limits                           2.64  
#>  5 coefs.spec.method.pred.numeric.post                               1.76  
#>  6 member_fits.lr_mod_1_3.fit.fit.spec.method.pred.numeric.post      1.76  
#>  7 member_fits.lr_mod_1_1.fit.fit.spec.method.pred.numeric.post      1.76  
#>  8 member_fits.lr_mod_1_3.pre.actions.formula.blueprint.mold.process 0.0172
#>  9 member_fits.lr_mod_1_3.pre.mold.blueprint.mold.process            0.0172
#> 10 member_fits.lr_mod_1_1.pre.actions.formula.blueprint.mold.process 0.0172
#> # … with 364 more rows
# Same breakdown for the reloaded stack: the four largest coefs.* entries grow
# from 2.64 to 24.7, while the remaining entries shown barely change.
weigh(saved_lr_stack)
#> # A tibble: 374 × 2
#>    object                                                               size
#>    <chr>                                                               <dbl>
#>  1 coefs.preproc.terms                                               24.7   
#>  2 coefs.fit.call                                                    24.7   
#>  3 coefs.spec.eng_args.lower.limits                                  24.7   
#>  4 coefs.spec.method.fit.args.lower.limits                           24.7   
#>  5 coefs.spec.method.pred.numeric.post                                1.79  
#>  6 member_fits.lr_mod_1_3.fit.fit.spec.method.pred.numeric.post       1.79  
#>  7 member_fits.lr_mod_1_1.fit.fit.spec.method.pred.numeric.post       1.79  
#>  8 member_fits.lr_mod_1_3.pre.actions.formula.blueprint.mold.process  0.0172
#>  9 member_fits.lr_mod_1_3.pre.mold.blueprint.mold.process             0.0172
#> 10 member_fits.lr_mod_1_1.pre.actions.formula.blueprint.mold.process  0.0172
#> # … with 364 more rows

Does the same thing happen if we just pull out the coefs and save it independently?

# Isolate the $coefs element to check whether it shows the size increase on
# its own, outside the enclosing stack.
lr_stack_coefs <- lr_stack$coefs

class(lr_stack_coefs)
#> [1] "_elnet"    "model_fit"

# Same round trip as for the full stack, now on the coefs object only.
# Note: this reuses (and so overwrites) saved_mod.Rds.
write_rds(lr_stack_coefs, file = "saved_mod.Rds")

saved_lr_stack_coefs <- read_rds("saved_mod.Rds")

# The reloaded coefs object is again reported as far larger than the original.
obj_sizes(lr_stack_coefs, saved_lr_stack_coefs)
#> *  2,635,264 B
#> * 24,601,432 B

# Size ratio of the reloaded coefs object to the original. The quotient of two
# obj_size() results keeps the byte class and would print with a misleading
# "B" unit; as.numeric() drops the class so the unitless ratio prints as a
# plain number.
as.numeric(obj_size(saved_lr_stack_coefs) / obj_size(lr_stack_coefs))
#> [1] 9.37316

The same size increase happens, and we see that most of it is due to the call:

# Component-by-component sizes of the coefs object before saving: `call`
# dominates, everything else is tiny by comparison.
weigh(lr_stack_coefs)
#> # A tibble: 12 × 2
#>    object        size
#>    <chr>        <dbl>
#>  1 call      2.64    
#>  2 beta      0.0111  
#>  3 a0        0.00741 
#>  4 lambda    0.000848
#>  5 dev.ratio 0.000848
#>  6 df        0.000448
#>  7 dim       0.000056
#>  8 nulldev   0.000056
#>  9 npasses   0.000056
#> 10 jerr      0.000056
#> 11 offset    0.000056
#> 12 nobs      0.000056
# After the round trip `call` is the only entry that changes (2.64 -> 24.7).
weigh(saved_lr_stack_coefs)
#> # A tibble: 12 × 2
#>    object         size
#>    <chr>         <dbl>
#>  1 call      24.7     
#>  2 beta       0.0111  
#>  3 a0         0.00741 
#>  4 lambda     0.000848
#>  5 dev.ratio  0.000848
#>  6 df         0.000448
#>  7 dim        0.000056
#>  8 nulldev    0.000056
#>  9 npasses    0.000056
#> 10 jerr       0.000056
#> 11 offset     0.000056
#> 12 nobs       0.000056

Is this just a parsnip + glmnet thing?

# Control case: fit a glmnet linear_reg directly through parsnip, with no
# stacks involved, and run it through the same save/reload comparison.
# NOTE(review): penalty is left as tune() rather than a concrete value; the
# fit evidently ran in this session, but verify it still does with the
# parsnip version in use.
mod_fit <-
  linear_reg(penalty = tune()) %>%
  set_engine("glmnet") %>%
  fit(funded_amnt ~ int_rate + total_bal_il, data = lending_club)

write_rds(mod_fit, file = "saved_mod.Rds")

saved_mod_fit <- read_rds("saved_mod.Rds")

# No growth here: the reloaded fit is reported as slightly smaller.
obj_sizes(mod_fit, saved_mod_fit)
#> * 1,782,240 B
#> * 1,743,872 B

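Nope: a plain parsnip + glmnet fit does not grow on save and reload (it even comes back slightly smaller). Hmm.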

Does the object continue growing in size with each save?

# Round-trip the coefs object twice, using separate files, to see whether the
# size keeps growing with every save/reload cycle.
write_rds(lr_stack_coefs, file = "saved_mod_1.Rds")

lr_stack_coefs_1 <- read_rds("saved_mod_1.Rds")

write_rds(lr_stack_coefs_1, file = "saved_mod_2.Rds")

lr_stack_coefs_2 <- read_rds("saved_mod_2.Rds")

# The size jumps on the first reload and is unchanged by the second.
obj_sizes(lr_stack_coefs, lr_stack_coefs_1, lr_stack_coefs_2)
#> *  2,635,264 B
#> * 24,601,432 B
#> * 24,601,432 B

What if we just use butcher to axe the call object?

# Butcher the coefs object (the aim here is to drop the large `call` element)
# and repeat the save/reload comparison on the butchered copy.
lr_stack_coefs_b <- butcher(lr_stack_coefs)

write_rds(lr_stack_coefs_b, file = "saved_mod.Rds")

saved_lr_stack_coefs_b <- read_rds("saved_mod.Rds")

# Butchering does not help: the sizes reported by obj_sizes() are essentially
# the same as for the unbutchered object, before and after the round trip.
obj_sizes(lr_stack_coefs_b, saved_lr_stack_coefs_b)
#> *  2,636,472 B
#> * 24,599,936 B

# Size ratio of the reloaded butchered object to the original. The quotient of
# two obj_size() results keeps the byte class and would print with a
# misleading "B" unit; as.numeric() drops the class so the unitless ratio
# prints as a plain number.
as.numeric(obj_size(saved_lr_stack_coefs_b) / obj_size(lr_stack_coefs_b))
#> [1] 9.368441

The butcher::weigh results for the butchered object are identical before and after saving, even though the object sizes reported by obj_sizes are very different.

# Component sizes of the butchered object before saving: `call` has shrunk to
# 0.000112 and no single component is large any more.
weigh(lr_stack_coefs_b)
#> # A tibble: 12 × 2
#>    object        size
#>    <chr>        <dbl>
#>  1 beta      0.0111  
#>  2 a0        0.00741 
#>  3 lambda    0.000848
#>  4 dev.ratio 0.000848
#>  5 df        0.000448
#>  6 call      0.000112
#>  7 dim       0.000056
#>  8 nulldev   0.000056
#>  9 npasses   0.000056
#> 10 jerr      0.000056
#> 11 offset    0.000056
#> 12 nobs      0.000056
# The reloaded butchered object gives exactly the same component sizes, so the
# growth reported by obj_sizes() is not attributed to any listed component.
weigh(saved_lr_stack_coefs_b)
#> # A tibble: 12 × 2
#>    object        size
#>    <chr>        <dbl>
#>  1 beta      0.0111  
#>  2 a0        0.00741 
#>  3 lambda    0.000848
#>  4 dev.ratio 0.000848
#>  5 df        0.000448
#>  6 call      0.000112
#>  7 dim       0.000056
#>  8 nulldev   0.000056
#>  9 npasses   0.000056
#> 10 jerr      0.000056
#> 11 offset    0.000056
#> 12 nobs      0.000056

Is this just a peculiarity of how lobstr::obj_size measures size? Compare with base R’s object.size:

# Cross-check with base R's object.size(), which reports identical sizes for
# the butchered object before and after the round trip.
# NOTE(review): the object.size() help page says associated space such as
# environments is not included, which would explain the disagreement with
# lobstr::obj_size() -- confirm against both help pages.
object.size(lr_stack_coefs_b)
#> 150360 bytes
object.size(saved_lr_stack_coefs_b)
#> 150360 bytes

Oh—got it.

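The two measures disagree because object.size does not count the environments attached to an object, while obj_size does. So there’s something about call’s environment in coefs that persists after axing call itself, and it is that environment that comes back larger after a save and reload. Can we get away with dropping whatever environment that is at blend_predictions and moving on?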

Created on 2022-04-21 by the reprex package (v2.0.1)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment