DavZim/hash_example.md

## hash_example.md

      
    Raw
  

              hash_example.md
            
          
    library(tidymodels)
#> ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.0 ──
#> ✓ broom     0.5.5     ✓ recipes   0.1.9
#> ✓ dials     0.0.4     ✓ rsample   0.0.5
#> ✓ dplyr     0.8.5     ✓ tibble    2.1.3
#> ✓ ggplot2   3.3.0     ✓ tune      0.0.1
#> ✓ infer     0.5.1     ✓ workflows 0.1.1
#> ✓ parsnip   0.0.5     ✓ yardstick 0.0.6
#> ✓ purrr     0.3.3
#> ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard()    masks scales::discard()
#> x dplyr::filter()     masks stats::filter()
#> x dplyr::lag()        masks stats::lag()
#> x ggplot2::margin()   masks dials::margin()
#> x recipes::step()     masks stats::step()
#> x recipes::yj_trans() masks scales::yj_trans()
Setup and train models

# Recipe: model mpg as a function of every other column in mtcars.
rec <- recipe(mpg ~ ., data = mtcars)
# Random forest specification (no arguments set) using the "ranger" engine.
mdl <- set_engine(rand_forest(), "ranger")

# Combine the recipe and the model specification into one workflow.
wflow <- add_model(add_recipe(workflow(), rec), mdl)

# Fit the identical workflow twice; the seed is reset to the same value
# immediately before each fit so both start from the same RNG state.
set.seed(123)
model1 <- fit(wflow, mtcars)

set.seed(123)
model2 <- fit(wflow, mtcars)

## Iterate over a list and hash each list item
# crc32 is only used as it produces a short hash
#
# x:    a list (or vector); every element is hashed on its own.
# algo: hashing algorithm name forwarded to digest::digest().
# Returns a character vector with one hash per element of `x`, carrying the
# names of `x` if it has any.
list_hash <- function(x, algo = "crc32") {
  # vapply() pins the result type: unlike sapply(), an empty `x` yields
  # character(0) rather than list(), and a non-scalar hash would error.
  vapply(x, digest::digest, character(1), algo = algo)
}
Check first level model

Comparing the hashes of the two fits, we see that only model$fit differs.
# Hash every top-level element of both fitted workflows, side by side.
lapply(list(model1 = model1, model2 = model2), list_hash)
#> $model1
#>        pre        fit       post    trained 
#> "70872185" "ad3ca3be" "25ebfb78" "a49c55d7" 
#> 
#> $model2
#>        pre        fit       post    trained 
#> "70872185" "f30d6fdf" "25ebfb78" "a49c55d7"
Check second level (model$fit)

Comparing the hashes of the two fits, we see that only model$fit$fit differs.
# Hash every element of the `fit` stage of both fitted workflows.
lapply(list(model1 = model1$fit, model2 = model2$fit), list_hash)
#> $model1
#>    actions        fit 
#> "25b54439" "13ce6ea7" 
#> 
#> $model2
#>    actions        fit 
#> "25b54439" "11cc2d41"
Check third level (model$fit$fit)

Comparing the hashes of the two fits, we see that only model$fit$fit$elapsed differs.
# Hash every element of the fitted model object inside the `fit` stage.
lapply(list(model1 = model1$fit$fit, model2 = model2$fit$fit), list_hash)
#> $model1
#>        lvl       spec        fit    preproc    elapsed 
#> "7b410007" "627da451" "e9524435" "9303470a" "060e0968" 
#> 
#> $model2
#>        lvl       spec        fit    preproc    elapsed 
#> "7b410007" "627da451" "e9524435" "9303470a" "c30f17f1"
Possible/simple solution

## Hash a fitted model while ignoring its recorded elapsed time
# crc32 is only used as it produces a short hash
#
# x:    object with `pre`, `fit`, `post` and `trained` elements.
# algo: hashing algorithm name passed on to digest::digest().
# Returns the hash as returned by digest::digest().
hash_model <- function(x, algo = "crc32") {
  # Overwrite the timing entry so it cannot influence the hash.
  x$fit$fit$elapsed <- NA
  # The four parts are hashed together as one unnamed list, in this order.
  digest::digest(list(x$pre, x$fit, x$post, x$trained), algo = algo)
}

# Both fits now yield the same hash because the elapsed time is ignored
hash_model(model1)
#> [1] "0c35cd41"
hash_model(model2)
#> [1] "0c35cd41"
<sup>Created on 2020-03-19 by the reprex package (v0.3.0)</sup>
Addendum

When a workflow is saved and loaded again, the add_step() calls (which are stored as quosures) receive a new environment internally. As a result, the hash of the model changes even though the model itself does not.
An updated version of the hash_model function would look like this:
# Hash a fitted workflow, ignoring the elapsed time and replacing the recipe
# step lists by their labels before hashing.
#
# x:    fitted workflow-like object (nested list with `pre`, `fit`, `post`).
# algo: hashing algorithm name passed on to digest::digest().
# Returns the hash of the modified object as returned by digest::digest().
#
# NOTE(review): `as_label` is looked up on the search path; presumably it is
# rlang's as_label() made available by an attached package -- confirm.
hash_model <- function(x, algo = "crc32") {
  # Replace each element of a step list by its label.
  labels_of <- function(steps) lapply(steps, as_label)

  # The timing entry differs between otherwise identical fits.
  x$fit$fit$elapsed <- NA

  # Pre-processing stage: the recipe action and the blueprint in the mold.
  pre <- x$pre
  pre$actions$recipe$recipe$steps <- labels_of(pre$actions$recipe$recipe$steps)
  pre$mold$blueprint$recipe$steps <- labels_of(pre$mold$blueprint$recipe$steps)
  x$pre <- pre

  # Post-processing stage: the original author was unsure which paths apply
  # here ("post steps not totally clear"), so both are handled as before.
  post <- x$post
  post$actions$recipe$recipe$steps <- labels_of(post$actions$recipe$recipe$steps)
  post$actions$blueprint$recipe$steps <- labels_of(post$actions$blueprint$recipe$steps)
  x$post <- post

  digest::digest(x, algo)
}

# Round-trip a fitted workflow through an RDS file and compare the hash of
# the in-memory object with the hash of the reloaded copy.
# `model1` is used because no object called `model` is defined in this file.
tmp <- tempfile()
saveRDS(model1, tmp)
# Note: this replaces the earlier `model2` with the reloaded copy of `model1`.
model2 <- readRDS(tmp)
# The temporary file is no longer needed once the object has been read back.
unlink(tmp)
hash_model(model1)
hash_model(model2)