Skip to content

Instantly share code, notes, and snippets.

@DavZim
Last active April 22, 2020 13:03
Show Gist options
  • Save DavZim/0aef21c47721b7a703d85b0f164b34c1 to your computer and use it in GitHub Desktop.
Save DavZim/0aef21c47721b7a703d85b0f164b34c1 to your computer and use it in GitHub Desktop.
Hashing tidymodels workflows
library(tidymodels)
#> ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.0 ──
#> ✓ broom     0.5.5     ✓ recipes   0.1.9
#> ✓ dials     0.0.4     ✓ rsample   0.0.5
#> ✓ dplyr     0.8.5     ✓ tibble    2.1.3
#> ✓ ggplot2   3.3.0     ✓ tune      0.0.1
#> ✓ infer     0.5.1     ✓ workflows 0.1.1
#> ✓ parsnip   0.0.5     ✓ yardstick 0.0.6
#> ✓ purrr     0.3.3
#> ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard()    masks scales::discard()
#> x dplyr::filter()     masks stats::filter()
#> x dplyr::lag()        masks stats::lag()
#> x ggplot2::margin()   masks dials::margin()
#> x recipes::step()     masks stats::step()
#> x recipes::yj_trans() masks scales::yj_trans()

Setup and train models

# Preprocessing: model mpg on all remaining mtcars columns (no steps added).
rec <- recipe(mpg ~ ., data = mtcars)
# Model specification: random forest using the "ranger" engine.
mdl <- rand_forest() %>% 
  set_engine("ranger")

# Bundle the recipe and the model specification into a single workflow.
wflow <- workflow() %>% 
  add_recipe(rec) %>% 
  add_model(mdl)

# Fit the identical workflow twice on the same data, resetting the seed to the
# same value before each fit; model1 and model2 are the objects whose hashes
# are compared below.
set.seed(123)
model1 <- fit(wflow, mtcars)

set.seed(123)
model2 <- fit(wflow, mtcars)

## Iterate over a list and hash each list item
# crc32 is only used as it produces a short hash
#
# x:    a list (or vector) whose elements are hashed one by one
# algo: hashing algorithm, passed on to digest::digest()
#
# Returns a character vector with one hash per element of x, carrying the
# names of x. vapply() is used instead of sapply() so the result is always a
# character vector, including character(0) for empty input (sapply() would
# return an empty list there).
list_hash <- function(x, algo = "crc32") {
  vapply(x, digest::digest, FUN.VALUE = character(1), algo = algo)
}

Check first level model

comparing the different hashes, we see that only model$fit changes

list(
  model1 = list_hash(model1),
  model2 = list_hash(model2)
)
#> $model1
#>        pre        fit       post    trained 
#> "70872185" "ad3ca3be" "25ebfb78" "a49c55d7" 
#> 
#> $model2
#>        pre        fit       post    trained 
#> "70872185" "f30d6fdf" "25ebfb78" "a49c55d7"

Check second level (model$fit)

comparing the different hashes, we see that only model$fit$fit changes

list(
  model1 = list_hash(model1$fit),
  model2 = list_hash(model2$fit)
)
#> $model1
#>    actions        fit 
#> "25b54439" "13ce6ea7" 
#> 
#> $model2
#>    actions        fit 
#> "25b54439" "11cc2d41"

Check third level (model$fit$fit)

comparing the different hashes, we see that only model$fit$fit$elapsed changes

list(
  model1 = list_hash(model1$fit$fit),
  model2 = list_hash(model2$fit$fit)
)
#> $model1
#>        lvl       spec        fit    preproc    elapsed 
#> "7b410007" "627da451" "e9524435" "9303470a" "060e0968" 
#> 
#> $model2
#>        lvl       spec        fit    preproc    elapsed 
#> "7b410007" "627da451" "e9524435" "9303470a" "c30f17f1"

Possible/simple solution

## Hash a fitted workflow while ignoring how long the fit took
# crc32 is only used as it produces a short hash
#
# x:    a fitted workflow
# algo: hashing algorithm, passed on to digest::digest()
#
# Returns a single hash string computed over the pre, fit, post and trained
# components of x, with the elapsed fitting time blanked out.
hash_model <- function(x, algo = "crc32") {
  # the timing is the only part that differed between the two seeded fits
  x$fit$fit$elapsed <- NA

  stages <- c("pre", "fit", "post", "trained")
  # exact = FALSE keeps the same (partial-matching) lookup that `$` performs
  components <- lapply(stages, function(stage) x[[stage, exact = FALSE]])

  digest::digest(components, algo)
}

# Now the hashes are identical
hash_model(model1)
#> [1] "0c35cd41"
hash_model(model2)
#> [1] "0c35cd41"

Created on 2020-03-19 by the reprex package (v0.3.0)

Addendum

When we save and load a workflow, the selectors passed to the recipe's step_*() calls (stored as quosures) receive a new environment internally, so the hash of the model changes even though the model itself does not.

An updated version of the hash_model function would look like this:

# Hash a fitted workflow so that the hash survives a saveRDS()/readRDS()
# round trip.
#
# x:    a fitted workflow
# algo: hashing algorithm, passed on to digest::digest()
#
# Returns a single hash string. Two things are neutralised before hashing:
#   * the elapsed fitting time, which differs from run to run
#   * quosures stored inside recipe steps, whose environment changes when the
#     object is reloaded; they are replaced by their text label
# Everything else in a step (e.g. its trained values) is kept, so the hash
# still reflects the step contents. Note that hashes therefore differ from
# those of earlier versions of this function, which collapsed each whole step
# into a single label.
hash_model <- function(x, algo = "crc32") {
  x$fit$fit$elapsed <- NA

  # Replace a quosure (or a list of quosures) by its label(s); leave any other
  # value untouched.
  label_quos <- function(el) {
    if (inherits(el, "quosure")) {
      as_label(el)
    } else if (inherits(el, "quosures")) {
      # unclass first so we end up with a plain list of labels
      lapply(unclass(el), as_label)
    } else {
      el
    }
  }

  # Apply label_quos() to every field of every step, keeping each step's class.
  relabel_steps <- function(steps) {
    lapply(steps, function(step) {
      if (!is.list(step)) {
        return(step)
      }
      step_class <- oldClass(step)
      step <- lapply(unclass(step), label_quos)
      oldClass(step) <- step_class
      step
    })
  }

  # Only touch locations that actually exist; assigning into a missing path
  # would add empty structure to the object being hashed.
  if (!is.null(x$pre$actions$recipe$recipe$steps)) {
    x$pre$actions$recipe$recipe$steps <-
      relabel_steps(x$pre$actions$recipe$recipe$steps)
  }
  if (!is.null(x$pre$mold$blueprint$recipe$steps)) {
    x$pre$mold$blueprint$recipe$steps <-
      relabel_steps(x$pre$mold$blueprint$recipe$steps)
  }

  # post steps not totally clear... handled the same way if they are present
  if (!is.null(x$post$actions$recipe$recipe$steps)) {
    x$post$actions$recipe$recipe$steps <-
      relabel_steps(x$post$actions$recipe$recipe$steps)
  }
  if (!is.null(x$post$actions$blueprint$recipe$steps)) {
    x$post$actions$blueprint$recipe$steps <-
      relabel_steps(x$post$actions$blueprint$recipe$steps)
  }

  digest::digest(x, algo)
}

# Round-trip a fitted workflow through saveRDS()/readRDS() and hash both the
# in-memory and the reloaded object; the two hashes should be identical.
# model1 is the workflow fitted in the setup section above (the original
# snippet referred to an undefined object `model`).
tmp <- tempfile()
saveRDS(model1, tmp)
model1_loaded <- readRDS(tmp)
unlink(tmp) # the temporary file is no longer needed
hash_model(model1)
hash_model(model1_loaded)
@DavZim
Copy link
Author

DavZim commented Apr 22, 2020

As a comment to the addendum, here is a script that looks at all differing slots:

##################################################################
# This script compares and highlights differences in lists...
##################################################################

library(tidymodels)
#> ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.0 ──
#> ✓ broom     0.5.5     ✓ recipes   0.1.9
#> ✓ dials     0.0.4     ✓ rsample   0.0.5
#> ✓ dplyr     0.8.5     ✓ tibble    2.1.3
#> ✓ ggplot2   3.3.0     ✓ tune      0.0.1
#> ✓ infer     0.5.1     ✓ workflows 0.1.1
#> ✓ parsnip   0.0.5     ✓ yardstick 0.0.6
#> ✓ purrr     0.3.3
#> ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard()    masks scales::discard()
#> x dplyr::filter()     masks stats::filter()
#> x dplyr::lag()        masks stats::lag()
#> x ggplot2::margin()   masks dials::margin()
#> x recipes::step()     masks stats::step()
#> x recipes::yj_trans() masks scales::yj_trans()

# Preprocessing: model mpg on all remaining mtcars columns and add one step
# (centering of all predictors).
rec <- recipe(mpg ~ ., data = mtcars) %>% 
   step_center(all_predictors())

# Model specification: random forest using the "ranger" engine.
mdl <- rand_forest() %>% 
   set_engine("ranger")

# Bundle the recipe and the model specification into a single workflow.
wflow <- workflow() %>% 
   add_recipe(rec) %>% 
   add_model(mdl)

# Fit the identical workflow three times on the same data, resetting the seed
# to the same value before each fit.
set.seed(123)
model1 <- fit(wflow, mtcars)

set.seed(123)
model2 <- fit(wflow, mtcars)

set.seed(123)
model3 <- fit(wflow, mtcars)

# Write the third fit to a temporary RDS file and read it back, so that a
# reloaded model can be compared with the in-memory ones below.
f <- tempfile()
saveRDS(model3, f)
model3_loaded <- readRDS(f)


#' Hash each element of a list and compare the results in a tibble
#'
#' For every element of `x` (optionally after descending into the same
#' sub-element of each one) the element is hashed as a whole and each of its
#' slots is hashed individually, so that differing slots show up side by side.
#'
#' @param x a list of different elements
#' @param sub a name or an index number of a sublist, can also be a vector or
#'   list (a list allows mixing names and numeric positions). The default `""`
#'   stays at the top level of each element.
#' @param algo the algorithm to use for hashing, passed to `digest::digest()`
#'
#' @return a tibble containing the hashed values.
#'  input - refers to the name of the element of x
#'  input_hash - the hash value of the (sub-)element of x
#'  slot - only if sub was provided: the path that was followed
#'  additional columns - the name of the column refers to the name of the
#'    slot (if the name is 'slot_1' or similar, it means that the first
#'    element had no name), the values are the hashes
#' @export
#'
#' @examples
#' x <- list(
#'   el2 = list(x = 1, lvl2 = list(y = 2, lvl3 = list(z = 3, w = 4))),
#'   el1 = list(x = 1, lvl2 = list(y = 2, lvl3 = list(z = 3, w = 5)))
#' )
#'
#' list_hash(x)
#' list_hash(x, "lvl2")
#' list_hash(x, c("lvl2", "lvl3"))
#'
#' # sub can also be used as a list (allows for numeric addresses where names
#' # are not available)
#' list_hash(x, list(2, "lvl3"))
list_hash <- function(x, sub = "", algo = "crc32") {
  # A single empty string (the default) means "do not descend"; this form is
  # also safe for sub = NULL and for list-valued sub.
  has_sub <- length(sub) > 1 ||
    (length(sub) == 1 && !identical(sub[[1]], ""))

  if (has_sub) {
    # go to the "directory": descend one level per entry of sub
    # (purrr::map() extracts by name for strings and by position for numbers)
    for (s in sub) x <- purrr::map(x, s)

    # render the path as it would be typed: `$name` or `[[index]]`
    path_parts <- vapply(
      sub,
      function(s) if (is.numeric(s)) paste0("[[", s, "]]") else paste0("$", s),
      FUN.VALUE = character(1)
    )
    # strip only a leading "$" so that a path starting with "[[1]]" stays
    # intact; base:: is spelled out because the argument is also called `sub`
    sub_str <- base::sub("^\\$", "", paste(path_parts, collapse = ""))
  }

  empty <- lengths(x) == 0
  if (any(empty)) {
    stop(
      "nothing to hash at the requested location for input(s): ",
      paste(which(empty), collapse = ", "),
      call. = FALSE
    )
  }

  # Hash every slot of one element; slots without a name are called slot_<i>.
  hash_slots <- function(el) {
    slot_names <- names(el)
    if (is.null(slot_names)) slot_names <- rep("", length(el))
    unnamed <- is.na(slot_names) | slot_names == ""
    slot_names[unnamed] <- paste("slot", which(unnamed), sep = "_")
    names(el) <- slot_names
    purrr::map_dfr(el, digest::digest, algo = algo)
  }

  # Hash the elements as a whole *before* building the tibble: inside
  # mutate() a slot column that happens to be called `x` would mask the
  # function argument and the wrong values would be hashed.
  input_hashes <- purrr::map_chr(x, digest::digest, algo = algo)

  res <- dplyr::bind_rows(purrr::map(x, hash_slots), .id = "input")
  res[["input_hash"]] <- unname(input_hashes)

  if (has_sub) {
    res[["slot"]] <- sub_str
    res <- dplyr::select(res, "input", "input_hash", "slot", dplyr::everything())
  } else {
    res <- dplyr::select(res, "input", "input_hash", dplyr::everything())
  }
  res
}



# find the differences between the hashes of the three models
ll <- list(m1 = model1, m2 = model2, m3l = model3_loaded)

list_hash(ll) # pre, fit differ 
#> # A tibble: 3 x 6
#>   input input_hash pre      fit      post     trained 
#>   <chr> <chr>      <chr>    <chr>    <chr>    <chr>   
#> 1 m1    70d8cd04   330b4fc9 b0ac3bb9 25ebfb78 a49c55d7
#> 2 m2    2e79ace2   330b4fc9 e312eafc 25ebfb78 a49c55d7
#> 3 m3l   bbf4c1f9   52f8d429 96b8a2b7 25ebfb78 a49c55d7

# pre ###################################
list_hash(ll, sub = "pre") # actions, mold differ
#> # A tibble: 3 x 5
#>   input input_hash slot  actions  mold    
#>   <chr> <chr>      <chr> <chr>    <chr>   
#> 1 m1    330b4fc9   pre   d45a8688 2f919dff
#> 2 m2    330b4fc9   pre   d45a8688 2f919dff
#> 3 m3l   52f8d429   pre   ffcffb5f 6b262fdc

## pre - actions ###################################
list_hash(ll, sub = c("pre", "actions"))
#> # A tibble: 3 x 4
#>   input input_hash slot        recipe  
#>   <chr> <chr>      <chr>       <chr>   
#> 1 m1    d45a8688   pre$actions 8f61e182
#> 2 m2    d45a8688   pre$actions 8f61e182
#> 3 m3l   ffcffb5f   pre$actions d50989f0
list_hash(ll, sub = c("pre", "actions", "recipe"))
#> # A tibble: 3 x 5
#>   input input_hash slot               recipe   blueprint
#>   <chr> <chr>      <chr>              <chr>    <chr>    
#> 1 m1    8f61e182   pre$actions$recipe d4ecf991 551ba93a 
#> 2 m2    8f61e182   pre$actions$recipe d4ecf991 551ba93a 
#> 3 m3l   d50989f0   pre$actions$recipe b0172194 551ba93a
list_hash(ll, sub = c("pre", "actions", "recipe", "recipe"))
#> # A tibble: 3 x 9
#>   input input_hash slot       var_info term_info steps  template levels retained
#>   <chr> <chr>      <chr>      <chr>    <chr>     <chr>  <chr>    <chr>  <chr>   
#> 1 m1    d4ecf991   pre$actio… e22b2a16 e22b2a16  df267… 2df10b65 7b410… 3ec2d37a
#> 2 m2    d4ecf991   pre$actio… e22b2a16 e22b2a16  df267… 2df10b65 7b410… 3ec2d37a
#> 3 m3l   b0172194   pre$actio… e22b2a16 e22b2a16  6da28… 2df10b65 7b410… 3ec2d37a
list_hash(ll, sub = c("pre", "actions", "recipe", "recipe", "steps"))
#> # A tibble: 3 x 4
#>   input input_hash slot                            slot_1  
#>   <chr> <chr>      <chr>                           <chr>   
#> 1 m1    df2679a3   pre$actions$recipe$recipe$steps ade9a81b
#> 2 m2    df2679a3   pre$actions$recipe$recipe$steps ade9a81b
#> 3 m3l   6da28f62   pre$actions$recipe$recipe$steps cb466d81
list_hash(ll, sub = list("pre", "actions", "recipe", "recipe", "steps", 1))
#> # A tibble: 3 x 10
#>   input input_hash slot          terms  role   trained  means  na_rm skip  id   
#>   <chr> <chr>      <chr>         <chr>  <chr>  <chr>    <chr>  <chr> <chr> <chr>
#> 1 m1    ade9a81b   pre$actions$… 9ae63… 3ec2d… d39b6541 7b410… a49c… d39b… d564…
#> 2 m2    ade9a81b   pre$actions$… 9ae63… 3ec2d… d39b6541 7b410… a49c… d39b… d564…
#> 3 m3l   cb466d81   pre$actions$… b2da1… 3ec2d… d39b6541 7b410… a49c… d39b… d564…
list_hash(ll, sub = list("pre", "actions", "recipe", "recipe", "steps", 1, "terms"))
#> # A tibble: 3 x 4
#>   input input_hash slot                                       slot_1  
#>   <chr> <chr>      <chr>                                      <chr>   
#> 1 m1    9ae636cb   pre$actions$recipe$recipe$steps[[1]]$terms a19dd51f
#> 2 m2    9ae636cb   pre$actions$recipe$recipe$steps[[1]]$terms a19dd51f
#> 3 m3l   b2da122f   pre$actions$recipe$recipe$steps[[1]]$terms 31c59f78
list_hash(ll, sub = list("pre", "actions", "recipe", "recipe", "steps", 1, "terms", 1)) # environments differ
#> # A tibble: 3 x 5
#>   input input_hash slot                                         slot_1   slot_2 
#>   <chr> <chr>      <chr>                                        <chr>    <chr>  
#> 1 m1    a19dd51f   pre$actions$recipe$recipe$steps[[1]]$terms[… ed551ff4 88ef94…
#> 2 m2    a19dd51f   pre$actions$recipe$recipe$steps[[1]]$terms[… ed551ff4 88ef94…
#> 3 m3l   31c59f78   pre$actions$recipe$recipe$steps[[1]]$terms[… ed551ff4 88ef94…
# look at the different values
lapply(ll, function(x) x$pre$actions$recipe$recipe$steps[[1]]$terms[[1]])
#> $m1
#> <quosure>
#> expr: ^all_predictors()
#> env:  0x55908dd43b60
#> 
#> $m2
#> <quosure>
#> expr: ^all_predictors()
#> env:  0x55908dd43b60
#> 
#> $m3l
#> <quosure>
#> expr: ^all_predictors()
#> env:  0x55908f8c2f50

## pre - mold ###################################
list_hash(ll, sub = c("pre", "mold"))
#> # A tibble: 3 x 7
#>   input input_hash slot     predictors outcomes blueprint extras  
#>   <chr> <chr>      <chr>    <chr>      <chr>    <chr>     <chr>   
#> 1 m1    2f919dff   pre$mold 83f86366   7174f84d 00e470fa  6ba767c2
#> 2 m2    2f919dff   pre$mold 83f86366   7174f84d 00e470fa  6ba767c2
#> 3 m3l   6b262fdc   pre$mold 83f86366   7174f84d f1a514f6  6ba767c2
list_hash(ll, sub = c("pre", "mold", "blueprint"))
#> # A tibble: 3 x 11
#>   input input_hash slot  mold  forge intercept allow_novel_lev… ptypes fresh
#>   <chr> <chr>      <chr> <chr> <chr> <chr>     <chr>            <chr>  <chr>
#> 1 m1    00e470fa   pre$… 8e10… 1f4b… d39b6541  d39b6541         edeac… a49c…
#> 2 m2    00e470fa   pre$… 8e10… 1f4b… d39b6541  d39b6541         edeac… a49c…
#> 3 m3l   f1a514f6   pre$… 8e10… 1f4b… d39b6541  d39b6541         edeac… a49c…
#> # … with 2 more variables: recipe <chr>, extra_role_ptypes <chr>
list_hash(ll, sub = c("pre", "mold", "blueprint", "recipe"))
#> # A tibble: 3 x 10
#>   input input_hash slot  var_info term_info steps retained tr_info orig_lvls
#>   <chr> <chr>      <chr> <chr>    <chr>     <chr> <chr>    <chr>   <chr>    
#> 1 m1    cfa3f736   pre$… e22b2a16 16212478  3a06… d39b6541 a07cbb… 1d33562e 
#> 2 m2    cfa3f736   pre$… e22b2a16 16212478  3a06… d39b6541 a07cbb… 1d33562e 
#> 3 m3l   bcdd9a87   pre$… e22b2a16 16212478  686c… d39b6541 a07cbb… 1d33562e 
#> # … with 1 more variable: last_term_info <chr>
list_hash(ll, sub = c("pre", "mold", "blueprint", "recipe", "steps"))
#> # A tibble: 3 x 4
#>   input input_hash slot                            slot_1  
#>   <chr> <chr>      <chr>                           <chr>   
#> 1 m1    3a0638cf   pre$mold$blueprint$recipe$steps acb74ba4
#> 2 m2    3a0638cf   pre$mold$blueprint$recipe$steps acb74ba4
#> 3 m3l   686cf2b4   pre$mold$blueprint$recipe$steps 364efd37
list_hash(ll, sub = list("pre", "mold", "blueprint", "recipe", "steps", 1))
#> # A tibble: 3 x 10
#>   input input_hash slot          terms  role   trained  means  na_rm skip  id   
#>   <chr> <chr>      <chr>         <chr>  <chr>  <chr>    <chr>  <chr> <chr> <chr>
#> 1 m1    acb74ba4   pre$mold$blu… 9ae63… 3ec2d… a49c55d7 d9df3… a49c… d39b… d564…
#> 2 m2    acb74ba4   pre$mold$blu… 9ae63… 3ec2d… a49c55d7 d9df3… a49c… d39b… d564…
#> 3 m3l   364efd37   pre$mold$blu… b2da1… 3ec2d… a49c55d7 d9df3… a49c… d39b… d564…
list_hash(ll, sub = list("pre", "mold", "blueprint", "recipe", "steps", 1, "terms"))
#> # A tibble: 3 x 4
#>   input input_hash slot                                       slot_1  
#>   <chr> <chr>      <chr>                                      <chr>   
#> 1 m1    9ae636cb   pre$mold$blueprint$recipe$steps[[1]]$terms a19dd51f
#> 2 m2    9ae636cb   pre$mold$blueprint$recipe$steps[[1]]$terms a19dd51f
#> 3 m3l   b2da122f   pre$mold$blueprint$recipe$steps[[1]]$terms 31c59f78
list_hash(ll, sub = list("pre", "mold", "blueprint", "recipe", "steps", 1, "terms", 1)) # environments differ
#> # A tibble: 3 x 5
#>   input input_hash slot                                         slot_1   slot_2 
#>   <chr> <chr>      <chr>                                        <chr>    <chr>  
#> 1 m1    a19dd51f   pre$mold$blueprint$recipe$steps[[1]]$terms[… ed551ff4 88ef94…
#> 2 m2    a19dd51f   pre$mold$blueprint$recipe$steps[[1]]$terms[… ed551ff4 88ef94…
#> 3 m3l   31c59f78   pre$mold$blueprint$recipe$steps[[1]]$terms[… ed551ff4 88ef94…
# look at the difference
lapply(ll, function(x) x$pre$mold$blueprint$recipe$steps[[1]]$terms[[1]])
#> $m1
#> <quosure>
#> expr: ^all_predictors()
#> env:  0x55908dd43b60
#> 
#> $m2
#> <quosure>
#> expr: ^all_predictors()
#> env:  0x55908dd43b60
#> 
#> $m3l
#> <quosure>
#> expr: ^all_predictors()
#> env:  0x55908f8c2f50

# fit #######################################
list_hash(ll, sub = "fit")
#> # A tibble: 3 x 5
#>   input input_hash slot  actions  fit     
#>   <chr> <chr>      <chr> <chr>    <chr>   
#> 1 m1    b0ac3bb9   fit   25b54439 8fb19b33
#> 2 m2    e312eafc   fit   25b54439 f4971531
#> 3 m3l   96b8a2b7   fit   25b54439 f08ab542
list_hash(ll, sub = c("fit", "fit"))
#> # A tibble: 3 x 8
#>   input input_hash slot    lvl      spec     fit      preproc  elapsed 
#>   <chr> <chr>      <chr>   <chr>    <chr>    <chr>    <chr>    <chr>   
#> 1 m1    8fb19b33   fit$fit 7b410007 627da451 6bda4a02 9303470a ad86246d
#> 2 m2    f4971531   fit$fit 7b410007 627da451 6bda4a02 9303470a a18ba9b0
#> 3 m3l   f08ab542   fit$fit 7b410007 627da451 6bda4a02 9303470a 6faa7c28
list_hash(ll, sub = c("fit", "fit", "elapsed"))
#> # A tibble: 3 x 8
#>   input input_hash slot          user.self sys.self elapsed user.child sys.child
#>   <chr> <chr>      <chr>         <chr>     <chr>    <chr>   <chr>      <chr>    
#> 1 m1    ad86246d   fit$fit$elap… 22c88f1e  012653a6 14d0c8… 012653a6   012653a6 
#> 2 m2    a18ba9b0   fit$fit$elap… 44c9d436  012653a6 632be6… 012653a6   012653a6 
#> 3 m3l   6faa7c28   fit$fit$elap… cbd14eed  88065d10 632be6… 012653a6   012653a6
# look at the differences
lapply(ll, function(x) x$fit$fit$elapsed)
#> $m1
#>    user  system elapsed 
#>   0.019   0.000   0.017 
#> 
#> $m2
#>    user  system elapsed 
#>   0.015   0.000   0.013 
#> 
#> $m3l
#>    user  system elapsed 
#>   0.012   0.004   0.013

Created on 2020-04-22 by the reprex package (v0.3.0)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment