Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
@DavisVaughan

This comment has been minimized.

Copy link
Owner Author

@DavisVaughan DavisVaughan commented Jul 16, 2020

suppressPackageStartupMessages({
  library(recipes)
  library(rsample)
  library(tidyverse)
  library(modeldata)
})

# Load the `drinks` data and fix the RNG seed so that the random `x2` column
# created below is reproducible.
data("drinks")
set.seed(123)

# //////////////////////////////////////////////////////////////////////////////

# 1 year of data: the last 12 rows, with `S4248SM144NCEN` renamed to `x1`
# and `x2` added as a random permutation of the row numbers.
drinks_subset <- drinks %>%
  rename(x1 = S4248SM144NCEN) %>%
  tail(n = 12) %>%
  mutate(x2 = sample.int(n()))

# Time-ordered split with no overlap between the two sets: the first 2/3 of
# the rows go to training and the remaining rows go to testing.
splits_no_overlap <- initial_time_split(data = drinks_subset, prop = 2 / 3)

training(splits_no_overlap)
#> # A tibble: 8 x 3
#>   date          x1    x2
#>   <date>     <dbl> <int>
#> 1 2016-10-01 11914     3
#> 2 2016-11-01 13025    12
#> 3 2016-12-01 14431    10
#> 4 2017-01-01  9049     2
#> 5 2017-02-01 10458     6
#> 6 2017-03-01 12489    11
#> 7 2017-04-01 11499     5
#> 8 2017-05-01 13553     4
testing(splits_no_overlap)
#> # A tibble: 4 x 3
#>   date          x1    x2
#>   <date>     <dbl> <int>
#> 1 2017-06-01 14740     9
#> 2 2017-07-01 11424     8
#> 3 2017-08-01 13412     1
#> 4 2017-09-01 11917     7

# Recipe with a single step: a lag of 3 rows on `x1`
rec_lag <- recipe(~ ., data = training(splits_no_overlap))
rec_lag <- step_lag(rec_lag, x1, lag = 3)

# The data was split into training/testing before the lag was computed, so
# the earlier rows that the lag needs at test time are not part of the
# testing set. The 3 month lag of 2017-06-01 is known (it is the 2017-03-01
# value, 12489), but that row now lives only in the separated out training
# set, which leaves `NA` in the first 3 rows below.
rec_lag %>%
  prep() %>%
  bake(new_data = testing(splits_no_overlap))
#> # A tibble: 4 x 4
#>   date          x1    x2 lag_3_x1
#>   <date>     <dbl> <int>    <dbl>
#> 1 2017-06-01 14740     9       NA
#> 2 2017-07-01 11424     8       NA
#> 3 2017-08-01 13412     1       NA
#> 4 2017-09-01 11917     7    14740

# //////////////////////////////////////////////////////////////////////////////

# Matt's solution: ask the split for a `lag` of 3, which puts the 3 rows
# preceding the test period into the testing set as well.
splits_with_overlap <- initial_time_split(
  data = drinks_subset,
  prop = 2 / 3,
  lag = 3
)

# Data leakage !!! The rows from [2017-03-01, 2017-05-01] are in both sets.
training(splits_with_overlap)
#> # A tibble: 8 x 3
#>   date          x1    x2
#>   <date>     <dbl> <int>
#> 1 2016-10-01 11914     3
#> 2 2016-11-01 13025    12
#> 3 2016-12-01 14431    10
#> 4 2017-01-01  9049     2
#> 5 2017-02-01 10458     6
#> 6 2017-03-01 12489    11
#> 7 2017-04-01 11499     5
#> 8 2017-05-01 13553     4
testing(splits_with_overlap)
#> # A tibble: 7 x 3
#>   date          x1    x2
#>   <date>     <dbl> <int>
#> 1 2017-03-01 12489    11
#> 2 2017-04-01 11499     5
#> 3 2017-05-01 13553     4
#> 4 2017-06-01 14740     9
#> 5 2017-07-01 11424     8
#> 6 2017-08-01 13412     1
#> 7 2017-09-01 11917     7

# Recipe with a 3-row lag of `x1`, specified on the overlapping split
rec_lag2 <- step_lag(
  recipe(~ ., data = training(splits_with_overlap)),
  x1,
  lag = 3
)

# The testing set now starts 3 rows earlier, so the lag is filled in from
# 2017-06-01 onwards; only the 3 overlapping rows are left with `NA`.
rec_lag2 %>%
  prep() %>%
  bake(new_data = testing(splits_with_overlap))
#> # A tibble: 7 x 4
#>   date          x1    x2 lag_3_x1
#>   <date>     <dbl> <int>    <dbl>
#> 1 2017-03-01 12489    11       NA
#> 2 2017-04-01 11499     5       NA
#> 3 2017-05-01 13553     4       NA
#> 4 2017-06-01 14740     9    12489
#> 5 2017-07-01 11424     8    11499
#> 6 2017-08-01 13412     1    13553
#> 7 2017-09-01 11917     7    14740

# !!!!!!!!
# Technically this fixes the issue. You can compute the 3 month lag
# for 2017-06-01 because `x1` for 2017-03-01 is in the testing data now.
# HOWEVER:
# - You now have 3 extra rows duplicated between training and testing
# - My big issue is that this affects more than just the x1 column. ALL
#   columns have a data leakage issue.
# - The "right" thing to do now would be to remove these rows in the recipe
#   after doing the lag, but I don't trust users to do that!
# - If you don't remove the rows, you end up training the model with data
#   for 2017-03-01, AND predicting the outcome for 2017-03-01. Bad!
# !!!!!!!

# //////////////////////////////////////////////////////////////////////////////

# Davis' solution
# Pre-lag before the recipe, i.e. compute the lagged column on the whole
# series and only split afterwards.
#
# `dplyr::lag()` is namespace-qualified on purpose. A bare `lag()` only works
# while dplyr masks `stats::lag()` on the search path; `stats::lag()` changes
# the time base of a series rather than shifting the values of a plain
# vector, so picking it up by accident would silently produce an unlagged
# column.
drinks_subset_pre_lag <- drinks_subset %>%
  mutate(x1_lag_3_month = dplyr::lag(x1, n = 3))

splits_pre_lag <- initial_time_split(drinks_subset_pre_lag, prop = 2 / 3)

# No data leakage and no extra rows in `testing()` to worry about
training(splits_pre_lag)
#> # A tibble: 8 x 4
#>   date          x1    x2 x1_lag_3_month
#>   <date>     <dbl> <int>          <dbl>
#> 1 2016-10-01 11914     3             NA
#> 2 2016-11-01 13025    12             NA
#> 3 2016-12-01 14431    10             NA
#> 4 2017-01-01  9049     2          11914
#> 5 2017-02-01 10458     6          13025
#> 6 2017-03-01 12489    11          14431
#> 7 2017-04-01 11499     5           9049
#> 8 2017-05-01 13553     4          10458
testing(splits_pre_lag)
#> # A tibble: 4 x 4
#>   date          x1    x2 x1_lag_3_month
#>   <date>     <dbl> <int>          <dbl>
#> 1 2017-06-01 14740     9          12489
#> 2 2017-07-01 11424     8          11499
#> 3 2017-08-01 13412     1          13553
#> 4 2017-09-01 11917     7          14740

# The recipe has no steps: the lagged column is already part of the data,
# so baking returns the testing rows as they are.
rec3 <- recipe(~ ., data = training(splits_pre_lag))

rec3 %>%
  prep() %>%
  bake(new_data = testing(splits_pre_lag))
#> # A tibble: 4 x 4
#>   date          x1    x2 x1_lag_3_month
#>   <date>     <dbl> <int>          <dbl>
#> 1 2017-06-01 14740     9          12489
#> 2 2017-07-01 11424     8          11499
#> 3 2017-08-01 13412     1          13553
#> 4 2017-09-01 11917     7          14740

# //////////////////////////////////////////////////////////////////////////////

# I admit that it is not ideal to have to do this outside the recipe, but
# I think that it is completely reasonable to do so. I don't think that
# pre-computing the lagged columns outside the recipe would cause any issues.

# In production, I would imagine that when "new data" arrives and you want
# to make predictions on it, the data set that you would pass to `bake()`
# would already have those lagged variables attached to it. This seems
# completely reasonable to me.

# Additionally, and maybe more importantly, this really only causes issues
# with very small testing sets. Even if you don't do the pre-lag thing I suggest
# here, with more data in the testing set you will quickly be able to compute
# the lagged variables you care about using only the data available to you in
# the testing set. With enough data, any rows for which you can't compute
# lagged variables could be removed with very little impact on the overall
# model. For example:

# All of drinks: same renaming and random `x2` column as for the 12-row
# subset, but without dropping any rows.
drinks_full <- rename(drinks, x1 = S4248SM144NCEN)
drinks_full <- mutate(drinks_full, x2 = sample.int(n()))

splits_full <- initial_time_split(data = drinks_full, prop = 2 / 3)

training(splits_full)
#> # A tibble: 206 x 3
#>    date          x1    x2
#>    <date>     <dbl> <int>
#>  1 1992-01-01  3459    26
#>  2 1992-02-01  3458     7
#>  3 1992-03-01  4002   137
#>  4 1992-04-01  4564   254
#>  5 1992-05-01  4221   211
#>  6 1992-06-01  4529    78
#>  7 1992-07-01  4466    81
#>  8 1992-08-01  4137    43
#>  9 1992-09-01  4126   143
#> 10 1992-10-01  4259    32
#> # … with 196 more rows
testing(splits_full)
#> # A tibble: 103 x 3
#>    date          x1    x2
#>    <date>     <dbl> <int>
#>  1 2009-03-01  8688   149
#>  2 2009-04-01  9162    66
#>  3 2009-05-01  9369   269
#>  4 2009-06-01 10167   266
#>  5 2009-07-01  9507    97
#>  6 2009-08-01  8923   242
#>  7 2009-09-01  9272   251
#>  8 2009-10-01  9075     8
#>  9 2009-11-01  8949   208
#> 10 2009-12-01 10843   185
#> # … with 93 more rows

# This time the lag is computed inside the recipe
rec_full <- recipe(~ ., data = training(splits_full))
rec_full <- step_lag(rec_full, x1, lag = 3)

# The first 3 testing rows get `NA` even though "technically" the lagged
# value was known. With 103 testing rows, throwing those 3 out wouldn't be
# so bad.
rec_full %>%
  prep() %>%
  bake(new_data = testing(splits_full))
#> # A tibble: 103 x 4
#>    date          x1    x2 lag_3_x1
#>    <date>     <dbl> <int>    <dbl>
#>  1 2009-03-01  8688   149       NA
#>  2 2009-04-01  9162    66       NA
#>  3 2009-05-01  9369   269       NA
#>  4 2009-06-01 10167   266     8688
#>  5 2009-07-01  9507    97     9162
#>  6 2009-08-01  8923   242     9369
#>  7 2009-09-01  9272   251    10167
#>  8 2009-10-01  9075     8     9507
#>  9 2009-11-01  8949   208     8923
#> 10 2009-12-01 10843   185     9272
#> # … with 93 more rows

# //////////////////////////////////////////////////////////////////////////////

# Overall, I would not add this argument for three reasons:
# - I think the dangers of data leakage outweigh any benefits.
# - It is possible to do this correctly by pre-computing the lag outside
#   the recipe.
# - With medium to large data sets, even if you don't pre-compute the 
#   lag and have to throw out some data, it really isn't an issue.

Created on 2020-07-16 by the reprex package (v0.3.0)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment