Skip to content

Instantly share code, notes, and snippets.

@DavisVaughan
Last active November 2, 2017 01:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DavisVaughan/691e1d9c127705b8a0d7994a9a866234 to your computer and use it in GitHub Desktop.
Save DavisVaughan/691e1d9c127705b8a0d7994a9a866234 to your computer and use it in GitHub Desktop.
suppressPackageStartupMessages(library(tibbletime))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(purrr))
# Create a tibble of event dates, ~100k
# 2013-01-01 to the end of the year by 5 minutes
event_dates <- create_series(~2013, 5~M, force_class = "POSIXct")
event_dates
#> # A time tibble: 105,120 x 1
#> # Index: date
#> date
#> <dttm>
#> 1 2013-01-01 00:00:00
#> 2 2013-01-01 00:05:00
#> 3 2013-01-01 00:10:00
#> 4 2013-01-01 00:15:00
#> 5 2013-01-01 00:20:00
#> 6 2013-01-01 00:25:00
#> 7 2013-01-01 00:30:00
#> 8 2013-01-01 00:35:00
#> 9 2013-01-01 00:40:00
#> 10 2013-01-01 00:45:00
#> # ... with 105,110 more rows
# Create tibbles of start and end dates
# Something like a boundary every 5 days should work fine as an example
boundary_dates <- tibble(
start_dates = create_series(time_formula = ~2013,
period = 5~d,
force_class = "POSIXct",
as_date_vector = TRUE),
end_dates = create_series(time_formula = 2013-01-05 + 23:59:59 ~ 2014-01-01 + 00:00:00,
period = 5~d,
include_end = FALSE,
force_class = "POSIXct",
as_date_vector = TRUE)
)
# Let's also add an index, later we will use this to join by
boundary_dates <- boundary_dates %>%
mutate(idx = 1:nrow(boundary_dates))
boundary_dates
#> # A tibble: 73 x 3
#> start_dates end_dates idx
#> <dttm> <dttm> <int>
#> 1 2013-01-01 2013-01-05 23:59:59 1
#> 2 2013-01-06 2013-01-10 23:59:59 2
#> 3 2013-01-11 2013-01-15 23:59:59 3
#> 4 2013-01-16 2013-01-20 23:59:59 4
#> 5 2013-01-21 2013-01-25 23:59:59 5
#> 6 2013-01-26 2013-01-30 23:59:59 6
#> 7 2013-01-31 2013-02-04 23:59:59 7
#> 8 2013-02-05 2013-02-09 23:59:59 8
#> 9 2013-02-10 2013-02-14 23:59:59 9
#> 10 2013-02-15 2013-02-19 23:59:59 10
#> # ... with 63 more rows
# The goal is, for each start/end boundary row combo, find the event_dates that
# fall in that boundary.
# To do this, we need to iterate over each row. We can do that with transpose
boundary_transposed <- transpose(boundary_dates)
head(boundary_transposed)
#> [[1]]
#> [[1]]$start_dates
#> [1] 1357016400
#>
#> [[1]]$end_dates
#> [1] 1357448399
#>
#> [[1]]$idx
#> [1] 1
#>
#>
#> [[2]]
#> [[2]]$start_dates
#> [1] 1357448400
#>
#> [[2]]$end_dates
#> [1] 1357880399
#>
#> [[2]]$idx
#> [1] 2
#>
#>
#> [[3]]
#> [[3]]$start_dates
#> [1] 1357880400
#>
#> [[3]]$end_dates
#> [1] 1358312399
#>
#> [[3]]$idx
#> [1] 3
#>
#>
#> [[4]]
#> [[4]]$start_dates
#> [1] 1358312400
#>
#> [[4]]$end_dates
#> [1] 1358744399
#>
#> [[4]]$idx
#> [1] 4
#>
#>
#> [[5]]
#> [[5]]$start_dates
#> [1] 1358744400
#>
#> [[5]]$end_dates
#> [1] 1359176399
#>
#> [[5]]$idx
#> [1] 5
#>
#>
#> [[6]]
#> [[6]]$start_dates
#> [1] 1359176400
#>
#> [[6]]$end_dates
#> [1] 1359608399
#>
#> [[6]]$idx
#> [1] 6
# The between function is an easy way to check which dates are between that
# row's start/end boundaries
event_buckets <- map(boundary_transposed, ~between(event_dates$date, .x$start_dates, .x$end_dates))
# To further identify these logical values, assign the name of the list
# (which happens to correspond to the index) to the values that are TRUE in
# that list, then reduce it from multiple lists to just 1 vector that contains
# indexes that will match to the boundaries
event_idx <- imap(event_buckets, ~ifelse(.x == TRUE, .y, .x)) %>%
purrr::reduce(`+`)
head(event_idx)
#> [1] 1 1 1 1 1 1
tail(event_idx)
#> [1] 73 73 73 73 73 73
event_dates <- event_dates %>%
mutate(idx = event_idx)
event_dates
#> # A time tibble: 105,120 x 2
#> # Index: date
#> date idx
#> * <dttm> <int>
#> 1 2013-01-01 00:00:00 1
#> 2 2013-01-01 00:05:00 1
#> 3 2013-01-01 00:10:00 1
#> 4 2013-01-01 00:15:00 1
#> 5 2013-01-01 00:20:00 1
#> 6 2013-01-01 00:25:00 1
#> 7 2013-01-01 00:30:00 1
#> 8 2013-01-01 00:35:00 1
#> 9 2013-01-01 00:40:00 1
#> 10 2013-01-01 00:45:00 1
#> # ... with 105,110 more rows
# Finally, a simple left_join
left_join(event_dates, boundary_dates, by = "idx")
#> # A time tibble: 105,120 x 4
#> # Index: date
#> date idx start_dates end_dates
#> * <dttm> <int> <dttm> <dttm>
#> 1 2013-01-01 00:00:00 1 2013-01-01 2013-01-05 23:59:59
#> 2 2013-01-01 00:05:00 1 2013-01-01 2013-01-05 23:59:59
#> 3 2013-01-01 00:10:00 1 2013-01-01 2013-01-05 23:59:59
#> 4 2013-01-01 00:15:00 1 2013-01-01 2013-01-05 23:59:59
#> 5 2013-01-01 00:20:00 1 2013-01-01 2013-01-05 23:59:59
#> 6 2013-01-01 00:25:00 1 2013-01-01 2013-01-05 23:59:59
#> 7 2013-01-01 00:30:00 1 2013-01-01 2013-01-05 23:59:59
#> 8 2013-01-01 00:35:00 1 2013-01-01 2013-01-05 23:59:59
#> 9 2013-01-01 00:40:00 1 2013-01-01 2013-01-05 23:59:59
#> 10 2013-01-01 00:45:00 1 2013-01-01 2013-01-05 23:59:59
#> # ... with 105,110 more rows
################################################################################
# You can even wrap this up into a nice function
conditional_date_join <- function(x, date_col, start_dates, end_dates) {
boundary_dates <- tibble(start_dates, end_dates)
x_idx <- boundary_dates %>%
transpose() %>%
map(~between(x[[date_col]], .x$start_dates, .x$end_dates)) %>%
imap(~ifelse(.x == TRUE, .y, .x)) %>%
reduce(`+`)
boundary_dates <- boundary_dates %>%
mutate(idx = 1:nrow(boundary_dates))
x %>%
mutate(idx = x_idx) %>%
left_join(boundary_dates, by = "idx") %>%
select(-idx)
}
conditional_date_join(event_dates, "date",
boundary_dates$start_dates,
boundary_dates$end_dates)
#> # A time tibble: 105,120 x 3
#> # Index: date
#> date start_dates end_dates
#> * <dttm> <dttm> <dttm>
#> 1 2013-01-01 00:00:00 2013-01-01 2013-01-05 23:59:59
#> 2 2013-01-01 00:05:00 2013-01-01 2013-01-05 23:59:59
#> 3 2013-01-01 00:10:00 2013-01-01 2013-01-05 23:59:59
#> 4 2013-01-01 00:15:00 2013-01-01 2013-01-05 23:59:59
#> 5 2013-01-01 00:20:00 2013-01-01 2013-01-05 23:59:59
#> 6 2013-01-01 00:25:00 2013-01-01 2013-01-05 23:59:59
#> 7 2013-01-01 00:30:00 2013-01-01 2013-01-05 23:59:59
#> 8 2013-01-01 00:35:00 2013-01-01 2013-01-05 23:59:59
#> 9 2013-01-01 00:40:00 2013-01-01 2013-01-05 23:59:59
#> 10 2013-01-01 00:45:00 2013-01-01 2013-01-05 23:59:59
#> # ... with 105,110 more rows
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment