Last active
November 2, 2017 01:16
-
-
Save DavisVaughan/691e1d9c127705b8a0d7994a9a866234 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
suppressPackageStartupMessages(library(tibbletime)) | |
suppressPackageStartupMessages(library(dplyr)) | |
suppressPackageStartupMessages(library(purrr)) | |
# Create a tibble of event dates, ~100k | |
# 2013-01-01 to the end of the year by 5 minutes | |
event_dates <- create_series(~2013, 5~M, force_class = "POSIXct") | |
event_dates | |
#> # A time tibble: 105,120 x 1 | |
#> # Index: date | |
#> date | |
#> <dttm> | |
#> 1 2013-01-01 00:00:00 | |
#> 2 2013-01-01 00:05:00 | |
#> 3 2013-01-01 00:10:00 | |
#> 4 2013-01-01 00:15:00 | |
#> 5 2013-01-01 00:20:00 | |
#> 6 2013-01-01 00:25:00 | |
#> 7 2013-01-01 00:30:00 | |
#> 8 2013-01-01 00:35:00 | |
#> 9 2013-01-01 00:40:00 | |
#> 10 2013-01-01 00:45:00 | |
#> # ... with 105,110 more rows | |
# Create tibbles of start and end dates | |
# Something like a boundary every 5 days should work fine as an example | |
boundary_dates <- tibble( | |
start_dates = create_series(time_formula = ~2013, | |
period = 5~d, | |
force_class = "POSIXct", | |
as_date_vector = TRUE), | |
end_dates = create_series(time_formula = 2013-01-05 + 23:59:59 ~ 2014-01-01 + 00:00:00, | |
period = 5~d, | |
include_end = FALSE, | |
force_class = "POSIXct", | |
as_date_vector = TRUE) | |
) | |
# Let's also add an index, later we will use this to join by | |
boundary_dates <- boundary_dates %>% | |
mutate(idx = 1:nrow(boundary_dates)) | |
boundary_dates | |
#> # A tibble: 73 x 3 | |
#> start_dates end_dates idx | |
#> <dttm> <dttm> <int> | |
#> 1 2013-01-01 2013-01-05 23:59:59 1 | |
#> 2 2013-01-06 2013-01-10 23:59:59 2 | |
#> 3 2013-01-11 2013-01-15 23:59:59 3 | |
#> 4 2013-01-16 2013-01-20 23:59:59 4 | |
#> 5 2013-01-21 2013-01-25 23:59:59 5 | |
#> 6 2013-01-26 2013-01-30 23:59:59 6 | |
#> 7 2013-01-31 2013-02-04 23:59:59 7 | |
#> 8 2013-02-05 2013-02-09 23:59:59 8 | |
#> 9 2013-02-10 2013-02-14 23:59:59 9 | |
#> 10 2013-02-15 2013-02-19 23:59:59 10 | |
#> # ... with 63 more rows | |
# The goal is, for each start/end boundary row combo, find the event_dates that | |
# fall in that boundary. | |
# To do this, we need to iterate over each row. We can do that with transpose | |
boundary_transposed <- transpose(boundary_dates) | |
head(boundary_transposed) | |
#> [[1]] | |
#> [[1]]$start_dates | |
#> [1] 1357016400 | |
#> | |
#> [[1]]$end_dates | |
#> [1] 1357448399 | |
#> | |
#> [[1]]$idx | |
#> [1] 1 | |
#> | |
#> | |
#> [[2]] | |
#> [[2]]$start_dates | |
#> [1] 1357448400 | |
#> | |
#> [[2]]$end_dates | |
#> [1] 1357880399 | |
#> | |
#> [[2]]$idx | |
#> [1] 2 | |
#> | |
#> | |
#> [[3]] | |
#> [[3]]$start_dates | |
#> [1] 1357880400 | |
#> | |
#> [[3]]$end_dates | |
#> [1] 1358312399 | |
#> | |
#> [[3]]$idx | |
#> [1] 3 | |
#> | |
#> | |
#> [[4]] | |
#> [[4]]$start_dates | |
#> [1] 1358312400 | |
#> | |
#> [[4]]$end_dates | |
#> [1] 1358744399 | |
#> | |
#> [[4]]$idx | |
#> [1] 4 | |
#> | |
#> | |
#> [[5]] | |
#> [[5]]$start_dates | |
#> [1] 1358744400 | |
#> | |
#> [[5]]$end_dates | |
#> [1] 1359176399 | |
#> | |
#> [[5]]$idx | |
#> [1] 5 | |
#> | |
#> | |
#> [[6]] | |
#> [[6]]$start_dates | |
#> [1] 1359176400 | |
#> | |
#> [[6]]$end_dates | |
#> [1] 1359608399 | |
#> | |
#> [[6]]$idx | |
#> [1] 6 | |
# The between function is an easy way to check which dates are between that | |
# row's start/end boundaries | |
event_buckets <- map(boundary_transposed, ~between(event_dates$date, .x$start_dates, .x$end_dates)) | |
# To further identify these logical values, assign the name of the list | |
# (which happens to correspond to the index) to the values that are TRUE in | |
# that list, then reduce it from multiple lists to just 1 vector that contains | |
# indexes that will match to the boundaries | |
event_idx <- imap(event_buckets, ~ifelse(.x == TRUE, .y, .x)) %>% | |
purrr::reduce(`+`) | |
head(event_idx) | |
#> [1] 1 1 1 1 1 1 | |
tail(event_idx) | |
#> [1] 73 73 73 73 73 73 | |
event_dates <- event_dates %>% | |
mutate(idx = event_idx) | |
event_dates | |
#> # A time tibble: 105,120 x 2 | |
#> # Index: date | |
#> date idx | |
#> * <dttm> <int> | |
#> 1 2013-01-01 00:00:00 1 | |
#> 2 2013-01-01 00:05:00 1 | |
#> 3 2013-01-01 00:10:00 1 | |
#> 4 2013-01-01 00:15:00 1 | |
#> 5 2013-01-01 00:20:00 1 | |
#> 6 2013-01-01 00:25:00 1 | |
#> 7 2013-01-01 00:30:00 1 | |
#> 8 2013-01-01 00:35:00 1 | |
#> 9 2013-01-01 00:40:00 1 | |
#> 10 2013-01-01 00:45:00 1 | |
#> # ... with 105,110 more rows | |
# Finally, a simple left_join | |
left_join(event_dates, boundary_dates, by = "idx") | |
#> # A time tibble: 105,120 x 4 | |
#> # Index: date | |
#> date idx start_dates end_dates | |
#> * <dttm> <int> <dttm> <dttm> | |
#> 1 2013-01-01 00:00:00 1 2013-01-01 2013-01-05 23:59:59 | |
#> 2 2013-01-01 00:05:00 1 2013-01-01 2013-01-05 23:59:59 | |
#> 3 2013-01-01 00:10:00 1 2013-01-01 2013-01-05 23:59:59 | |
#> 4 2013-01-01 00:15:00 1 2013-01-01 2013-01-05 23:59:59 | |
#> 5 2013-01-01 00:20:00 1 2013-01-01 2013-01-05 23:59:59 | |
#> 6 2013-01-01 00:25:00 1 2013-01-01 2013-01-05 23:59:59 | |
#> 7 2013-01-01 00:30:00 1 2013-01-01 2013-01-05 23:59:59 | |
#> 8 2013-01-01 00:35:00 1 2013-01-01 2013-01-05 23:59:59 | |
#> 9 2013-01-01 00:40:00 1 2013-01-01 2013-01-05 23:59:59 | |
#> 10 2013-01-01 00:45:00 1 2013-01-01 2013-01-05 23:59:59 | |
#> # ... with 105,110 more rows | |
################################################################################ | |
# You can even wrap this up into a nice function | |
conditional_date_join <- function(x, date_col, start_dates, end_dates) { | |
boundary_dates <- tibble(start_dates, end_dates) | |
x_idx <- boundary_dates %>% | |
transpose() %>% | |
map(~between(x[[date_col]], .x$start_dates, .x$end_dates)) %>% | |
imap(~ifelse(.x == TRUE, .y, .x)) %>% | |
reduce(`+`) | |
boundary_dates <- boundary_dates %>% | |
mutate(idx = 1:nrow(boundary_dates)) | |
x %>% | |
mutate(idx = x_idx) %>% | |
left_join(boundary_dates, by = "idx") %>% | |
select(-idx) | |
} | |
conditional_date_join(event_dates, "date", | |
boundary_dates$start_dates, | |
boundary_dates$end_dates) | |
#> # A time tibble: 105,120 x 3 | |
#> # Index: date | |
#> date start_dates end_dates | |
#> * <dttm> <dttm> <dttm> | |
#> 1 2013-01-01 00:00:00 2013-01-01 2013-01-05 23:59:59 | |
#> 2 2013-01-01 00:05:00 2013-01-01 2013-01-05 23:59:59 | |
#> 3 2013-01-01 00:10:00 2013-01-01 2013-01-05 23:59:59 | |
#> 4 2013-01-01 00:15:00 2013-01-01 2013-01-05 23:59:59 | |
#> 5 2013-01-01 00:20:00 2013-01-01 2013-01-05 23:59:59 | |
#> 6 2013-01-01 00:25:00 2013-01-01 2013-01-05 23:59:59 | |
#> 7 2013-01-01 00:30:00 2013-01-01 2013-01-05 23:59:59 | |
#> 8 2013-01-01 00:35:00 2013-01-01 2013-01-05 23:59:59 | |
#> 9 2013-01-01 00:40:00 2013-01-01 2013-01-05 23:59:59 | |
#> 10 2013-01-01 00:45:00 2013-01-01 2013-01-05 23:59:59 | |
#> # ... with 105,110 more rows |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment