DavisVaughan/conditional-date-join.r

## conditional-date-join.r
suppressPackageStartupMessages(library(tibbletime))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(purrr))

# Create a tibble of event dates, ~100k
# 2013-01-01 to the end of the year by 5 minutes
event_dates <- create_series(~2013, 5~M, force_class = "POSIXct")

event_dates
#> # A time tibble: 105,120 x 1
#> # Index: date
#>                   date
#>                 <dttm>
#>  1 2013-01-01 00:00:00
#>  2 2013-01-01 00:05:00
#>  3 2013-01-01 00:10:00
#>  4 2013-01-01 00:15:00
#>  5 2013-01-01 00:20:00
#>  6 2013-01-01 00:25:00
#>  7 2013-01-01 00:30:00
#>  8 2013-01-01 00:35:00
#>  9 2013-01-01 00:40:00
#> 10 2013-01-01 00:45:00
#> # ... with 105,110 more rows

# Create tibbles of start and end dates
# Something like a boundary every 5 days should work fine as an example
boundary_dates <- tibble(
  start_dates = create_series(time_formula = ~2013,
                              period = 5~d,
                              force_class = "POSIXct",
                              as_date_vector = TRUE),
  end_dates   = create_series(time_formula = 2013-01-05 + 23:59:59 ~ 2014-01-01 + 00:00:00,
                              period = 5~d,
                              include_end = FALSE,
                              force_class = "POSIXct",
                              as_date_vector = TRUE)
)

# Let's also add an index, later we will use this to join by
boundary_dates <- boundary_dates %>%
  mutate(idx = 1:nrow(boundary_dates))

boundary_dates
#> # A tibble: 73 x 3
#>    start_dates           end_dates   idx
#>         <dttm>              <dttm> <int>
#>  1  2013-01-01 2013-01-05 23:59:59     1
#>  2  2013-01-06 2013-01-10 23:59:59     2
#>  3  2013-01-11 2013-01-15 23:59:59     3
#>  4  2013-01-16 2013-01-20 23:59:59     4
#>  5  2013-01-21 2013-01-25 23:59:59     5
#>  6  2013-01-26 2013-01-30 23:59:59     6
#>  7  2013-01-31 2013-02-04 23:59:59     7
#>  8  2013-02-05 2013-02-09 23:59:59     8
#>  9  2013-02-10 2013-02-14 23:59:59     9
#> 10  2013-02-15 2013-02-19 23:59:59    10
#> # ... with 63 more rows


# The goal is, for each start/end boundary row combo, find the event_dates that
# fall in that boundary.
# To do this, we need to iterate over each row. We can do that with transpose
boundary_transposed <- transpose(boundary_dates)

head(boundary_transposed)
#> [[1]]
#> [[1]]$start_dates
#> [1] 1357016400
#>
#> [[1]]$end_dates
#> [1] 1357448399
#>
#> [[1]]$idx
#> [1] 1
#>
#>
#> [[2]]
#> [[2]]$start_dates
#> [1] 1357448400
#>
#> [[2]]$end_dates
#> [1] 1357880399
#>
#> [[2]]$idx
#> [1] 2
#>
#>
#> [[3]]
#> [[3]]$start_dates
#> [1] 1357880400
#>
#> [[3]]$end_dates
#> [1] 1358312399
#>
#> [[3]]$idx
#> [1] 3
#>
#>
#> [[4]]
#> [[4]]$start_dates
#> [1] 1358312400
#>
#> [[4]]$end_dates
#> [1] 1358744399
#>
#> [[4]]$idx
#> [1] 4
#>
#>
#> [[5]]
#> [[5]]$start_dates
#> [1] 1358744400
#>
#> [[5]]$end_dates
#> [1] 1359176399
#>
#> [[5]]$idx
#> [1] 5
#>
#>
#> [[6]]
#> [[6]]$start_dates
#> [1] 1359176400
#>
#> [[6]]$end_dates
#> [1] 1359608399
#>
#> [[6]]$idx
#> [1] 6

# The between function is an easy way to check which dates are between that
# row's start/end boundaries
event_buckets <- map(boundary_transposed, ~between(event_dates$date, .x$start_dates, .x$end_dates))

# To further identify these logical values, assign the name of the list
# (which happens to correspond to the index) to the values that are TRUE in
# that list, then reduce it from multiple lists to just 1 vector that contains
# indexes that will match to the boundaries
event_idx <- imap(event_buckets, ~ifelse(.x == TRUE, .y, .x)) %>%
  purrr::reduce(`+`)

head(event_idx)
#> [1] 1 1 1 1 1 1
tail(event_idx)
#> [1] 73 73 73 73 73 73

event_dates <- event_dates %>%
  mutate(idx = event_idx)

event_dates
#> # A time tibble: 105,120 x 2
#> # Index: date
#>                   date   idx
#>  *              <dttm> <int>
#>  1 2013-01-01 00:00:00     1
#>  2 2013-01-01 00:05:00     1
#>  3 2013-01-01 00:10:00     1
#>  4 2013-01-01 00:15:00     1
#>  5 2013-01-01 00:20:00     1
#>  6 2013-01-01 00:25:00     1
#>  7 2013-01-01 00:30:00     1
#>  8 2013-01-01 00:35:00     1
#>  9 2013-01-01 00:40:00     1
#> 10 2013-01-01 00:45:00     1
#> # ... with 105,110 more rows

# Finally, a simple left_join
left_join(event_dates, boundary_dates, by = "idx")
#> # A time tibble: 105,120 x 4
#> # Index: date
#>                   date   idx start_dates           end_dates
#>  *              <dttm> <int>      <dttm>              <dttm>
#>  1 2013-01-01 00:00:00     1  2013-01-01 2013-01-05 23:59:59
#>  2 2013-01-01 00:05:00     1  2013-01-01 2013-01-05 23:59:59
#>  3 2013-01-01 00:10:00     1  2013-01-01 2013-01-05 23:59:59
#>  4 2013-01-01 00:15:00     1  2013-01-01 2013-01-05 23:59:59
#>  5 2013-01-01 00:20:00     1  2013-01-01 2013-01-05 23:59:59
#>  6 2013-01-01 00:25:00     1  2013-01-01 2013-01-05 23:59:59
#>  7 2013-01-01 00:30:00     1  2013-01-01 2013-01-05 23:59:59
#>  8 2013-01-01 00:35:00     1  2013-01-01 2013-01-05 23:59:59
#>  9 2013-01-01 00:40:00     1  2013-01-01 2013-01-05 23:59:59
#> 10 2013-01-01 00:45:00     1  2013-01-01 2013-01-05 23:59:59
#> # ... with 105,110 more rows


################################################################################

# You can even wrap this up into a nice function

conditional_date_join <- function(x, date_col, start_dates, end_dates) {

  boundary_dates <- tibble(start_dates, end_dates)

  x_idx <- boundary_dates %>%
    transpose() %>%
    map(~between(x[[date_col]], .x$start_dates, .x$end_dates)) %>%
    imap(~ifelse(.x == TRUE, .y, .x)) %>%
    reduce(`+`)

  boundary_dates <- boundary_dates %>%
    mutate(idx = 1:nrow(boundary_dates))

  x %>%
    mutate(idx = x_idx) %>%
    left_join(boundary_dates, by = "idx") %>%
    select(-idx)
}

conditional_date_join(event_dates, "date",
                      boundary_dates$start_dates,
                      boundary_dates$end_dates)
#> # A time tibble: 105,120 x 3
#> # Index: date
#>                   date start_dates           end_dates
#>  *              <dttm>      <dttm>              <dttm>
#>  1 2013-01-01 00:00:00  2013-01-01 2013-01-05 23:59:59
#>  2 2013-01-01 00:05:00  2013-01-01 2013-01-05 23:59:59
#>  3 2013-01-01 00:10:00  2013-01-01 2013-01-05 23:59:59
#>  4 2013-01-01 00:15:00  2013-01-01 2013-01-05 23:59:59
#>  5 2013-01-01 00:20:00  2013-01-01 2013-01-05 23:59:59
#>  6 2013-01-01 00:25:00  2013-01-01 2013-01-05 23:59:59
#>  7 2013-01-01 00:30:00  2013-01-01 2013-01-05 23:59:59
#>  8 2013-01-01 00:35:00  2013-01-01 2013-01-05 23:59:59
#>  9 2013-01-01 00:40:00  2013-01-01 2013-01-05 23:59:59
#> 10 2013-01-01 00:45:00  2013-01-01 2013-01-05 23:59:59
#> # ... with 105,110 more rows
	suppressPackageStartupMessages(library(tibbletime))
	suppressPackageStartupMessages(library(dplyr))
	suppressPackageStartupMessages(library(purrr))

	# Create a tibble of event dates, ~100k
	# 2013-01-01 to the end of the year by 5 minutes
	event_dates <- create_series(~2013, 5~M, force_class = "POSIXct")

	event_dates
	#> # A time tibble: 105,120 x 1
	#> # Index: date
	#> date
	#> <dttm>
	#> 1 2013-01-01 00:00:00
	#> 2 2013-01-01 00:05:00
	#> 3 2013-01-01 00:10:00
	#> 4 2013-01-01 00:15:00
	#> 5 2013-01-01 00:20:00
	#> 6 2013-01-01 00:25:00
	#> 7 2013-01-01 00:30:00
	#> 8 2013-01-01 00:35:00
	#> 9 2013-01-01 00:40:00
	#> 10 2013-01-01 00:45:00
	#> # ... with 105,110 more rows

	# Create tibbles of start and end dates
	# Something like a boundary every 5 days should work fine as an example
	boundary_dates <- tibble(
	start_dates = create_series(time_formula = ~2013,
	period = 5~d,
	force_class = "POSIXct",
	as_date_vector = TRUE),
	end_dates = create_series(time_formula = 2013-01-05 + 23:59:59 ~ 2014-01-01 + 00:00:00,
	period = 5~d,
	include_end = FALSE,
	force_class = "POSIXct",
	as_date_vector = TRUE)
	)

	# Let's also add an index, later we will use this to join by
	boundary_dates <- boundary_dates %>%
	mutate(idx = 1:nrow(boundary_dates))

	boundary_dates
	#> # A tibble: 73 x 3
	#> start_dates end_dates idx
	#> <dttm> <dttm> <int>
	#> 1 2013-01-01 2013-01-05 23:59:59 1
	#> 2 2013-01-06 2013-01-10 23:59:59 2
	#> 3 2013-01-11 2013-01-15 23:59:59 3
	#> 4 2013-01-16 2013-01-20 23:59:59 4
	#> 5 2013-01-21 2013-01-25 23:59:59 5
	#> 6 2013-01-26 2013-01-30 23:59:59 6
	#> 7 2013-01-31 2013-02-04 23:59:59 7
	#> 8 2013-02-05 2013-02-09 23:59:59 8
	#> 9 2013-02-10 2013-02-14 23:59:59 9
	#> 10 2013-02-15 2013-02-19 23:59:59 10
	#> # ... with 63 more rows


	# The goal is, for each start/end boundary row combo, find the event_dates that
	# fall in that boundary.
	# To do this, we need to iterate over each row. We can do that with transpose
	boundary_transposed <- transpose(boundary_dates)

	head(boundary_transposed)
	#> [[1]]
	#> [[1]]$start_dates
	#> [1] 1357016400
	#>
	#> [[1]]$end_dates
	#> [1] 1357448399
	#>
	#> [[1]]$idx
	#> [1] 1
	#>
	#>
	#> [[2]]
	#> [[2]]$start_dates
	#> [1] 1357448400
	#>
	#> [[2]]$end_dates
	#> [1] 1357880399
	#>
	#> [[2]]$idx
	#> [1] 2
	#>
	#>
	#> [[3]]
	#> [[3]]$start_dates
	#> [1] 1357880400
	#>
	#> [[3]]$end_dates
	#> [1] 1358312399
	#>
	#> [[3]]$idx
	#> [1] 3
	#>
	#>
	#> [[4]]
	#> [[4]]$start_dates
	#> [1] 1358312400
	#>
	#> [[4]]$end_dates
	#> [1] 1358744399
	#>
	#> [[4]]$idx
	#> [1] 4
	#>
	#>
	#> [[5]]
	#> [[5]]$start_dates
	#> [1] 1358744400
	#>
	#> [[5]]$end_dates
	#> [1] 1359176399
	#>
	#> [[5]]$idx
	#> [1] 5
	#>
	#>
	#> [[6]]
	#> [[6]]$start_dates
	#> [1] 1359176400
	#>
	#> [[6]]$end_dates
	#> [1] 1359608399
	#>
	#> [[6]]$idx
	#> [1] 6

	# The between function is an easy way to check which dates are between that
	# row's start/end boundaries
	event_buckets <- map(boundary_transposed, ~between(event_dates$date, .x$start_dates, .x$end_dates))

	# To further identify these logical values, assign the name of the list
	# (which happens to correspond to the index) to the values that are TRUE in
	# that list, then reduce it from multiple lists to just 1 vector that contains
	# indexes that will match to the boundaries
	event_idx <- imap(event_buckets, ~ifelse(.x == TRUE, .y, .x)) %>%
	purrr::reduce(`+`)

	head(event_idx)
	#> [1] 1 1 1 1 1 1
	tail(event_idx)
	#> [1] 73 73 73 73 73 73

	event_dates <- event_dates %>%
	mutate(idx = event_idx)

	event_dates
	#> # A time tibble: 105,120 x 2
	#> # Index: date
	#> date idx
	#> * <dttm> <int>
	#> 1 2013-01-01 00:00:00 1
	#> 2 2013-01-01 00:05:00 1
	#> 3 2013-01-01 00:10:00 1
	#> 4 2013-01-01 00:15:00 1
	#> 5 2013-01-01 00:20:00 1
	#> 6 2013-01-01 00:25:00 1
	#> 7 2013-01-01 00:30:00 1
	#> 8 2013-01-01 00:35:00 1
	#> 9 2013-01-01 00:40:00 1
	#> 10 2013-01-01 00:45:00 1
	#> # ... with 105,110 more rows

	# Finally, a simple left_join
	left_join(event_dates, boundary_dates, by = "idx")
	#> # A time tibble: 105,120 x 4
	#> # Index: date
	#> date idx start_dates end_dates
	#> * <dttm> <int> <dttm> <dttm>
	#> 1 2013-01-01 00:00:00 1 2013-01-01 2013-01-05 23:59:59
	#> 2 2013-01-01 00:05:00 1 2013-01-01 2013-01-05 23:59:59
	#> 3 2013-01-01 00:10:00 1 2013-01-01 2013-01-05 23:59:59
	#> 4 2013-01-01 00:15:00 1 2013-01-01 2013-01-05 23:59:59
	#> 5 2013-01-01 00:20:00 1 2013-01-01 2013-01-05 23:59:59
	#> 6 2013-01-01 00:25:00 1 2013-01-01 2013-01-05 23:59:59
	#> 7 2013-01-01 00:30:00 1 2013-01-01 2013-01-05 23:59:59
	#> 8 2013-01-01 00:35:00 1 2013-01-01 2013-01-05 23:59:59
	#> 9 2013-01-01 00:40:00 1 2013-01-01 2013-01-05 23:59:59
	#> 10 2013-01-01 00:45:00 1 2013-01-01 2013-01-05 23:59:59
	#> # ... with 105,110 more rows


	################################################################################

	# You can even wrap this up into a nice function

	conditional_date_join <- function(x, date_col, start_dates, end_dates) {

	boundary_dates <- tibble(start_dates, end_dates)

	x_idx <- boundary_dates %>%
	transpose() %>%
	map(~between(x[[date_col]], .x$start_dates, .x$end_dates)) %>%
	imap(~ifelse(.x == TRUE, .y, .x)) %>%
	reduce(`+`)

	boundary_dates <- boundary_dates %>%
	mutate(idx = 1:nrow(boundary_dates))

	x %>%
	mutate(idx = x_idx) %>%
	left_join(boundary_dates, by = "idx") %>%
	select(-idx)
	}

	conditional_date_join(event_dates, "date",
	boundary_dates$start_dates,
	boundary_dates$end_dates)
	#> # A time tibble: 105,120 x 3
	#> # Index: date
	#> date start_dates end_dates
	#> * <dttm> <dttm> <dttm>
	#> 1 2013-01-01 00:00:00 2013-01-01 2013-01-05 23:59:59
	#> 2 2013-01-01 00:05:00 2013-01-01 2013-01-05 23:59:59
	#> 3 2013-01-01 00:10:00 2013-01-01 2013-01-05 23:59:59
	#> 4 2013-01-01 00:15:00 2013-01-01 2013-01-05 23:59:59
	#> 5 2013-01-01 00:20:00 2013-01-01 2013-01-05 23:59:59
	#> 6 2013-01-01 00:25:00 2013-01-01 2013-01-05 23:59:59
	#> 7 2013-01-01 00:30:00 2013-01-01 2013-01-05 23:59:59
	#> 8 2013-01-01 00:35:00 2013-01-01 2013-01-05 23:59:59
	#> 9 2013-01-01 00:40:00 2013-01-01 2013-01-05 23:59:59
	#> 10 2013-01-01 00:45:00 2013-01-01 2013-01-05 23:59:59
	#> # ... with 105,110 more rows