brshallo/rolling-mean-conditioned-date.R

## rolling-mean-conditioned-date.R
# This example only includes a value in the rolling mean() if the close date on
# the historical dates comes after the snapshot date for row of interest


### CREATE SAMPLE DATA
library(tidyverse)
library(slider)
library(lubridate)

sample_size <- 5000
obs_per_day <- 100
day_steps <- seq(from = 1, by = 7, length.out = sample_size / obs_per_day) %>%
  map(rep, obs_per_day) %>%
  unlist()

set.seed(12)
data <- tibble(
  group = sample(LETTERS[1:4], sample_size, TRUE),
 # id = sample(LETTERS[1:10], sample_size, TRUE),
  snapshot_date = lubridate::ymd(20220101) + days(day_steps),
  close_date = snapshot_date + days(sample(1:120, sample_size, TRUE)),
  projected_close = close_date + days(sample(-30:30, sample_size, TRUE)),
  win = ifelse(projected_close < close_date, 1, 0)
  ) %>%
  arrange(snapshot_date, close_date) %>%
  filter(projected_close > snapshot_date) %>%
  group_by(group, close_date) %>%
  #mutate(count = row_number())
  # removing any obs that have closed date after final snapshot date...
  #mutate(win = ifelse(close_date >= max(snapshot_date), NA, win)) %>%
  filter(!is.na(win), group == "A")


## EXAMPLE
## Include historical observations in the rolling average if the closed date comes after the row's snap date

output <- data %>%
  group_by(group) %>%
  mutate(
    row = row_number(),
    # ctrl + f "Accessing the current index value" here for approach:
    # https://slider.r-lib.org/reference/slide_index.html
    w30_prep = slider::pslide_index(
      .l = list(win, close_date, projected_close),
      .i = snapshot_date,
      .f = list,
      #I set this to 900, ~ 3 years, to ensure this is cumulative over the entire data to compare outputs
      .before = 900,
      # below is negative so doesn't include current date of values
      .after = -1
    ),
    win30 = map2_dbl(.x = w30_prep, .y = snapshot_date,
                     .f = ~mean(.x[[1]] * ifelse( (.x[[2]] <= .y) | (.x[[3]] <= .y), 1, NA), na.rm = TRUE))
  ) %>% filter(!is.nan(win30)) %>%
  distinct(group, snapshot_date, .keep_all = T)
	# This example only includes a value in the rolling mean() if the close date on
	# the historical dates comes after the snapshot date for row of interest


	### CREATE SAMPLE DATA
	library(tidyverse)
	library(slider)
	library(lubridate)

	sample_size <- 5000
	obs_per_day <- 100
	day_steps <- seq(from = 1, by = 7, length.out = sample_size / obs_per_day) %>%
	map(rep, obs_per_day) %>%
	unlist()

	set.seed(12)
	data <- tibble(
	group = sample(LETTERS[1:4], sample_size, TRUE),
	# id = sample(LETTERS[1:10], sample_size, TRUE),
	snapshot_date = lubridate::ymd(20220101) + days(day_steps),
	close_date = snapshot_date + days(sample(1:120, sample_size, TRUE)),
	projected_close = close_date + days(sample(-30:30, sample_size, TRUE)),
	win = ifelse(projected_close < close_date, 1, 0)
	) %>%
	arrange(snapshot_date, close_date) %>%
	filter(projected_close > snapshot_date) %>%
	group_by(group, close_date) %>%
	#mutate(count = row_number())
	# removing any obs that have closed date after final snapshot date...
	#mutate(win = ifelse(close_date >= max(snapshot_date), NA, win)) %>%
	filter(!is.na(win), group == "A")



	## EXAMPLE
	## Include historical observations in the rolling average if the closed date comes after the row's snap date

	output <- data %>%
	group_by(group) %>%
	mutate(
	row = row_number(),
	# ctrl + f "Accessing the current index value" here for approach:
	# https://slider.r-lib.org/reference/slide_index.html
	w30_prep = slider::pslide_index(
	.l = list(win, close_date, projected_close),
	.i = snapshot_date,
	.f = list,
	#I set this to 900, ~ 3 years, to ensure this is cumulative over the entire data to compare outputs
	.before = 900,
	# below is negative so doesn't include current date of values
	.after = -1
	),
	win30 = map2_dbl(.x = w30_prep, .y = snapshot_date,
	.f = ~mean(.x[[1]] * ifelse( (.x[[2]] <= .y) \| (.x[[3]] <= .y), 1, NA), na.rm = TRUE))
	) %>% filter(!is.nan(win30)) %>%
	distinct(group, snapshot_date, .keep_all = T)