"Please can anyone suggest some dplyr code that can turn my sample data_in
into my sample data_out
?"
data_in <- tibble::tribble(
~patient, ~delivery_date, ~obs_point, ~obs_date,
"A", "2024-01-28", "antenatal", "2024-01-01",
"A", "2024-01-28", "at_delivery", "2024-01-28",
"B", "2024-01-28", "at_delivery", "2024-01-27",
"B", "2024-01-28", "at_delivery", "2024-01-28",
"C", "2024-01-28", "at_delivery", "2024-01-27",
"C", "2024-01-28", "at_delivery", "2024-01-28",
"C", "2024-01-28", "at_delivery", "2024-01-29",
"D", "2024-01-28", "antenatal", "2024-01-28",
"D", "2024-01-28", "at_delivery", "2024-01-29",
"E", "2024-01-28", "antenatal", "2024-01-01",
"E", "2024-01-28", "at_delivery", "2024-01-27",
"E", "2024-01-28", "at_delivery", "2024-01-29",
"F", "2024-01-28", "antenatal", "2024-01-27",
"G", "2024-01-28", "at_delivery", "2024-01-29",
"G", "2024-01-28", "at_delivery", "2024-01-30",
"H", "2024-01-28", "antenatal", "2024-01-20"
)
data_out <- tibble::tribble(
~patient, ~delivery_date, ~obs_point, ~obs_date,
"A", "2024-01-28", "antenatal", "2024-01-01",
"A", "2024-01-28", "at_delivery", "2024-01-28",
"B", "2024-01-28", "antenatal", "2024-01-27",
"B", "2024-01-28", "at_delivery", "2024-01-28",
"C", "2024-01-28", "antenatal", "2024-01-27",
"C", "2024-01-28", "at_delivery", "2024-01-28",
"D", "2024-01-28", "at_delivery", "2024-01-28",
"E", "2024-01-28", "antenatal", "2024-01-01",
"E", "2024-01-28", "antenatal", "2024-01-27",
"E", "2024-01-28", "at_delivery", "2024-01-29",
"F", "2024-01-28", "at_delivery", "2024-01-27",
"G", "2024-01-28", "at_delivery", "2024-01-29",
"H", "2024-01-28", "antenatal", "2024-01-20"
)
The rules are:
- I want a single "at_delivery" row (exactly 1) per patient if possible
- Any obs can be "at_delivery" if it's within +/- 3 days of the delivery date, but obs on the delivery date itself are prioritised
- If there's a choice only between a delivery obs before and after the delivery date I'll take the later one
- "antenatal" obs can be retained if before the delivery_date, but an obs labelled "antenatal" that is on the delivery date should be relabelled as "at delivery"
- If we've got a delivery obs then any other "at delivery" obs before the delivery date can be relabelled as antenatal
- Rows will be distinct (no duplicates) - I know I can do this with dplyr::distinct() at the end
library(dplyr, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
# https://community.rstudio.com/t/42875
# https://stackoverflow.com/q/64972688/5168907
my_coalesce <- function(.data, ..., res = "result") {
nms <- rlang::data_syms(colnames(dplyr::select(.data, ...)))
.data |>
dplyr::mutate({{res}} := dplyr::coalesce(!!!nms), .keep = "unused")
}
data_in |>
mutate(across(ends_with("date"), as.Date)) |>
filter(if_any("obs_date", \(x) x - days(3L) <= .data[["delivery_date"]])) |>
mutate(across("obs_point", \(x) if_else(.data[["obs_date"]] >= .data[["delivery_date"]], "at_delivery", x))) |>
mutate(
del_date1 = if_else(.data[["obs_date"]] == .data[["delivery_date"]], .data[["obs_date"]], NA_Date_)
) |>
mutate(across("del_date1", \(x) if_else(suppressWarnings(max(x, na.rm = TRUE)) > 0, unique(.data[["delivery_date"]]), NA_Date_)), .by = "patient") |>
mutate(
del_date2 = suppressWarnings(min(.data[["obs_date"]][.data[["obs_date"]] > .data[["delivery_date"]]])),
.by = "patient"
) |>
mutate(across("del_date2", \(x) if_else(x == Inf, NA_Date_, x))) |>
mutate(
del_date3 = suppressWarnings(max(.data[["obs_date"]][.data[["obs_date"]] < .data[["delivery_date"]] & .data[["obs_date"]] + days(3) >= .data[["delivery_date"]]])),
.by = "patient"
) |>
mutate(across("del_date3", \(x) if_else(x == -Inf, NA_Date_, x))) |>
my_coalesce(starts_with("del_date"), res = "at_delivery") |>
mutate(across("obs_point", \(x) case_when(
.data[["obs_date"]] == .data[["delivery_date"]] ~ "at_delivery",
.data[["obs_date"]] == .data[["at_delivery"]] ~ "at_delivery",
.data[["obs_date"]] < .data[["at_delivery"]] ~ "antenatal",
.default = x
))) |>
filter(
if_any("obs_point", \(x) x != "at_delivery") |
if_any("obs_date", \(x) x == .data[["at_delivery"]])
) |>
select(!"at_delivery")
#> # A tibble: 13 × 4
#> patient delivery_date obs_point obs_date
#> <chr> <date> <chr> <date>
#> 1 A 2024-01-28 antenatal 2024-01-01
#> 2 A 2024-01-28 at_delivery 2024-01-28
#> 3 B 2024-01-28 antenatal 2024-01-27
#> 4 B 2024-01-28 at_delivery 2024-01-28
#> 5 C 2024-01-28 antenatal 2024-01-27
#> 6 C 2024-01-28 at_delivery 2024-01-28
#> 7 D 2024-01-28 at_delivery 2024-01-28
#> 8 E 2024-01-28 antenatal 2024-01-01
#> 9 E 2024-01-28 antenatal 2024-01-27
#> 10 E 2024-01-28 at_delivery 2024-01-29
#> 11 F 2024-01-28 at_delivery 2024-01-27
#> 12 G 2024-01-28 at_delivery 2024-01-29
#> 13 H 2024-01-28 antenatal 2024-01-20
Created on 2024-02-29 with reprex v2.1.0