Skip to content

Instantly share code, notes, and snippets.

@francisbarton
Created February 29, 2024 19:39
Show Gist options
  • Save francisbarton/8dd2799e75673c249eb0a2e1842958ea to your computer and use it in GitHub Desktop.
Save francisbarton/8dd2799e75673c249eb0a2e1842958ea to your computer and use it in GitHub Desktop.
Horrible dplyr data wrangling

"Please can anyone suggest some dplyr code that can turn my sample data_in into my sample data_out?"

data_in <- tibble::tribble(
  ~patient, ~delivery_date, ~obs_point, ~obs_date,
  "A", "2024-01-28", "antenatal", "2024-01-01",
  "A", "2024-01-28", "at_delivery",  "2024-01-28",
  "B", "2024-01-28", "at_delivery",  "2024-01-27",
  "B", "2024-01-28", "at_delivery",  "2024-01-28",
  "C", "2024-01-28", "at_delivery",  "2024-01-27",
  "C", "2024-01-28", "at_delivery", "2024-01-28",
  "C", "2024-01-28", "at_delivery",  "2024-01-29",
  "D", "2024-01-28", "antenatal",  "2024-01-28",
  "D", "2024-01-28", "at_delivery",  "2024-01-29",
  "E", "2024-01-28", "antenatal",  "2024-01-01",
  "E", "2024-01-28", "at_delivery",  "2024-01-27",
  "E", "2024-01-28", "at_delivery",  "2024-01-29",
  "F", "2024-01-28", "antenatal",  "2024-01-27",
  "G", "2024-01-28", "at_delivery",  "2024-01-29",
  "G", "2024-01-28", "at_delivery",  "2024-01-30",
  "H", "2024-01-28", "antenatal",  "2024-01-20"
)
data_out <- tibble::tribble(
  ~patient, ~delivery_date, ~obs_point, ~obs_date,
  "A", "2024-01-28", "antenatal", "2024-01-01",
  "A", "2024-01-28", "at_delivery",  "2024-01-28",
  "B", "2024-01-28", "antenatal",  "2024-01-27",
  "B", "2024-01-28", "at_delivery",  "2024-01-28",
  "C", "2024-01-28", "antenatal", "2024-01-27",
  "C", "2024-01-28", "at_delivery", "2024-01-28",
  "D", "2024-01-28", "at_delivery",  "2024-01-28",
  "E", "2024-01-28", "antenatal",  "2024-01-01",
  "E", "2024-01-28", "antenatal",  "2024-01-27",
  "E", "2024-01-28", "at_delivery",  "2024-01-29",
  "F", "2024-01-28", "at_delivery",  "2024-01-27",
  "G", "2024-01-28", "at_delivery",  "2024-01-29",
  "H", "2024-01-28", "antenatal",  "2024-01-20"
)

The rules are:

  • I want a single "at_delivery" row (exactly 1) per patient if possible
  • Any obs can be "at_delivery" if it's within +/- 3 days of the delivery date, but obs on the delivery date itself are prioritised
  • If there's a choice only between a delivery obs before and after the delivery date I'll take the later one
  • "antenatal" obs can be retained if before the delivery_date, but an obs labelled "antenatal" that is on the delivery date should be relabelled as "at delivery"
  • If we've got a delivery obs then any other "at delivery" obs before the delivery date can be relabelled as antenatal
  • Rows will be distinct (no duplicates) - I know I can do this with dplyr::distinct() at the end
library(dplyr, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
# https://community.rstudio.com/t/42875
# https://stackoverflow.com/q/64972688/5168907
my_coalesce <- function(.data, ..., res = "result") {
  nms <- rlang::data_syms(colnames(dplyr::select(.data, ...)))
  .data |>
    dplyr::mutate({{res}} := dplyr::coalesce(!!!nms), .keep = "unused")
}
data_in |>
  mutate(across(ends_with("date"), as.Date)) |>
  filter(if_any("obs_date", \(x) x - days(3L) <= .data[["delivery_date"]])) |>
  mutate(across("obs_point", \(x) if_else(.data[["obs_date"]] >= .data[["delivery_date"]], "at_delivery", x))) |>
  mutate(
    del_date1 = if_else(.data[["obs_date"]] == .data[["delivery_date"]], .data[["obs_date"]], NA_Date_)
  ) |>
  mutate(across("del_date1", \(x) if_else(suppressWarnings(max(x, na.rm = TRUE)) > 0, unique(.data[["delivery_date"]]), NA_Date_)), .by = "patient") |>
  mutate(
    del_date2 = suppressWarnings(min(.data[["obs_date"]][.data[["obs_date"]] > .data[["delivery_date"]]])),
    .by = "patient"
  ) |>
  mutate(across("del_date2", \(x) if_else(x == Inf, NA_Date_, x))) |>
  mutate(
    del_date3 = suppressWarnings(max(.data[["obs_date"]][.data[["obs_date"]] < .data[["delivery_date"]] & .data[["obs_date"]] + days(3) >= .data[["delivery_date"]]])),
    .by = "patient"
  ) |>
  mutate(across("del_date3", \(x) if_else(x == -Inf, NA_Date_, x))) |>
  my_coalesce(starts_with("del_date"), res = "at_delivery") |>
  mutate(across("obs_point", \(x) case_when(
    .data[["obs_date"]] == .data[["delivery_date"]] ~ "at_delivery",
    .data[["obs_date"]] == .data[["at_delivery"]] ~ "at_delivery",
    .data[["obs_date"]] < .data[["at_delivery"]] ~ "antenatal",
    .default = x
  ))) |>
  filter(
    if_any("obs_point", \(x) x != "at_delivery") |
    if_any("obs_date", \(x) x == .data[["at_delivery"]])
  ) |>
  select(!"at_delivery")

#> # A tibble: 13 × 4
#>    patient delivery_date obs_point   obs_date  
#>    <chr>   <date>        <chr>       <date>    
#>  1 A       2024-01-28    antenatal   2024-01-01
#>  2 A       2024-01-28    at_delivery 2024-01-28
#>  3 B       2024-01-28    antenatal   2024-01-27
#>  4 B       2024-01-28    at_delivery 2024-01-28
#>  5 C       2024-01-28    antenatal   2024-01-27
#>  6 C       2024-01-28    at_delivery 2024-01-28
#>  7 D       2024-01-28    at_delivery 2024-01-28
#>  8 E       2024-01-28    antenatal   2024-01-01
#>  9 E       2024-01-28    antenatal   2024-01-27
#> 10 E       2024-01-28    at_delivery 2024-01-29
#> 11 F       2024-01-28    at_delivery 2024-01-27
#> 12 G       2024-01-28    at_delivery 2024-01-29
#> 13 H       2024-01-28    antenatal   2024-01-20

Created on 2024-02-29 with reprex v2.1.0

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment