Skip to content

Instantly share code, notes, and snippets.

@mpettis
Last active August 28, 2019 19:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mpettis/2d83e0b456e6f47220c9b5afd081a7bf to your computer and use it in GitHub Desktop.
Save mpettis/2d83e0b456e6f47220c9b5afd081a7bf to your computer and use it in GitHub Desktop.
run-length-encoding-to-dataframe
################################################################################
# Make a function that does what I want
# Take a dataframe, identify the target column with runs, ID the index column,
# which is the column that I want to capture the first and last values of for
# runs.
#' rle_as_df
#'
#' Turn run-length encoding into a dataframe so it can be used with ggplot/geom_area.
#'
#' @param .df Data frame with data to rle
#' @param .idx_col The column that has the indexes you want the start/ends of, like array indexes or dates
#' @param .tgt_col The column that has the data to rle
#'
#' @return tibble
#'
#' @examples
rle_as_df <- function(.df, .idx_col, .tgt_col) {
library(tidyverse)
._v_idx <- .df %>% pluck(.idx_col)
._v_tgt <- .df %>% pluck(.tgt_col)
._rle <- rle(._v_tgt)
._v_idx_end <- cumsum(._rle$lengths)
._v_idx_start <- lag(._v_idx_end)[-1] %>%
c(0, .) %>%
`+`(1)
tibble(
start=._v_idx[._v_idx_start],
end=._v_idx[._v_idx_end],
tgt=._rle$values,
n_rle = ._rle$lengths) %>%
set_names(c(sprintf("start_%s", .idx_col), sprintf("end_%s", .idx_col), .tgt_col, "n_rle"))}
################################################################################
# Test it
library(tidyverse)
library(lubridate)
# Samples to rle
set.seed(0)
df__ <- tibble(date=ymd("2019-01-01") + 0:99) %>%
mutate(t1 = sample(c(T, F), size=n(), replace=TRUE, prob=c(3, 1))) %>%
mutate(t2 = seq(n())) %>%
mutate(t3 = ifelse(t2 %in% c(1, 2, 11, 33, 34, 35, 100), NA_integer_, t1))
df__
rle_as_df(df__, "date", "t1")
rle_as_df(df__, "date", "t2")
rle_as_df(df__, "date", "t3")
# Try some plots
# Original data
ggplot(df__, aes(date, t3)) +
geom_line()
# Now try by just painting false values with a rectangle, source rectangle generated by the function above.
# https://stackoverflow.com/questions/32543176/highlight-areas-within-certain-x-range-in-ggplot2
df__rect <- rle_as_df(df__, "date", "t3") %>%
filter(is.na(t3) | (t3 == 0))
ggplot(df__, aes(date, t3)) +
geom_line() +
geom_rect(data=df__rect, inherit.aes = FALSE, aes(xmin=start_date, xmax=end_date, ymin=-Inf, ymax=Inf),
color="transparent", fill="orange", alpha=0.3) +
geom_vline(xintercept = df__rect %>% filter(start_date == end_date) %>% pluck("start_date"), color="red")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment