Skip to content

Instantly share code, notes, and snippets.

@brshallo
Last active February 24, 2019 23:21
Show Gist options
  • Save brshallo/ee03dca5fee6330afa4e430fb5fe9719 to your computer and use it in GitHub Desktop.
Save brshallo/ee03dca5fee6330afa4e430fb5fe9719 to your computer and use it in GitHub Desktop.
Visualize distribution of mean of target by two variables within a hierarchy, segmented by a third variable.
library(tidyverse)
library(glue)
library(ggbeeswarm)

flights <- nycflights13::flights %>%
  filter(arr_delay > 0) %>%
  mutate(arr_delay_log = log(arr_delay),
         quarter = as.factor(1 + month %/% 4))

group_summarise_mns <- function(df, y, ...) {
  y_expr <- enquo(y)
  
  df %>%
    group_by_at(vars(...)) %>%
    summarise(
      mean = mean(!!y_expr, na.rm = TRUE),
      n = n(),
      sd = sd(!!y_expr, na.rm = TRUE)
    ) %>%
    ungroup()
}

flights %>%
  group_summarise_mns(arr_delay, carrier, month, day)
#> # A tibble: 4,672 x 6
#>    carrier month   day  mean     n    sd
#>    <chr>   <int> <int> <dbl> <int> <dbl>
#>  1 9E          1     1  39.2    13  66.3
#>  2 9E          1     2  29.9    33  34.6
#>  3 9E          1     3  35.6    30  54.8
#>  4 9E          1     4  28.1    25  33.9
#>  5 9E          1     5  33.2    20  57.1
#>  6 9E          1     6  23.2    22  32.9
#>  7 9E          1     7  25.4     5  25.4
#>  8 9E          1     8  26.2    12  48.2
#>  9 9E          1     9  18.6    13  18.2
#> 10 9E          1    10  19.4     7  18.8
#> # ... with 4,662 more rows


box_box_box_plot <- function(df, y, x, coarse, gran){
  
  y_expr <- enquo(y)
  x_expr <- enquo(x)
  coarse_expr <- enquo(coarse)
  gran_expr <- enquo(gran)
  
  # create y-label with padding
  y_level <- c("Box and whiskers: ",
               "Coarse means: ",
               "Coarse means:")
  segmenting_exprs <- c(quo_name(y_expr),
                        quo_name(gran_expr),
                        quo_name(coarse_expr))
  y_axis <- paste0(y_level, segmenting_exprs) %>%
    str_pad(width = max(str_length(.)), side = "right") %>%
    str_c(collapse = "\n")
  
  df <- df %>%
    mutate(!!x_expr := fct_reorder(!!x_expr,!!y_expr))
  
  df %>%
    group_summarise_mns(!!y_expr, !!x_expr, !!gran_expr, !!coarse_expr) %>%
    ggplot(aes(x = !!x_expr, y = mean)) +
    geom_boxplot(aes(x = !!x_expr, y = !!y_expr),
                 outlier.shape = NA,
                 data = df) +
    geom_quasirandom(aes(colour = !!coarse_expr, size = n),
                                 alpha = 0.1,
                                 shape = 15) +
    geom_point(
      aes(colour = !!coarse_expr),
      data = group_summarise_mns(df, !!y_expr, !!x_expr, !!coarse_expr),
      size = 3,
      shape = 15
    ) +
    scale_x_discrete(labels = abbreviate) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          text = element_text(family = "mono")) +
    labs(# subtitle = paste0("Coarse level: ", quo_name(coarse_expr), "\nGranular level: ", quo_name(gran_expr)),
      size = glue("n in {quo_name(gran_expr)}"),
      y = y_axis)
}

flights %>% 
  box_box_box_plot(arr_delay_log, carrier, quarter, day)+
  ggtitle("Quarter 2 typically has the worst delays")

Created on 2019-02-24 by the reprex package (v0.2.1)

@brshallo
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment