hanowell/how_to_measure_attrition_part_2.R

## how_to_measure_attrition_part_2.R
# Load packages ----
library(tidyverse)

# Create US-state-level headcount dataset ----
business_markets <- tibble::tribble(
  ~city_name,      ~state_name, ~mean_headcount,
  "Austin",        "TX",        8,
  "Denver",        "CO",        22,
  "Los Angeles",   "CA",        97,
  "Minneapolis",   "MN",        7,
  "New York",      "NY",        14,
  "Portland",      "OR",        80,
  "San Francisco", "CA",        145,
  "Seattle",       "WA",        198,
  "St. Paul",      "MN",        2
)

# Parameterize the true distribution of city-level attrition rates ----
attrition_mean <- 0.25
attrition_variance <- 0.04
alpha <- (attrition_mean^2)/attrition_variance
beta <- attrition_mean/attrition_variance

# Function to sample true city-level attrition rates ----
sample_true_attrition <- function(data) {
  data %>%
    dplyr::mutate(true_attrition_rate = rgamma(length(city_name), alpha, beta))
}

# Simulate terminations ----
one_iteration <- function(data, iteration) {
  data %>%
    sample_true_attrition() %>%
    dplyr::rowwise() %>%
    dplyr::mutate(
      terminations = rpois(1, true_attrition_rate * mean_headcount)
    ) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(iteration = iteration)
}
multiple_iterations <- function(data, iterations) {
  1:iterations %>%
    purrr::map(., ~one_iteration(data, .)) %>%
    dplyr::bind_rows()
}
simulations <- multiple_iterations(business_markets, 10000) %>%
  dplyr::mutate(
    observed_attrition_rate = terminations / mean_headcount
    , absolute_error = abs(observed_attrition_rate - true_attrition_rate)
    , absolute_p_error = absolute_error / true_attrition_rate
  )

# Summarize Type M errors ----
error_summary <- simulations %>%
  dplyr::group_by(city_name, state_name, mean_headcount) %>%
  dplyr::summarize(mape = mean(absolute_p_error)
                   , pr_zero = mean(terminations == 0)
                   , pr_all = mean(terminations == mean_headcount)
                   , pr_greater = mean(terminations > mean_headcount)) %>%
  dplyr::ungroup()

# Summarize probabilities of extreme outcomes ----
pr_extreme <- error_summary %>%
  dplyr::select(city_name
                , starts_with("pr_")) %>%
  tidyr::gather(pr_type, pr, -city_name) %>%
  dplyr::left_join(business_markets) %>%
  dplyr::mutate(pr_type = case_when(
    pr_type == "pr_zero" ~ "% none"
    , pr_type == "pr_all" ~ "% all"
    , pr_type == "pr_greater" ~ "% above headcount"
  )) %>%
  dplyr::mutate(pr_type = factor(pr_type, levels = c("% none"
                                                     , "% all"
                                                     , "% above headcount")))

# Plot headcount by market ----
ggplot2::ggplot(data = business_markets) +
  ggplot2::aes(x = reorder(city_name, mean_headcount)
               , y = mean_headcount) +
  ggplot2::ylim(0, 220) +
  ggplot2::geom_bar(stat = "identity", width = 0.01) +
  ggplot2::geom_text(aes(label = mean_headcount), hjust = -1) +
  ggplot2::geom_point() +
  ggplot2::coord_flip() +
  ggplot2::theme_minimal() +
  ggplot2::theme(panel.grid = element_blank()) +
  ggplot2::labs(title = "Mean annual headcount by business market"
                , subtitle = "(for a totally fake company)"
                , x = NULL
                , y = NULL)
ggsave("./part_2/headcount.png"
       , device = png()
       , units = "in"
       , width = 5
       , height = 3
       , dpi = 1200)
dev.off()

# Plot mean absolute percent error ----
ggplot2::ggplot(data = error_summary) +
  ggplot2::aes(x = mean_headcount, y = mape) +
  ggplot2::geom_line() +
  ggplot2::scale_y_continuous(labels = scales::percent) +
  ggplot2::labs(
    title = "Mean absolute % error (MAPE) of observed attrition rates"
    , subtitle = "Error decreases exponentially with headcount"
    , x = "Mean annual headcount"
    , y = "MAPE"
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(panel.grid = element_blank()
                 , axis.title.y = element_text(angle = 0))
ggsave("./part_2/mape.png"
       , device = png()
       , units = "in"
       , width = 6
       , height = 3.5
       , dpi = 1200)
dev.off()

# Plot probabilities of extreme attrition observations ----
ggplot2::ggplot(data = pr_extreme) +
  ggplot2::aes(x = mean_headcount, y = pr) +
  ggplot2::geom_line() +
  ggplot2::scale_y_continuous(labels = scales::percent) +
  ggplot2::facet_wrap(. ~ pr_type) +
  ggplot2::labs(
    title = "% of simulations with extreme attrition rates"
    , subtitle = "Extreme observations less likely for higher headcount"
    , x = "Mean annual headcount"
    , y = "% of simulations"
  ) +
  ggplot2::theme(axis.title.y = element_text(angle = 0))
ggsave("./part_2/pr_extreme.png"
       , device = png()
       , units = "in"
       , width = 6
       , height = 2
       , dpi = 1200)
dev.off()
	# Load packages ----
	library(tidyverse)

	# Create US-state-level headcount dataset ----
	business_markets <- tibble::tribble(
	~city_name, ~state_name, ~mean_headcount,
	"Austin", "TX", 8,
	"Denver", "CO", 22,
	"Los Angeles", "CA", 97,
	"Minneapolis", "MN", 7,
	"New York", "NY", 14,
	"Portland", "OR", 80,
	"San Francisco", "CA", 145,
	"Seattle", "WA", 198,
	"St. Paul", "MN", 2
	)

	# Parameterize the true distribution of city-level attrition rates ----
	attrition_mean <- 0.25
	attrition_variance <- 0.04
	alpha <- (attrition_mean^2)/attrition_variance
	beta <- attrition_mean/attrition_variance

	# Function to sample true city-level attrition rates ----
	sample_true_attrition <- function(data) {
	data %>%
	dplyr::mutate(true_attrition_rate = rgamma(length(city_name), alpha, beta))
	}

	# Simulate terminations ----
	one_iteration <- function(data, iteration) {
	data %>%
	sample_true_attrition() %>%
	dplyr::rowwise() %>%
	dplyr::mutate(
	terminations = rpois(1, true_attrition_rate * mean_headcount)
	) %>%
	dplyr::ungroup() %>%
	dplyr::mutate(iteration = iteration)
	}
	multiple_iterations <- function(data, iterations) {
	1:iterations %>%
	purrr::map(., ~one_iteration(data, .)) %>%
	dplyr::bind_rows()
	}
	simulations <- multiple_iterations(business_markets, 10000) %>%
	dplyr::mutate(
	observed_attrition_rate = terminations / mean_headcount
	, absolute_error = abs(observed_attrition_rate - true_attrition_rate)
	, absolute_p_error = absolute_error / true_attrition_rate
	)

	# Summarize Type M errors ----
	error_summary <- simulations %>%
	dplyr::group_by(city_name, state_name, mean_headcount) %>%
	dplyr::summarize(mape = mean(absolute_p_error)
	, pr_zero = mean(terminations == 0)
	, pr_all = mean(terminations == mean_headcount)
	, pr_greater = mean(terminations > mean_headcount)) %>%
	dplyr::ungroup()

	# Summarize probabilities of extreme outcomes ----
	pr_extreme <- error_summary %>%
	dplyr::select(city_name
	, starts_with("pr_")) %>%
	tidyr::gather(pr_type, pr, -city_name) %>%
	dplyr::left_join(business_markets) %>%
	dplyr::mutate(pr_type = case_when(
	pr_type == "pr_zero" ~ "% none"
	, pr_type == "pr_all" ~ "% all"
	, pr_type == "pr_greater" ~ "% above headcount"
	)) %>%
	dplyr::mutate(pr_type = factor(pr_type, levels = c("% none"
	, "% all"
	, "% above headcount")))

	# Plot headcount by market ----
	ggplot2::ggplot(data = business_markets) +
	ggplot2::aes(x = reorder(city_name, mean_headcount)
	, y = mean_headcount) +
	ggplot2::ylim(0, 220) +
	ggplot2::geom_bar(stat = "identity", width = 0.01) +
	ggplot2::geom_text(aes(label = mean_headcount), hjust = -1) +
	ggplot2::geom_point() +
	ggplot2::coord_flip() +
	ggplot2::theme_minimal() +
	ggplot2::theme(panel.grid = element_blank()) +
	ggplot2::labs(title = "Mean annual headcount by business market"
	, subtitle = "(for a totally fake company)"
	, x = NULL
	, y = NULL)
	ggsave("./part_2/headcount.png"
	, device = png()
	, units = "in"
	, width = 5
	, height = 3
	, dpi = 1200)
	dev.off()

	# Plot mean absolute percent error ----
	ggplot2::ggplot(data = error_summary) +
	ggplot2::aes(x = mean_headcount, y = mape) +
	ggplot2::geom_line() +
	ggplot2::scale_y_continuous(labels = scales::percent) +
	ggplot2::labs(
	title = "Mean absolute % error (MAPE) of observed attrition rates"
	, subtitle = "Error decreases exponentially with headcount"
	, x = "Mean annual headcount"
	, y = "MAPE"
	) +
	ggplot2::theme_minimal() +
	ggplot2::theme(panel.grid = element_blank()
	, axis.title.y = element_text(angle = 0))
	ggsave("./part_2/mape.png"
	, device = png()
	, units = "in"
	, width = 6
	, height = 3.5
	, dpi = 1200)
	dev.off()

	# Plot probabilities of extreme attrition observations ----
	ggplot2::ggplot(data = pr_extreme) +
	ggplot2::aes(x = mean_headcount, y = pr) +
	ggplot2::geom_line() +
	ggplot2::scale_y_continuous(labels = scales::percent) +
	ggplot2::facet_wrap(. ~ pr_type) +
	ggplot2::labs(
	title = "% of simulations with extreme attrition rates"
	, subtitle = "Extreme observations less likely for higher headcount"
	, x = "Mean annual headcount"
	, y = "% of simulations"
	) +
	ggplot2::theme(axis.title.y = element_text(angle = 0))
	ggsave("./part_2/pr_extreme.png"
	, device = png()
	, units = "in"
	, width = 6
	, height = 2
	, dpi = 1200)
	dev.off()