dholstius/BY2011-summarytree.R

## BY2011-summarytree.R
#'
#' Summary Tree Tool
#'
#' Discover which categories --- or groups of categories --- emit the largest
#' share(s) of a particular pollutant.
#'
#' Version history:
#'
#' - Created 2015-12-01 by dholstius
#' - Last updated 2019-10-23 by dholstius
#'
#' References:
#'
#' - Kenny Shirley and Howard Karloff (2019). `summarytrees`: Compute and Visualize
#'   Summary Trees. R package version 0.1.
#'
#' - Kenny Shirley (2015). Visualizing Maximum Entropy Summary Trees Using R and d3.
#'   Presented at the Joint Statistical Meeting (JSM), Seattle, WA.
#'   http://www.kennyshirley.com/jsm2015/
#'

library(inventory)
library(BY2011)
library(summarytrees)

#'
#' The hierarchy that you pick here constrains the solution space.
#'
#' Any "summary tree" solution has to be a tree that can be formed by iterative
#' consolidation, starting from the leaves of this "maximal tree".
#'
#' This will make more sense when you get to the end!
#'
summarytree_hierarchy <-
  BY2011_category_hierarchy

#'
#' The data should be compatible with the hierarchy.
#'
summarytree_emission_data <-
  BY2011_annual_emission_data

#'
#' The "weightiest" leaf nodes (single categories) will depend on which
#' pollutant you pick, and also on the year.
#'
#' Edit the code below where it says `NOx` and/or `CY(2011)`. Try using
#' `CO2`, `PM2.5`, or `ROG` instead of `NOx`.
#'
summarytree_scope <-
  tibble(
    year = CY(2011),
    pol_abbr = "NOx",
    cat_h0 = "Area sources")

#'
#' You shouldn't need to change the code below, unless:
#'
#' - you want `weight` to vary non-linearly, or with something other than emission (`ems_qty`); or
#' - you want to pre-aggreate to some granularity other than `cat_id`; or
#' - you want the tree to be based on something other than categories (e.g. SIC codes, or pollutants).
#'
node_weight_data <-
  summarytree_emission_data %>%
  with_hierarchy(
    summarytree_hierarchy) %>%
  semi_join(
    summarytree_scope) %>%
  group_by(
    cat_id) %>%
  summarise(
    weight = total(ems_qty, digits = 4)) %>%
  filter(
    weight > 0)

#'
#' Construct the input data for the search algorithm.
#'
summarytree_input_data <- local({

  summarytree_structure <-
    summarytree_hierarchy %>%
    semi_join(
      node_weight_data,
      by = c("cat_id")) %>%
    extract_tree()

  weighted_adjacency_data <-
    summarytree_structure %>%
    as.data.frame(
      form = "adjacency_list") %>%
    left_join(
      mutate_at(node_weight_data, vars(cat_id), as.character),
      by = c("label" = "cat_id")) %>%
    ensure(
      !all_true(is.na(.$weight)))

  summarytree_data <-
    weighted_adjacency_data %>%
    replace_na(
      list(parent = 0, weight = 0)) %>%
    transmute(
      node   = as.numeric(node),
      parent = as.numeric(parent),
      weight = as.numeric(weight),
      label  = str_replace(label, "^([[:digit:]]+)$", "#\\1"),
      level  = as.numeric(depth))

})

#'
#' For each `k` in `{1, 2, ..., MAX_TREE_SIZE}`, pre-compute the optimal (most
#' balanced) summary tree of size `k`.
#'
#' - The size `k` is just the number of nodes in a given summary tree.
#' - The tool will let you thumb through these pre-computed trees in order.
#'
summarytree_solutions <- local({

  #'
  #' Increase `MAX_TREE_SIZE` if you find that you cannot explore deeply enough.
  #'
  MAX_TREE_SIZE <- 150

  #'
  #' If you're a perfectionist, and have time to spare, you can use `optimal()`
  #' instead of `greedy()` here.
  #'
  #' If you do, you'll have to pick a value for `epsilon`. This is the
  #' nonnegative upper bound of the additive error when using the approximate
  #' algorithm. Each `k`-node summary tree returned by the approximate algorithm
  #' will have entropy within `epsilon` of the maximal entropy of a `k`-node
  #' summary tree.
  #'
  #' See `?summarytrees::optimal` and `?summarytrees::greedy` for details.
  #'
  summarytree_solutions <-
    with(
      summarytree_input_data,
      summarytrees::greedy(
        node, parent, weight, label,
        #epsilon = 1.0,
        K = min(
          MAX_TREE_SIZE,
          n_distinct(summarytree_data$node))))

})

#'
#' Prepare JSON for the interactive visualization.
#'
#' You can change some things here, like `units`, `node.width`, `node.height`,
#' `print.weights`, and `color.level`. There doesn't seem to be a great way to
#' get the labels to wrap --- an injected `</br>` just gets escaped --- but if
#' you discover a way, please send me an email, and I'll update it here!
#'
summarytree_json <-
  with(
    summarytree_solutions,
    summarytrees::prepare.vis(
      tree.list     = summary.trees,
      labels        = str_wrap(data[, "label"], width = 10),
      tree          = tree,
      legend.width  = 100,
      node.width    = 100,
      node.height   = 18,
      units         = str_c("tons/yr ", summarytree_scope$pol_abbr),
      print.weights = FALSE,
      legend.color  = "lightsteelblue",
      color.level   = 1))

#'
#' Actually show the visualization.
#'
summarytrees::draw.vis(
  json.object   = summarytree_json,
  out.dir       = tempfile(),
  open.browser  = TRUE)

#'
#' Pause here.
#'
readline("Press any key to stop and clean up ...")

#'
#' Clean up. This is required if you want to re-run with new parameters;
#' otherwise, you'll see the old tree and wonder why nothing changed.
#'
for (srv_id in servr::daemon_list()) {
  servr::daemon_stop(srv_id)
}
	#'
	#' Summary Tree Tool
	#'
	#' Discover which categories --- or groups of categories --- emit the largest
	#' share(s) of a particular pollutant.
	#'
	#' Version history:
	#'
	#' - Created 2015-12-01 by dholstius
	#' - Last updated 2019-10-23 by dholstius
	#'
	#' References:
	#'
	#' - Kenny Shirley and Howard Karloff (2019). `summarytrees`: Compute and Visualize
	#' Summary Trees. R package version 0.1.
	#'
	#' - Kenny Shirley (2015). Visualizing Maximum Entropy Summary Trees Using R and d3.
	#' Presented at the Joint Statistical Meeting (JSM), Seattle, WA.
	#' http://www.kennyshirley.com/jsm2015/
	#'

	library(inventory)
	library(BY2011)
	library(summarytrees)

	#'
	#' The hierarchy that you pick here constrains the solution space.
	#'
	#' Any "summary tree" solution has to be a tree that can be formed by iterative
	#' consolidation, starting from the leaves of this "maximal tree".
	#'
	#' This will make more sense when you get to the end!
	#'
	summarytree_hierarchy <-
	BY2011_category_hierarchy

	#'
	#' The data should be compatible with the hierarchy.
	#'
	summarytree_emission_data <-
	BY2011_annual_emission_data

	#'
	#' The "weightiest" leaf nodes (single categories) will depend on which
	#' pollutant you pick, and also on the year.
	#'
	#' Edit the code below where it says `NOx` and/or `CY(2011)`. Try using
	#' `CO2`, `PM2.5`, or `ROG` instead of `NOx`.
	#'
	summarytree_scope <-
	tibble(
	year = CY(2011),
	pol_abbr = "NOx",
	cat_h0 = "Area sources")

	#'
	#' You shouldn't need to change the code below, unless:
	#'
	#' - you want `weight` to vary non-linearly, or with something other than emission (`ems_qty`); or
	#' - you want to pre-aggreate to some granularity other than `cat_id`; or
	#' - you want the tree to be based on something other than categories (e.g. SIC codes, or pollutants).
	#'
	node_weight_data <-
	summarytree_emission_data %>%
	with_hierarchy(
	summarytree_hierarchy) %>%
	semi_join(
	summarytree_scope) %>%
	group_by(
	cat_id) %>%
	summarise(
	weight = total(ems_qty, digits = 4)) %>%
	filter(
	weight > 0)

	#'
	#' Construct the input data for the search algorithm.
	#'
	summarytree_input_data <- local({

	summarytree_structure <-
	summarytree_hierarchy %>%
	semi_join(
	node_weight_data,
	by = c("cat_id")) %>%
	extract_tree()

	weighted_adjacency_data <-
	summarytree_structure %>%
	as.data.frame(
	form = "adjacency_list") %>%
	left_join(
	mutate_at(node_weight_data, vars(cat_id), as.character),
	by = c("label" = "cat_id")) %>%
	ensure(
	!all_true(is.na(.$weight)))

	summarytree_data <-
	weighted_adjacency_data %>%
	replace_na(
	list(parent = 0, weight = 0)) %>%
	transmute(
	node = as.numeric(node),
	parent = as.numeric(parent),
	weight = as.numeric(weight),
	label = str_replace(label, "^([[:digit:]]+)$", "#\\1"),
	level = as.numeric(depth))

	})

	#'
	#' For each `k` in `{1, 2, ..., MAX_TREE_SIZE}`, pre-compute the optimal (most
	#' balanced) summary tree of size `k`.
	#'
	#' - The size `k` is just the number of nodes in a given summary tree.
	#' - The tool will let you thumb through these pre-computed trees in order.
	#'
	summarytree_solutions <- local({

	#'
	#' Increase `MAX_TREE_SIZE` if you find that you cannot explore deeply enough.
	#'
	MAX_TREE_SIZE <- 150

	#'
	#' If you're a perfectionist, and have time to spare, you can use `optimal()`
	#' instead of `greedy()` here.
	#'
	#' If you do, you'll have to pick a value for `epsilon`. This is the
	#' nonnegative upper bound of the additive error when using the approximate
	#' algorithm. Each `k`-node summary tree returned by the approximate algorithm
	#' will have entropy within `epsilon` of the maximal entropy of a `k`-node
	#' summary tree.
	#'
	#' See `?summarytrees::optimal` and `?summarytrees::greedy` for details.
	#'
	summarytree_solutions <-
	with(
	summarytree_input_data,
	summarytrees::greedy(
	node, parent, weight, label,
	#epsilon = 1.0,
	K = min(
	MAX_TREE_SIZE,
	n_distinct(summarytree_data$node))))

	})

	#'
	#' Prepare JSON for the interactive visualization.
	#'
	#' You can change some things here, like `units`, `node.width`, `node.height`,
	#' `print.weights`, and `color.level`. There doesn't seem to be a great way to
	#' get the labels to wrap --- an injected `</br>` just gets escaped --- but if
	#' you discover a way, please send me an email, and I'll update it here!
	#'
	summarytree_json <-
	with(
	summarytree_solutions,
	summarytrees::prepare.vis(
	tree.list = summary.trees,
	labels = str_wrap(data[, "label"], width = 10),
	tree = tree,
	legend.width = 100,
	node.width = 100,
	node.height = 18,
	units = str_c("tons/yr ", summarytree_scope$pol_abbr),
	print.weights = FALSE,
	legend.color = "lightsteelblue",
	color.level = 1))

	#'
	#' Actually show the visualization.
	#'
	summarytrees::draw.vis(
	json.object = summarytree_json,
	out.dir = tempfile(),
	open.browser = TRUE)

	#'
	#' Pause here.
	#'
	readline("Press any key to stop and clean up ...")

	#'
	#' Clean up. This is required if you want to re-run with new parameters;
	#' otherwise, you'll see the old tree and wonder why nothing changed.
	#'
	for (srv_id in servr::daemon_list()) {
	servr::daemon_stop(srv_id)
	}