# Minimal example of a Bayesian finite sample approach for wildfires using brms
# @mbjoseph, created February 25, 2021

library(tidyverse)
library(sf)
library(here)
library(brms)
library(lubridate)
library(reshape2)
# Get ecoregion data ------------------------------------------------------
download.file("ftp://newftp.epa.gov/EPADataCommons/ORD/Ecoregions/us/us_eco_l4.zip",
destfile = "ecoregions.zip")
unzip("ecoregions.zip")
ecoregions <- st_read("us_eco_l4_no_st.shp") %>%
st_make_valid() %>%
# summarize to level 3 regions (assuming we don't want to go down to level 4)
group_by(L3_KEY, L2_KEY, L1_KEY) %>%
summarize()
ecoregions <- ecoregions %>%
  ungroup() %>%
  mutate(eco_area_sq_m = st_area(ecoregions),
         eco_area_sq_km = as.numeric(eco_area_sq_m) / 1e6)
# quick map to check that the ecoregion polygons look reasonable
ecoregions %>%
  ggplot() +
  geom_sf()
# Get MTBS data -----------------------------------------------------------
download.file("https://edcintl.cr.usgs.gov/downloads/sciweb1/shared/MTBS_Fire/data/composite_data/fod_pt_shapefile/mtbs_fod_pts_data.zip",
destfile = "mtbs.zip")
unzip("mtbs.zip")
mtbs <- st_read("mtbs_FODpoints_DD.shp") %>%
st_transform(st_crs(ecoregions)) %>%
filter(Incid_Type == "Wildfire",
BurnBndAc > 1e3) %>%
mutate(year = year(Ig_Date),
month = month(Ig_Date))
mtbs %>%
ggplot() +
geom_sf()
# Find the ecoregions for each MTBS event ---------------------------------
mtbs_eco <- st_intersection(mtbs, ecoregions)
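# Note (an alternative, not in the original gist): for point-in-polygon
# overlays like this one, sf::st_join() with left = FALSE gives the same
# inner-join behavior and is often faster:
# mtbs_eco <- st_join(mtbs, ecoregions, left = FALSE)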
# sanity check: map the Great Plains events, colored by level 2 ecoregion
mtbs_eco %>%
  filter(grepl("GREAT PLAINS", L1_KEY)) %>%
  ggplot() +
  geom_sf(aes(color = L2_KEY))
# Generate clean count and size data sets ---------------------------------
nonzero_counts <- mtbs_eco %>%
  as_tibble() %>%
  count(year, month, L3_KEY)

ecoregion_keys <- ecoregions %>%
  as_tibble() %>%
  select(ends_with("KEY"), eco_area_sq_km)
# start with a small subset of the data to reduce model run times
focal_l1_region <- "GREAT PLAINS"
max_year_train_set <- 2010
all_counts <- expand.grid(year = 1984:2018,
                          month = 1:12,
                          # assumes L3_KEY was read as a factor; if it is
                          # character, use unique(ecoregions$L3_KEY) instead
                          L3_KEY = levels(ecoregions$L3_KEY)) %>%
  as_tibble() %>%
  filter(!(year == 1984 & month == 1)) %>% # Jan 1984 is not in MTBS
  left_join(nonzero_counts) %>%
  mutate(n = ifelse(is.na(n), 0, n)) %>% # zero-fill months with no fires
  left_join(ecoregion_keys)
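# Quick sanity check (not in the original gist): the zero-filled panel should
# contain exactly one row per year-month-ecoregion combination
stopifnot(!any(duplicated(all_counts[, c("year", "month", "L3_KEY")])))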
train_counts <- all_counts %>%
  filter(
    grepl(focal_l1_region, L1_KEY), # start by focusing on one ecoregion
    year < max_year_train_set       # start with a subset of years
  )

train_sizes <- mtbs_eco %>%
  as_tibble() %>%
  filter(
    grepl(focal_l1_region, L1_KEY),
    year < max_year_train_set
  )

# withheld events: everything in the focal region that was not used to train
test_sizes <- mtbs_eco %>%
  as_tibble() %>%
  filter(grepl(focal_l1_region, L1_KEY)) %>%
  anti_join(train_sizes)
# Fit a count model -------------------------------------------------------
# s(month, bs = "cc") is a cyclic spline smoother (essentially an easy way to
# account for seasonality, though in a real model it would be better to
# include climate data instead); the log-area offset makes expected counts
# proportional to ecoregion area.
count_model <- brm(n ~ s(month, bs = "cc") + (1 | L2_KEY) + (1 | L3_KEY) +
                     offset(log(eco_area_sq_km)),
                   data = train_counts,
                   family = zero_inflated_negbinomial(),
                   cores = 4)
summary(count_model)
plot(conditional_effects(count_model), ask = FALSE)
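# Optional (not in the original gist): a quick posterior predictive check on
# the count model via brms' pp_check(); type = "bars" suits discrete counts.
pp_check(count_model, type = "bars")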
# Fit a fire size model ---------------------------------------------------
size_model <- brm(BurnBndAc ~ s(month, bs = "cc") + (1 | L2_KEY) + (1 | L3_KEY),
                  data = train_sizes,
                  family = lognormal(),
                  cores = 4)
summary(size_model)
plot(conditional_effects(size_model), ask = FALSE)
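# As above (not in the original gist), a posterior predictive check for the
# size model; the default density overlay suits a continuous outcome.
pp_check(size_model)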
# Combine outputs from count and size model to infer total burn area ------
test_counts <- anti_join(all_counts, train_counts) %>%
  filter(L3_KEY %in% train_counts$L3_KEY) %>%
  mutate(test_row_index = 1:n())
# generate a set of count predictions for each row in the test data
count_predictions <- predict(count_model,
                             summary = FALSE,
                             newdata = test_counts,
                             allow_new_levels = TRUE) %>%
  reshape2::melt(varnames = c("iter", "test_row_index")) %>%
  as_tibble() %>%
  filter(value > 0) # we only need to predict sizes for predicted events
## For each iteration, simulate total burned area values -------------------
# (note that this could be parallelized to speed things up, but a for loop
# may be easier to debug; a parallel sketch follows the loop)
max_iter <- 1000
size_predictions <- list()
pb <- txtProgressBar(max = max_iter, style = 3)
for (i in 1:max_iter) {
  size_pred_df <- count_predictions %>%
    filter(iter == i) %>%
    left_join(test_counts, by = "test_row_index") %>%
    # expand to one row per predicted event, so that totals sum over all
    # events in a month (value holds the predicted count for that row)
    uncount(value)
  # simulate size values
  size_matrix <- predict(size_model,
                         newdata = size_pred_df,
                         summary = FALSE,
                         allow_new_levels = TRUE)
  # choose one posterior draw at random
  random_draw <- sample(1:nrow(size_matrix), size = 1)
  # compute a prediction for total burned area
  size_pred_df$predicted_size <- size_matrix[random_draw, ]
  size_predictions[[i]] <- size_pred_df %>%
    select(iter, test_row_index, predicted_size)
  setTxtProgressBar(pb, i)
}
close(pb)
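# A sketch of the parallel alternative mentioned above (not in the original
# gist), using parallel::mclapply; this assumes a Unix-alike OS, since
# mc.cores > 1 is unsupported on Windows. The serial loop is the reference.
# size_predictions <- parallel::mclapply(1:max_iter, function(i) {
#   size_pred_df <- count_predictions %>%
#     filter(iter == i) %>%
#     left_join(test_counts, by = "test_row_index") %>%
#     uncount(value)
#   size_matrix <- predict(size_model, newdata = size_pred_df,
#                          summary = FALSE, allow_new_levels = TRUE)
#   size_pred_df$predicted_size <- size_matrix[sample(nrow(size_matrix), 1), ]
#   size_pred_df %>% select(iter, test_row_index, predicted_size)
# }, mc.cores = 4)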
# Generate a clean dataset of predictions ---------------------------------
posterior_preds <- bind_rows(size_predictions) %>%
  left_join(test_counts) %>%
  group_by(iter, L3_KEY, year, month) %>%
  summarize(n_events_pred = length(predicted_size),
            total_burned_area_pred = sum(predicted_size),
            maximum_fire_size_pred = max(predicted_size)) %>%
  ungroup() %>%
  # fill implicit zeros (occurs when no events are predicted)
  complete(iter, L3_KEY, year, month,
           fill = list(n_events_pred = 0,
                       total_burned_area_pred = 0,
                       maximum_fire_size_pred = 0)) %>%
  filter(!(year == 1984 & month == 1), # Jan 1984 is not in MTBS
         L3_KEY %in% train_counts$L3_KEY) %>%
  mutate(first_day_of_month = as.Date(paste(year,
                                            sprintf("%02d", month),
                                            "01",
                                            sep = "-")))
# Visualize predictions ---------------------------------------------------
# show one line per posterior iteration
posterior_preds %>%
  ggplot(aes(first_day_of_month, total_burned_area_pred)) +
  geom_line(aes(group = iter), alpha = .2) +
  facet_wrap(~L3_KEY, ncol = 2) +
  xlab("Date") +
  ylab("Total burned area (acres)")
# show posterior median & 95% credible interval + actual withheld data
test_summary <- test_sizes %>%
  mutate(first_day_of_month = as.Date(paste(year,
                                            sprintf("%02d", month),
                                            "01",
                                            sep = "-"))) %>%
  group_by(L3_KEY, first_day_of_month) %>%
  summarize(total_burned_area = sum(BurnBndAc)) %>%
  ungroup()
# predictions shown as ribbons, withheld data shown as points
posterior_preds %>%
  group_by(first_day_of_month, L3_KEY) %>%
  mutate(med = median(total_burned_area_pred),
         lo = quantile(total_burned_area_pred, .025),
         hi = quantile(total_burned_area_pred, .975)) %>%
  ggplot(aes(first_day_of_month)) +
  geom_ribbon(aes(ymin = lo, ymax = hi),
              color = NA, alpha = .4, fill = "darkred") +
  geom_line(aes(y = med)) +
  facet_wrap(~L3_KEY, ncol = 2) +
  xlab("Date") +
  ylab("Total burned area (acres)") +
  geom_point(data = test_summary, aes(y = total_burned_area)) +
  scale_y_log10()
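# Optional (not in the original gist): a rough empirical coverage check for
# months with at least one withheld event -- the fraction of observed totals
# falling inside the 95% interval. Months with zero withheld events are
# absent from test_summary, so this is only a partial check.
posterior_preds %>%
  group_by(first_day_of_month, L3_KEY) %>%
  summarize(lo = quantile(total_burned_area_pred, .025),
            hi = quantile(total_burned_area_pred, .975),
            .groups = "drop") %>%
  inner_join(test_summary, by = c("first_day_of_month", "L3_KEY")) %>%
  summarize(coverage = mean(total_burned_area >= lo & total_burned_area <= hi))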