Skip to content

Instantly share code, notes, and snippets.

@seananderson
Created March 24, 2023 02:13
Show Gist options
  • Save seananderson/2a7f8a36789a65031c7beb1ed8d9dcfc to your computer and use it in GitHub Desktop.
Save seananderson/2a7f8a36789a65031c7beb1ed8d9dcfc to your computer and use it in GitHub Desktop.
Spatial binning vs. individual-level observations
---
title: "Individual vs. blocked spatial data modelling"
format:
html:
embed-resources: true
editor: source
execute:
echo: true
eval: true
cache: true
---
```{r}
#| cache=FALSE,
#| message=FALSE, warning=FALSE
library(dplyr)
library(ggplot2)
library(sdmTMB)
library(future)
plan(multisession)
theme_set(theme_light())
options(ggplot2.continuous.colour = "viridis")
```
# A one-off example
Compare individual predictors and individual observations vs. binned predictors with binned observations.
```{r}
set.seed(123)
N <- 1000
predictor_dat <- data.frame(
X = runif(N), Y = runif(N)
)
mesh <- make_mesh(predictor_dat, xy_cols = c("X", "Y"), cutoff = 0.1)
```
Make spatially correlated predictor `a1`:
```{r}
pred_dat <- sdmTMB_simulate(
formula = ~1,
data = predictor_dat,
mesh = mesh,
family = gaussian(),
range = 0.7,
phi = 0.02,
sigma_O = 0.4,
seed = 1928,
B = c(1)
)
predictor_dat$a1 <- pred_dat$observed
ggplot(predictor_dat, aes(X, Y, colour = a1)) +
geom_point() +
scale_color_viridis_c()
```
Simulate observations with an intercept of 0.2 and a slope of 0.4 on the predictor a1:
```{r}
sim_dat <- sdmTMB_simulate(
formula = ~ 1 + a1,
data = predictor_dat,
mesh = mesh,
family = gaussian(),
range = 0.5,
phi = 0.1,
sigma_O = 0.2,
seed = 42,
B = c(0.2, 0.4) # B0 = intercept, B1 = a1 slope
)
ggplot(sim_dat, aes(X, Y, colour = observed)) +
geom_point() +
scale_color_viridis_c()
```
Fit a spatial model:
```{r}
fit <- sdmTMB(observed ~ a1, data = sim_dat, mesh = mesh)
fit
```
Aggregate into spatial blocks:
```{r}
bins <- seq(0, 1, 0.1)
sim_dat$Xbin <- bins[findInterval(sim_dat$X, bins)]
sim_dat$Ybin <- bins[findInterval(sim_dat$Y, bins)]
ggplot(sim_dat, aes(Xbin, Ybin, colour = observed)) +
geom_point() +
scale_color_viridis_c()
d <- group_by(sim_dat, Xbin, Ybin) |>
summarise(
X = Xbin[1], Y = Ybin[1], a1_avg = mean(a1),
obs_avg = mean(observed), .groups = "drop"
)
ggplot(d, aes(X, Y, colour = a1_avg)) +
geom_point() +
scale_color_viridis_c()
ggplot(d, aes(X, Y, colour = obs_avg)) +
geom_point() +
scale_color_viridis_c()
```
Fit a spatial and non-spatial (`lm()`) model to the blocked data:
```{r}
mesh_avg <- make_mesh(d, c("X", "Y"), cutoff = 0.1)
fit_avg <- sdmTMB(obs_avg ~ a1_avg, data = d, mesh = mesh_avg)
fit_avg
fit_no_spatial <- lm(obs_avg ~ a1_avg, data = d)
fit_no_spatial
broom::tidy(fit_no_spatial)
tidy(fit)
tidy(fit_avg)
```
# A repeated example to look at estimation performance
Do it many times and look at relative error (RE), CI coverage, and median absolute relative error (accuracy) on the 'a1' coefficient estimate:
```{r}
run_repeated_test <- function(i) {
set.seed(i)
N <- 1000
predictor_dat <- data.frame(
X = runif(N), Y = runif(N)
)
mesh <- make_mesh(predictor_dat, xy_cols = c("X", "Y"), cutoff = 0.1)
pred_dat <- sdmTMB_simulate(
formula = ~1,
data = predictor_dat,
mesh = mesh,
family = gaussian(),
# range = 0.1,
range = 0.6,
# phi = 0.2,
phi = 0.05,
# sigma_O = 0.1,
sigma_O = 0.4,
seed = i * 20121,
B = c(1)
)
predictor_dat$a1 <- pred_dat$observed
sim_dat <- sdmTMB_simulate(
formula = ~ 1 + a1,
data = predictor_dat,
mesh = mesh,
family = gaussian(),
range = 0.5,
phi = 0.1,
sigma_O = 0.2,
seed = i * 91818,
B = c(0.2, 0.4) # B0 = intercept, B1 = a1 slope
)
fit <- sdmTMB(observed ~ a1, data = sim_dat, mesh = mesh)
fit
bins <- seq(0, 1, 0.1)
sim_dat$Xbin <- bins[findInterval(sim_dat$X, bins)]
sim_dat$Ybin <- bins[findInterval(sim_dat$Y, bins)]
d <- group_by(sim_dat, Xbin, Ybin) |>
summarise(
X = Xbin[1], Y = Ybin[1], a1 = mean(a1),
obs_avg = mean(observed), .groups = "drop"
)
mesh_avg <- make_mesh(d, c("X", "Y"), cutoff = 0.1)
fit_avg <- sdmTMB(obs_avg ~ a1, data = d, mesh = mesh_avg)
fit_avg
fit_no_spatial <- lm(obs_avg ~ a1, data = d)
fit_no_spatial
e0 <- broom::tidy(fit_no_spatial, conf.int = TRUE) |>
select(term, estimate, conf.low, conf.high) |>
mutate(model = "binned lm")
e1 <- tidy(fit, conf.int = TRUE) |>
select(term, estimate, conf.low, conf.high) |>
mutate(model = "individual spatial")
e2 <- tidy(fit_avg, conf.int = TRUE) |>
select(term, estimate, conf.low, conf.high) |>
mutate(model = "binned spatial")
bind_rows(list(e0, e1, e2))
}
out <- furrr::future_map_dfr(
seq_len(100L), run_repeated_test,
.options = furrr::furrr_options(seed = TRUE)
)
```
Join on true values and calculate relative error:
```{r}
truth <- data.frame(
term = c("(Intercept)", "a1"), truth = c(0.2, 0.4),
stringsAsFactors = FALSE
)
out2 <- left_join(out, truth, by = join_by(term))
out2$RE <- (out2$estimate - out2$truth) / out2$truth
out2$covered <- out2$conf.high > out2$truth & out$conf.low < out2$truth
out2 <- filter(out2, term == "a1")
out2 <- out2 |>
ungroup() |>
arrange(model) |> # group_by(model, term) |>
mutate(iteration = seq_len(n()))
```
Look at relative error, 95% CI coverage, and median absolute relative error (MARE):
```{r}
#| fig.asp=0.5, fig.width=15
ggplot(out2, aes(iteration, estimate,
ymin = conf.low,
ymax = conf.high, colour = model
)) +
geom_pointrange() +
facet_wrap(~term, scales = "free_y") +
geom_hline(yintercept = 0.4, lty = 2)
```
```{r}
ggplot(out2, aes(model, RE)) +
geom_boxplot() +
geom_jitter(height = 0, width = 0.1, alpha = 0.5, pch = 21) +
facet_wrap(~term, scales = "free_y") +
geom_hline(yintercept = 0, lty = 2)
out2 |>
group_by(model, term) |>
summarise(mean_RE = mean(RE), MARE = median(abs(RE)), coverage = mean(covered), .groups = "drop") |>
arrange(term, rev(model)) |>
knitr::kable(digits = 2)
```
**Conclusion**: binning predictors and observations adds inaccuracy and reduces confidence interval coverage (CIs are too tight). Ignoring the spatial correlation (the `lm()` model) is really bad for accuracy and CI width.
# Do binned predictors need binned responses?
This shouldn't be the case, although I know it intuitively seems weird at first. Using average predictors is also known as group-level predictors and is a type of model commonly recommended in textbooks like [Gelman and Hill 2007](http://www.stat.columbia.edu/~gelman/arm/) on multilevel modeling.
Let's do the same thing as above but compare binned responses vs. individual responses (both with grouped predictors). There are 3 models here:
1. Individual observations, average predictors
2. Average observations, average predictors but keep same rows of data
3. Average observations, average predictors but condense down the data rows
```{r}
run_repeated_test2 <- function(i) {
set.seed(i)
N <- 1000
predictor_dat <- data.frame(
X = runif(N), Y = runif(N)
)
mesh <- make_mesh(predictor_dat, xy_cols = c("X", "Y"), cutoff = 0.1)
pred_dat <- sdmTMB_simulate(
formula = ~1,
data = predictor_dat,
mesh = mesh,
family = gaussian(),
range = 0.6,
phi = 0.05,
sigma_O = 0.4,
seed = i * 20121,
B = c(1)
)
predictor_dat$a1 <- pred_dat$observed
sim_dat <- sdmTMB_simulate(
formula = ~ 1 + a1,
data = predictor_dat,
mesh = mesh,
family = gaussian(),
range = 0.5,
phi = 0.1,
sigma_O = 0.2,
seed = i * 91818,
B = c(0.2, 0.4) # B0 = intercept, B1 = a1 slope
)
fit <- sdmTMB(observed ~ a1, data = sim_dat, mesh = mesh)
fit
bins <- seq(0, 1, 0.1)
sim_dat$Xbin <- bins[findInterval(sim_dat$X, bins)]
sim_dat$Ybin <- bins[findInterval(sim_dat$Y, bins)]
d <- group_by(sim_dat, Xbin, Ybin) |>
mutate( #< Changed here
X = Xbin, Y = Ybin, a1 = mean(a1), obs_avg = mean(observed), obs = observed, #< Changed here
.groups = "drop"
)
mesh_avg <- make_mesh(d, c("X", "Y"), cutoff = 0.1)
fit_obs_ind <- sdmTMB(obs ~ a1, data = d, mesh = mesh_avg) #< Changed here
fit_obs_avg <- sdmTMB(obs_avg ~ a1, data = d, mesh = mesh_avg) #< Changed here
d_condensed <- group_by(sim_dat, Xbin, Ybin) |>
summarize(
X = Xbin[1], Y = Ybin[1], a1 = mean(a1), obs_avg = mean(observed),
.groups = "drop"
)
mesh_avg <- make_mesh(d_condensed, c("X", "Y"), cutoff = 0.1)
fit_obs_avg_cond <- sdmTMB(obs_avg ~ a1, data = d_condensed, mesh = mesh_avg)
# e0 <- broom::tidy(fit_no_spatial, conf.int = TRUE) |>
# select(term, estimate, conf.low, conf.high) |>
# mutate(model = "binned lm")
e1 <- tidy(fit_obs_ind, conf.int = TRUE) |>
select(term, estimate, conf.low, conf.high) |>
mutate(model = "individual observations, binned predictor")
e2 <- tidy(fit_obs_avg, conf.int = TRUE) |>
select(term, estimate, conf.low, conf.high) |>
mutate(model = "binned observations, binned predictor")
e3 <- tidy(fit_obs_avg_cond, conf.int = TRUE) |>
select(term, estimate, conf.low, conf.high) |>
mutate(model = "binned observations, binned predictor, condensed data")
bind_rows(list(e1, e2, e3))
}
out <- furrr::future_map_dfr(
seq_len(150L), run_repeated_test2,
.options = furrr::furrr_options(seed = TRUE)
)
```
Join on true values and calculate relative error:
```{r}
truth <- data.frame(
term = c("(Intercept)", "a1"), truth = c(0.2, 0.4),
stringsAsFactors = FALSE
)
out2 <- left_join(out, truth, by = join_by(term))
out2$RE <- (out2$estimate - out2$truth) / out2$truth
out2$covered <- out2$conf.high > out2$truth & out$conf.low < out2$truth
out2 <- filter(out2, term == "a1")
out2 <- out2 |>
ungroup() |>
arrange(model) |> group_by(model, term) |>
mutate(iteration = seq_len(n()))
```
Look at relative error, 95% CI coverage, and median absolute relative error:
```{r}
#| fig.asp=1.2, fig.width=6
ggplot(out2, aes(iteration, estimate,
ymin = conf.low,
ymax = conf.high
)) +
geom_pointrange() +
facet_wrap(~term, scales = "free") +
geom_hline(yintercept = 0.4, lty = 2) +
facet_wrap(~model, nrow = 3)
```
```{r}
ggplot(out2, aes(model, RE)) +
geom_boxplot() +
geom_jitter(height = 0, width = 0.1, alpha = 0.5, pch = 21) +
facet_wrap(~term, scales = "free_y") +
geom_hline(yintercept = 0, lty = 2) +
coord_flip()
out2 |>
group_by(model, term) |>
summarise(mean_RE = mean(RE), MARE = median(abs(RE)), coverage = mean(covered), .groups = "drop") |>
arrange(term, rev(model)) |>
knitr::kable(digits = 2, format = "pipe")
```
Plot individual response estimates against binned/averaged response estimates:
```{r}
out2 <- out2 |>
ungroup() |>
group_by(model, term) |>
mutate(iteration = seq_len(n()))
d1 <- filter(out2, model == "binned observations, binned predictor, condensed data") |> ungroup()
d2 <- filter(out2, model == "individual observations, binned predictor") |> ungroup()
names(d1)[c(2, 3, 4)] <- paste0("binned_", names(d1)[c(2, 3, 4)])
names(d2)[c(2, 3, 4)] <- paste0("ind_", names(d2)[c(2, 3, 4)])
d <- cbind(d1, select(d2, 2:4))
ggplot(d, aes(ind_estimate, binned_estimate)) + geom_point(pch = 21) +
geom_linerange(aes(ymin = binned_conf.low, ymax = binned_conf.high)) +
geom_linerange(aes(xmin = ind_conf.low, xmax = ind_conf.high)) +
geom_abline(intercept = 0, slope = 1, lty = 2) +
geom_smooth(method = "lm", col = "blue", se = FALSE, formula = 'y ~ x')
```
```{r}
cor(d$ind_estimate, d$binned_estimate)^2
```
**Conclusion**:
- Keeping all the rows of data but replacing predictors and observations with average values leads to confidence intervals that are way too small.
- Condensing the dataset to single averages for each bin for both predictors and observations results in a similar result to condensing the predictor into averages but leaving the observations as individual-level observations.
- If anything, all else equal, working with individual-level observations but averaged predictors tends to make the estimated effect sizes *larger* compared to working with averaged predictors and averaged observations (last plot).
- None of these options are as good as having individual-level predictors and observations.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment