Created
March 24, 2023 02:13
-
-
Save seananderson/2a7f8a36789a65031c7beb1ed8d9dcfc to your computer and use it in GitHub Desktop.
Spatial binning vs. individual-level observations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Individual vs. blocked spatial data modelling" | |
format: | |
html: | |
embed-resources: true | |
editor: source | |
execute: | |
echo: true | |
eval: true | |
cache: true | |
--- | |
```{r} | |
#| cache=FALSE, | |
#| message=FALSE, warning=FALSE | |
library(dplyr) | |
library(ggplot2) | |
library(sdmTMB) | |
library(future) | |
plan(multisession) | |
theme_set(theme_light()) | |
options(ggplot2.continuous.colour = "viridis") | |
``` | |
# A one-off example | |
Compare individual predictors and individual observations vs. binned predictors with binned observations. | |
```{r} | |
set.seed(123) | |
N <- 1000 | |
predictor_dat <- data.frame( | |
X = runif(N), Y = runif(N) | |
) | |
mesh <- make_mesh(predictor_dat, xy_cols = c("X", "Y"), cutoff = 0.1) | |
``` | |
Make spatially correlated predictor `a1`: | |
```{r} | |
pred_dat <- sdmTMB_simulate( | |
formula = ~1, | |
data = predictor_dat, | |
mesh = mesh, | |
family = gaussian(), | |
range = 0.7, | |
phi = 0.02, | |
sigma_O = 0.4, | |
seed = 1928, | |
B = c(1) | |
) | |
predictor_dat$a1 <- pred_dat$observed | |
ggplot(predictor_dat, aes(X, Y, colour = a1)) + | |
geom_point() + | |
scale_color_viridis_c() | |
``` | |
Simulate observations with an intercept of 0.2 and a slope of 0.4 on the predictor a1: | |
```{r} | |
sim_dat <- sdmTMB_simulate( | |
formula = ~ 1 + a1, | |
data = predictor_dat, | |
mesh = mesh, | |
family = gaussian(), | |
range = 0.5, | |
phi = 0.1, | |
sigma_O = 0.2, | |
seed = 42, | |
B = c(0.2, 0.4) # B0 = intercept, B1 = a1 slope | |
) | |
ggplot(sim_dat, aes(X, Y, colour = observed)) + | |
geom_point() + | |
scale_color_viridis_c() | |
``` | |
Fit a spatial model: | |
```{r} | |
fit <- sdmTMB(observed ~ a1, data = sim_dat, mesh = mesh) | |
fit | |
``` | |
Aggregate into spatial blocks: | |
```{r} | |
bins <- seq(0, 1, 0.1) | |
sim_dat$Xbin <- bins[findInterval(sim_dat$X, bins)] | |
sim_dat$Ybin <- bins[findInterval(sim_dat$Y, bins)] | |
ggplot(sim_dat, aes(Xbin, Ybin, colour = observed)) + | |
geom_point() + | |
scale_color_viridis_c() | |
d <- group_by(sim_dat, Xbin, Ybin) |> | |
summarise( | |
X = Xbin[1], Y = Ybin[1], a1_avg = mean(a1), | |
obs_avg = mean(observed), .groups = "drop" | |
) | |
ggplot(d, aes(X, Y, colour = a1_avg)) + | |
geom_point() + | |
scale_color_viridis_c() | |
ggplot(d, aes(X, Y, colour = obs_avg)) + | |
geom_point() + | |
scale_color_viridis_c() | |
``` | |
Fit a spatial and non-spatial (`lm()`) model to the blocked data: | |
```{r} | |
mesh_avg <- make_mesh(d, c("X", "Y"), cutoff = 0.1) | |
fit_avg <- sdmTMB(obs_avg ~ a1_avg, data = d, mesh = mesh_avg) | |
fit_avg | |
fit_no_spatial <- lm(obs_avg ~ a1_avg, data = d) | |
fit_no_spatial | |
broom::tidy(fit_no_spatial) | |
tidy(fit) | |
tidy(fit_avg) | |
``` | |
# A repeated example to look at estimation performance | |
Do it many times and look at relative error (RE), CI coverage, and median absolute relative error (accuracy) on the 'a1' coefficient estimate: | |
```{r} | |
run_repeated_test <- function(i) { | |
set.seed(i) | |
N <- 1000 | |
predictor_dat <- data.frame( | |
X = runif(N), Y = runif(N) | |
) | |
mesh <- make_mesh(predictor_dat, xy_cols = c("X", "Y"), cutoff = 0.1) | |
pred_dat <- sdmTMB_simulate( | |
formula = ~1, | |
data = predictor_dat, | |
mesh = mesh, | |
family = gaussian(), | |
# range = 0.1, | |
range = 0.6, | |
# phi = 0.2, | |
phi = 0.05, | |
# sigma_O = 0.1, | |
sigma_O = 0.4, | |
seed = i * 20121, | |
B = c(1) | |
) | |
predictor_dat$a1 <- pred_dat$observed | |
sim_dat <- sdmTMB_simulate( | |
formula = ~ 1 + a1, | |
data = predictor_dat, | |
mesh = mesh, | |
family = gaussian(), | |
range = 0.5, | |
phi = 0.1, | |
sigma_O = 0.2, | |
seed = i * 91818, | |
B = c(0.2, 0.4) # B0 = intercept, B1 = a1 slope | |
) | |
fit <- sdmTMB(observed ~ a1, data = sim_dat, mesh = mesh) | |
fit | |
bins <- seq(0, 1, 0.1) | |
sim_dat$Xbin <- bins[findInterval(sim_dat$X, bins)] | |
sim_dat$Ybin <- bins[findInterval(sim_dat$Y, bins)] | |
d <- group_by(sim_dat, Xbin, Ybin) |> | |
summarise( | |
X = Xbin[1], Y = Ybin[1], a1 = mean(a1), | |
obs_avg = mean(observed), .groups = "drop" | |
) | |
mesh_avg <- make_mesh(d, c("X", "Y"), cutoff = 0.1) | |
fit_avg <- sdmTMB(obs_avg ~ a1, data = d, mesh = mesh_avg) | |
fit_avg | |
fit_no_spatial <- lm(obs_avg ~ a1, data = d) | |
fit_no_spatial | |
e0 <- broom::tidy(fit_no_spatial, conf.int = TRUE) |> | |
select(term, estimate, conf.low, conf.high) |> | |
mutate(model = "binned lm") | |
e1 <- tidy(fit, conf.int = TRUE) |> | |
select(term, estimate, conf.low, conf.high) |> | |
mutate(model = "individual spatial") | |
e2 <- tidy(fit_avg, conf.int = TRUE) |> | |
select(term, estimate, conf.low, conf.high) |> | |
mutate(model = "binned spatial") | |
bind_rows(list(e0, e1, e2)) | |
} | |
out <- furrr::future_map_dfr( | |
seq_len(100L), run_repeated_test, | |
.options = furrr::furrr_options(seed = TRUE) | |
) | |
``` | |
Join on true values and calculate relative error: | |
```{r} | |
truth <- data.frame( | |
term = c("(Intercept)", "a1"), truth = c(0.2, 0.4), | |
stringsAsFactors = FALSE | |
) | |
out2 <- left_join(out, truth, by = join_by(term)) | |
out2$RE <- (out2$estimate - out2$truth) / out2$truth | |
out2$covered <- out2$conf.high > out2$truth & out$conf.low < out2$truth | |
out2 <- filter(out2, term == "a1") | |
out2 <- out2 |> | |
ungroup() |> | |
arrange(model) |> # group_by(model, term) |> | |
mutate(iteration = seq_len(n())) | |
``` | |
Look at relative error, 95% CI coverage, and median absolute relative error (MARE): | |
```{r} | |
#| fig.asp=0.5, fig.width=15 | |
ggplot(out2, aes(iteration, estimate, | |
ymin = conf.low, | |
ymax = conf.high, colour = model | |
)) + | |
geom_pointrange() + | |
facet_wrap(~term, scales = "free_y") + | |
geom_hline(yintercept = 0.4, lty = 2) | |
``` | |
```{r} | |
ggplot(out2, aes(model, RE)) + | |
geom_boxplot() + | |
geom_jitter(height = 0, width = 0.1, alpha = 0.5, pch = 21) + | |
facet_wrap(~term, scales = "free_y") + | |
geom_hline(yintercept = 0, lty = 2) | |
out2 |> | |
group_by(model, term) |> | |
summarise(mean_RE = mean(RE), MARE = median(abs(RE)), coverage = mean(covered), .groups = "drop") |> | |
arrange(term, rev(model)) |> | |
knitr::kable(digits = 2) | |
``` | |
**Conclusion**: binning predictors and observations adds inaccuracy and reduces confidence interval coverage (CIs are too tight). Ignoring the spatial correlation (the `lm()` model) is really bad for accuracy and CI width. | |
# Do binned predictors need binned responses? | |
This shouldn't be the case, although I know it intuitively seems weird at first. Using average predictors is also known as group-level predictors and is a type of model commonly recommended in textbooks like [Gelman and Hill 2007](http://www.stat.columbia.edu/~gelman/arm/) on multilevel modeling. | |
Let's do the same thing as above but compare binned responses vs. individual responses (both with grouped predictors). There are 3 models here: | |
1. Individual observations, average predictors | |
2. Average observations, average predictors but keep same rows of data | |
3. Average observations, average predictors but condense down the data rows | |
```{r} | |
run_repeated_test2 <- function(i) { | |
set.seed(i) | |
N <- 1000 | |
predictor_dat <- data.frame( | |
X = runif(N), Y = runif(N) | |
) | |
mesh <- make_mesh(predictor_dat, xy_cols = c("X", "Y"), cutoff = 0.1) | |
pred_dat <- sdmTMB_simulate( | |
formula = ~1, | |
data = predictor_dat, | |
mesh = mesh, | |
family = gaussian(), | |
range = 0.6, | |
phi = 0.05, | |
sigma_O = 0.4, | |
seed = i * 20121, | |
B = c(1) | |
) | |
predictor_dat$a1 <- pred_dat$observed | |
sim_dat <- sdmTMB_simulate( | |
formula = ~ 1 + a1, | |
data = predictor_dat, | |
mesh = mesh, | |
family = gaussian(), | |
range = 0.5, | |
phi = 0.1, | |
sigma_O = 0.2, | |
seed = i * 91818, | |
B = c(0.2, 0.4) # B0 = intercept, B1 = a1 slope | |
) | |
fit <- sdmTMB(observed ~ a1, data = sim_dat, mesh = mesh) | |
fit | |
bins <- seq(0, 1, 0.1) | |
sim_dat$Xbin <- bins[findInterval(sim_dat$X, bins)] | |
sim_dat$Ybin <- bins[findInterval(sim_dat$Y, bins)] | |
d <- group_by(sim_dat, Xbin, Ybin) |> | |
mutate( #< Changed here | |
X = Xbin, Y = Ybin, a1 = mean(a1), obs_avg = mean(observed), obs = observed, #< Changed here | |
.groups = "drop" | |
) | |
mesh_avg <- make_mesh(d, c("X", "Y"), cutoff = 0.1) | |
fit_obs_ind <- sdmTMB(obs ~ a1, data = d, mesh = mesh_avg) #< Changed here | |
fit_obs_avg <- sdmTMB(obs_avg ~ a1, data = d, mesh = mesh_avg) #< Changed here | |
d_condensed <- group_by(sim_dat, Xbin, Ybin) |> | |
summarize( | |
X = Xbin[1], Y = Ybin[1], a1 = mean(a1), obs_avg = mean(observed), | |
.groups = "drop" | |
) | |
mesh_avg <- make_mesh(d_condensed, c("X", "Y"), cutoff = 0.1) | |
fit_obs_avg_cond <- sdmTMB(obs_avg ~ a1, data = d_condensed, mesh = mesh_avg) | |
# e0 <- broom::tidy(fit_no_spatial, conf.int = TRUE) |> | |
# select(term, estimate, conf.low, conf.high) |> | |
# mutate(model = "binned lm") | |
e1 <- tidy(fit_obs_ind, conf.int = TRUE) |> | |
select(term, estimate, conf.low, conf.high) |> | |
mutate(model = "individual observations, binned predictor") | |
e2 <- tidy(fit_obs_avg, conf.int = TRUE) |> | |
select(term, estimate, conf.low, conf.high) |> | |
mutate(model = "binned observations, binned predictor") | |
e3 <- tidy(fit_obs_avg_cond, conf.int = TRUE) |> | |
select(term, estimate, conf.low, conf.high) |> | |
mutate(model = "binned observations, binned predictor, condensed data") | |
bind_rows(list(e1, e2, e3)) | |
} | |
out <- furrr::future_map_dfr( | |
seq_len(150L), run_repeated_test2, | |
.options = furrr::furrr_options(seed = TRUE) | |
) | |
``` | |
Join on true values and calculate relative error: | |
```{r} | |
truth <- data.frame( | |
term = c("(Intercept)", "a1"), truth = c(0.2, 0.4), | |
stringsAsFactors = FALSE | |
) | |
out2 <- left_join(out, truth, by = join_by(term)) | |
out2$RE <- (out2$estimate - out2$truth) / out2$truth | |
out2$covered <- out2$conf.high > out2$truth & out$conf.low < out2$truth | |
out2 <- filter(out2, term == "a1") | |
out2 <- out2 |> | |
ungroup() |> | |
arrange(model) |> group_by(model, term) |> | |
mutate(iteration = seq_len(n())) | |
``` | |
Look at relative error, 95% CI coverage, and median absolute relative error: | |
```{r} | |
#| fig.asp=1.2, fig.width=6 | |
ggplot(out2, aes(iteration, estimate, | |
ymin = conf.low, | |
ymax = conf.high | |
)) + | |
geom_pointrange() + | |
facet_wrap(~term, scales = "free") + | |
geom_hline(yintercept = 0.4, lty = 2) + | |
facet_wrap(~model, nrow = 3) | |
``` | |
```{r} | |
ggplot(out2, aes(model, RE)) + | |
geom_boxplot() + | |
geom_jitter(height = 0, width = 0.1, alpha = 0.5, pch = 21) + | |
facet_wrap(~term, scales = "free_y") + | |
geom_hline(yintercept = 0, lty = 2) + | |
coord_flip() | |
out2 |> | |
group_by(model, term) |> | |
summarise(mean_RE = mean(RE), MARE = median(abs(RE)), coverage = mean(covered), .groups = "drop") |> | |
arrange(term, rev(model)) |> | |
knitr::kable(digits = 2, format = "pipe") | |
``` | |
Plot individual response estimates against binned/averaged response estimates: | |
```{r} | |
out2 <- out2 |> | |
ungroup() |> | |
group_by(model, term) |> | |
mutate(iteration = seq_len(n())) | |
d1 <- filter(out2, model == "binned observations, binned predictor, condensed data") |> ungroup() | |
d2 <- filter(out2, model == "individual observations, binned predictor") |> ungroup() | |
names(d1)[c(2, 3, 4)] <- paste0("binned_", names(d1)[c(2, 3, 4)]) | |
names(d2)[c(2, 3, 4)] <- paste0("ind_", names(d2)[c(2, 3, 4)]) | |
d <- cbind(d1, select(d2, 2:4)) | |
ggplot(d, aes(ind_estimate, binned_estimate)) + geom_point(pch = 21) + | |
geom_linerange(aes(ymin = binned_conf.low, ymax = binned_conf.high)) + | |
geom_linerange(aes(xmin = ind_conf.low, xmax = ind_conf.high)) + | |
geom_abline(intercept = 0, slope = 1, lty = 2) + | |
geom_smooth(method = "lm", col = "blue", se = FALSE, formula = 'y ~ x') | |
``` | |
```{r} | |
cor(d$ind_estimate, d$binned_estimate)^2 | |
``` | |
**Conclusion**: | |
- Keeping all the rows of data but replacing predictors and observations with average values leads to confidence intervals that are way too small. | |
- Condensing the dataset to single averages for each bin for both predictors and observations results in a similar result to condensing the predictor into averages but leaving the observations as individual-level observations. | |
- If anything, all else equal, working with individual-level observations but averaged predictors tends to make the estimated effect sizes *larger* compared to working with averaged predictors and averaged observations (last plot). | |
- None of these options are as good as having individual-level predictors and observations. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment