Skip to content

Instantly share code, notes, and snippets.

@noamross
Last active January 21, 2023 20:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save noamross/ca6906e32f897e3729de40610beffcd1 to your computer and use it in GitHub Desktop.
Save noamross/ca6906e32f897e3729de40610beffcd1 to your computer and use it in GitHub Desktop.
Testing reduced-rank missing data strategies with mgcv
# Testing missing data strategies with mgcv
library(mgcv)
library(MRFtools) # github.com/noamross/MRFtools
library(tidyverse)
# Simulate data with missing parts
n <- 350;set.seed(2)
dat <- gamSim(1,n=n,scale=3) ## 1 or 7
drop <- sample(1:n,300) ## to
for (i in 2:5) dat[drop[1:75+(i-2)*75],i] <- NA
dname <- names(dat)[2:5]
dat1 <- dat
for (i in 1:4) {
by.name <- paste("m",dname[i],sep="")
dat1[[by.name]] <- is.na(dat1[[dname[i]]])
dat1[[dname[i]]][dat1[[by.name]]] <- mean(dat1[[dname[i]]],na.rm=TRUE)
lev <- rep(1,n);lev[dat1[[by.name]]] <- 1:sum(dat1[[by.name]])
id.name <- paste("id",dname[i],sep="")
dat1[[id.name]] <- factor(lev)
dat1[[by.name]] <- as.numeric(dat1[[by.name]])
}
# Fit a model per the ?mgcv::missing.data.example, modeling missing data as random effect levels
mod_re <- bam(y~s(x0,by=ordered(!mx0))+s(x1,by=ordered(!mx1))+
s(x2,by=ordered(!mx2))+s(x3,by=ordered(!mx3))+
s(idx0,bs="re",by=mx0)+s(idx1,bs="re",by=mx1)+
s(idx2,bs="re",by=mx2)+s(idx3,bs="re",by=mx3)
,data=dat1,discrete=TRUE)
# Fit a model using "mrf" instead of "re"
p0 <- MRFtools::mrf_penalty(dat1$idx0, type = "individual") # Same as diag(nlevels(dat$idx0))
p1 <- MRFtools::mrf_penalty(dat1$idx1, type = "individual")
p2 <- MRFtools::mrf_penalty(dat1$idx2, type = "individual")
p3 <- MRFtools::mrf_penalty(dat1$idx3, type = "individual")
mod_mrf_full <- bam(y~s(x0,by=ordered(!mx0))+s(x1,by=ordered(!mx1))+
s(x2,by=ordered(!mx2))+s(x3,by=ordered(!mx3))+
s(idx0,bs="mrf", xt = list(penalty = p0), by=mx0) +
s(idx1,bs="mrf", xt = list(penalty = p1), by=mx1) +
s(idx2,bs="mrf", xt = list(penalty = p2), by=mx2) +
s(idx3,bs="mrf", xt = list(penalty = p3), by=mx3)
,data=dat1,discrete=TRUE)
# Now again but used reduced-rank MRFs terms
mod_mrf_reduced <- bam(y~s(x0,by=ordered(!mx0))+s(x1,by=ordered(!mx1))+
s(x2,by=ordered(!mx2))+s(x3,by=ordered(!mx3))+
s(idx0,bs="mrf", xt = list(penalty = p0), by=mx0, k = 5) +
s(idx1,bs="mrf", xt = list(penalty = p1), by=mx1, k = 5) +
s(idx2,bs="mrf", xt = list(penalty = p2), by=mx2, k = 5) +
s(idx3,bs="mrf", xt = list(penalty = p3), by=mx3, k = 5)
,data=dat1,discrete=TRUE)
## fit the model to the `complete case' data...
mod_complete <- bam(y~s(x0)+s(x1)+s(x2)+s(x3),data=dat,method="REML")
# Plot all the model smooths for comparison
par(mfrow=c(4,4),mar=c(4,4,1,1))
for (i in 1:4) plot(mod_re,select=i) ## plot the smooth effects from random-effects model
for (i in 1:4) plot(mod_mrf_full,select=i) ## plot the smooth effects from mrf model
for (i in 1:4) plot(mod_mrf_reduced,select=i) ## plot the smooth effects from reduced-rank mrf model
for (i in 1:4) plot(mod_complete,select=i) ## plot the smooth effects from the complete data model
models <- list(
mod_re = mod_re,
mod_mrf_full = mod_mrf_full,
mod_mrf_reduced = mod_mrf_reduced,
mod_complete = mod_complete
)
tibble(
model = names(models),
summary = map(models, summary),
n_coef = map_int(models, \(x) length(coef(x))),
r_sq = map_dbl(summary, "r.sq"),
dev_expl = map_dbl(summary, "dev.expl"),
sigma = map_dbl(models, "sig2")
) |>
bind_cols(map_dfr(models, \(x) {out <- as.data.frame(t(summary(x)$edf[1:4])); colnames(out) <- paste0("edf", 1:4); out}))
# How do predictions and performance compare?
par(mfrow = c(1,1))
plot(predict(mod_re), predict(mod_mrf_reduced))
cor(predict(mod_re), predict(mod_mrf_reduced))
# But what if we only look at complete data?
cc <- complete.cases(dat)
plot(predict(mod_re)[cc], predict(mod_mrf_reduced)[cc])
cor(predict(mod_re)[cc], predict(mod_mrf_reduced)[cc])
# How about individual terms?
par(mfrow = c(2,2))
plot(
predict(mod_re, type = "terms", terms = "s(x0):ordered(!mx0)TRUE")[cc,1],
predict(mod_mrf_reduced, type = "terms", terms = "s(x0):ordered(!mx0)TRUE")[cc,1]
)
plot(
predict(mod_re, type = "terms", terms = "s(x1):ordered(!mx1)TRUE")[cc,1],
predict(mod_mrf_reduced, type = "terms", terms = "s(x1):ordered(!mx1)TRUE")[cc,1]
)
plot(
predict(mod_re, type = "terms", terms = "s(x2):ordered(!mx2)TRUE")[cc,1],
predict(mod_mrf_reduced, type = "terms", terms = "s(x2):ordered(!mx2)TRUE")[cc,1]
)
plot(
predict(mod_re, type = "terms", terms = "s(x3):ordered(!mx3)TRUE")[cc,1],
predict(mod_mrf_reduced, type = "terms", terms = "s(x3):ordered(!mx3)TRUE")[cc,1]
)
# How do coefficients and variances compare?
par(mfrow = c(3,2))
vc_re <- vcov(mod_re)
vc_mrf_reduced <- vcov(mod_mrf_reduced)
re_levels_0 <- stringi::stri_subset_fixed(rownames(vc_re), "s(idx0):")
mrf_levels_0 <- stringi::stri_subset_fixed(rownames(vc_mrf_reduced), "s(idx0):")
plot(coef(mod_re)[re_levels_0])
plot(coef(mod_mrf_reduced)[mrf_levels_0])
plot(sqrt(diag(vc_re[re_levels_0, re_levels_0])))
plot(sqrt(diag(vc_mrf_reduced[mrf_levels_0, mrf_levels_0])))
image(vc_re[re_levels_0, re_levels_0])
image(vc_mrf_reduced[mrf_levels_0, mrf_levels_0])
# How about looking at the variance component?
options(scipen = 10)
gam.vcomp(mod_re)
gam.vcomp(mod_mrf_full)
gam.vcomp(mod_mrf_reduced)
# Can we drop large penalty matrices from memory?
mod_mrf_reduced <- bam(y~s(x0,by=ordered(!mx0))+s(x1,by=ordered(!mx1))+
s(x2,by=ordered(!mx2))+s(x3,by=ordered(!mx3))+
s(idx0,bs="mrf", xt = list(penalty = p0), by=mx0, k = 5) +
s(idx1,bs="mrf", xt = list(penalty = p1), by=mx1, k = 5) +
s(idx2,bs="mrf", xt = list(penalty = p2), by=mx2, k = 5) +
s(idx3,bs="mrf", xt = list(penalty = p3), by=mx3, k = 5)
,data=dat1,discrete=TRUE, fit = FALSE)
for(i in 5:8) {
mod_mrf_reduced$smooth[[i]]$xt <- NULL
}
out <- bam(G=mod_mrf_reduced)
plot(out)
predict(out)
gam.check(out)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment