Last active
July 1, 2021 17:18
-
-
Save alexpkeil1/ff169e6933de2bd2f302f1fd6423e74e to your computer and use it in GitHub Desktop.
quantile g-computation with a suppressed intercept
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(qgcomp) | |
msm.fit.noint <- function(f, | |
qdata, | |
intvals, | |
expnms, | |
rr=TRUE, | |
main=TRUE, | |
degree=1, | |
id=NULL, | |
weights, | |
bayes=FALSE, | |
MCsize=nrow(qdata), ...){ | |
newform <- terms(f, data = qdata) | |
nobs = nrow(qdata) | |
thecall <- match.call(expand.dots = FALSE) | |
names(thecall) <- gsub("qdata", "data", names(thecall)) | |
names(thecall) <- gsub("f", "formula", names(thecall)) | |
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L) | |
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE) | |
thecall <- thecall[c(1L, m)] | |
thecall$drop.unused.levels <- TRUE | |
thecall[[1L]] <- quote(stats::model.frame) | |
thecalle <- eval(thecall, parent.frame()) | |
if(hasweights){ | |
qdata$weights <- as.vector(model.weights(thecalle)) | |
} else qdata$weights = rep(1, nobs) | |
if(is.null(id)) { | |
id <- "id__" | |
qdata$id__ <- seq_len(dim(qdata)[1]) | |
} | |
# conditional outcome regression fit | |
nidx = which(!(names(qdata) %in% id)) | |
if(!bayes) fit <- glm(newform, data = qdata, | |
weights=weights, | |
...) | |
if(bayes){ | |
requireNamespace("arm") | |
fit <- bayesglm(f, data = qdata[,nidx,drop=FALSE], | |
weights=weights, | |
...) | |
} | |
if(fit$family$family %in% c("gaussian", "poisson")) rr=FALSE | |
### | |
# get predictions (set exposure to 0,1,...,q-1) | |
if(is.null(intvals)){ | |
intvals <- (seq_len(length(table(qdata[expnms[1]])))) - 1 | |
} | |
predit <- function(idx, newdata){ | |
#newdata <- qdata | |
newdata[,expnms] <- idx | |
suppressWarnings(predict(fit, newdata=newdata, type='response')) | |
} | |
if(MCsize==nrow(qdata)){ | |
newdata <- qdata | |
}else{ | |
newids <- data.frame(temp=sort(sample(unique(qdata[,id, drop=TRUE]), MCsize, | |
#probs=weights, #bootstrap sampling with weights works with fixed weights, but not time-varying weights | |
replace = TRUE | |
))) | |
names(newids) <- id | |
newdata <- merge(qdata,newids, by=id, all.x=FALSE, all.y=TRUE)[seq_len(MCsize),] | |
} | |
predmat = lapply(intvals, predit, newdata=newdata) | |
# fit MSM using g-computation estimates of expected outcomes under joint | |
# intervention | |
#nobs <- dim(qdata)[1] | |
msmdat <- data.frame( | |
cbind( | |
Ya = unlist(predmat), | |
psi = rep(intvals, each=MCsize), | |
weights = rep(newdata$weights, times=length(intvals)) | |
#times=length(table(qdata[expnms[1]]))) | |
) | |
) | |
# to do: allow functional form variations for the MSM via specifying the model formula | |
if(bayes){ | |
if(!rr) suppressWarnings(msmfit <- bayesglm(Ya ~ -1 + poly(psi, degree=degree, raw=TRUE), data=msmdat, | |
weights=weights, x=TRUE, | |
...)) | |
if(rr) suppressWarnings(msmfit <- bayesglm(Ya ~ -1 + poly(psi, degree=degree, raw=TRUE), data=msmdat, | |
family=binomial(link='log'), start=rep(-0.0001, degree+1), | |
weights=weights, x=TRUE)) | |
} | |
if(!bayes){ | |
if(!rr) suppressWarnings(msmfit <- glm(Ya ~ -1 + poly(psi, degree=degree, raw=TRUE), data=msmdat, | |
weights=weights, x=TRUE, | |
...)) | |
if(rr) suppressWarnings(msmfit <- glm(Ya ~ -1 + poly(psi, degree=degree, raw=TRUE), data=msmdat, | |
family=binomial(link='log'), start=rep(-0.0001, degree+1), | |
weights=weights, x=TRUE)) | |
} | |
res <- list(fit=fit, msmfit=msmfit) | |
if(main) { | |
res$Ya <- msmdat$Ya # expected outcome under joint exposure, by gcomp | |
res$Yamsm <- predict(msmfit, type='response') | |
res$Yamsml <- predict(msmfit, type="link") | |
res$A <- msmdat$psi # joint exposure (0 = all exposures set category with | |
# upper cut-point as first quantile) | |
} | |
res | |
} | |
qgcomp.boot.noint <- function(f, | |
data, | |
expnms=NULL, | |
q=4, | |
breaks=NULL, | |
id=NULL, | |
weights, | |
alpha=0.05, | |
B=200, | |
rr=TRUE, | |
degree=1, | |
seed=NULL, | |
bayes=FALSE, | |
MCsize=nrow(data), | |
parallel=FALSE, ...){ | |
oldq = NULL | |
if(is.null(seed)) seed = round(runif(1, min=0, max=1e8)) | |
newform <- terms(f, data = data) | |
class(newform) <- "formula" | |
nobs = nrow(data) | |
origcall <- thecall <- match.call(expand.dots = FALSE) | |
names(thecall) <- gsub("f", "formula", names(thecall)) | |
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L) | |
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE) | |
thecall <- thecall[c(1L, m)] | |
thecall$drop.unused.levels <- TRUE | |
thecall[[1L]] <- quote(stats::model.frame) | |
thecalle <- eval(thecall, parent.frame()) | |
if(hasweights){ | |
data$weights <- as.vector(model.weights(thecalle)) | |
} else data$weights = rep(1, nobs) | |
if (is.null(expnms)) { | |
#expnms <- attr(terms(f, data = data), "term.labels") | |
expnms <- attr(newform, "term.labels") | |
message("Including all model terms as exposures of interest\n") | |
} | |
lin = qgcomp:::checknames(expnms) | |
if(!lin) stop("Model appears to be non-linear and I'm having trouble parsing it: | |
please use `expnms` parameter to define the variables making up the exposure") | |
if (!is.null(q) & !is.null(breaks)){ | |
# if user specifies breaks, prioritize those | |
oldq = q | |
q <- NULL | |
} | |
if (!is.null(q) | !is.null(breaks)){ | |
ql <- quantize(data, expnms, q, breaks) | |
qdata <- ql$data | |
br <- ql$breaks | |
if(is.null(q)){ | |
# rare scenario with user specified breaks and q is left at NULL | |
nvals <- length(br[[1]])-1 | |
} else{ | |
nvals <- q | |
} | |
intvals <- (seq_len(nvals))-1 | |
} else { | |
# if( is.null(breaks) & is.null(q)) # also includes NA | |
qdata <- data | |
# if no transformation is made (no quantiles, no breaks given) | |
# then draw distribution values from quantiles of all the exposures | |
# pooled together | |
# : allow user specification of this | |
nvals = length(table(unlist(data[,expnms]))) | |
if(nvals < 10){ | |
message("\nNote: using all possible values of exposure as the | |
intervention values\n") | |
p = length(expnms) | |
intvals <- as.numeric(names(table(unlist(data[,expnms])))) | |
br <- lapply(seq_len(p), function(x) c(-1e16, intvals[2:nvals]-1e-16, 1e16)) | |
}else{ | |
message("\nNote: using quantiles of all exposures combined in order to set | |
proposed intervention values for overall effect (25th, 50th, 75th %ile) | |
You can ensure this is valid by scaling all variables in expnms to have similar ranges.") | |
intvals = as.numeric(quantile(unlist(data[,expnms]), c(.25, .5, .75))) | |
br <- NULL | |
} | |
} | |
if(is.null(id)) { | |
id <- "id__" | |
qdata$id__ <- seq_len(dim(qdata)[1]) | |
} | |
### | |
msmfit <- msm.fit.noint(newform, qdata, intvals, expnms, rr, main=TRUE,degree=degree, id=id, | |
weights, | |
bayes, | |
MCsize=MCsize, | |
...) | |
# main estimate | |
#estb <- as.numeric(msmfit$msmfit$coefficients[-1]) | |
estb <- as.numeric(msmfit$msmfit$coefficients) | |
#bootstrap to get std. error | |
nobs <- dim(qdata)[1] | |
nids <- length(unique(qdata[,id, drop=TRUE])) | |
starttime = Sys.time() | |
psi.only <- function(i=1, f=f, qdata=qdata, intvals=intvals, expnms=expnms, rr=rr, degree=degree, | |
nids=nids, id=id, | |
weights,MCsize=MCsize, | |
...){ | |
if(i==2 & !parallel){ | |
timeiter = as.numeric(Sys.time() - starttime) | |
if((timeiter*B/60)>0.5) message(paste0("Expected time to finish: ", round(B*timeiter/60, 2), " minutes \n")) | |
} | |
bootids <- data.frame(temp=sort(sample(unique(qdata[,id, drop=TRUE]), nids, | |
replace = TRUE | |
))) | |
names(bootids) <- id | |
qdata_ <- merge(qdata,bootids, by=id, all.x=FALSE, all.y=TRUE) | |
ft = msm.fit.noint(f, qdata_, intvals, expnms, rr, main=FALSE, degree, id, weights=weights, bayes, MCsize=MCsize, | |
...) | |
yhatty = data.frame(yhat=predict(ft$msmfit), psi=ft$msmfit$data[,"psi"]) | |
as.numeric( | |
# the yhat estimates will be identical across individuals due to this being a marginal model | |
c(ft$msmfit$coefficients, with(yhatty, tapply(yhat, psi, mean))) | |
) | |
} | |
set.seed(seed) | |
if(parallel){ | |
#Sys.setenv(R_FUTURE_SUPPORTSMULTICORE_UNSTABLE="quiet") | |
future::plan(strategy = future::multisession) | |
bootsamps <- future.apply::future_lapply(X=seq_len(B), FUN=psi.only,f=f, qdata=qdata, intvals=intvals, | |
expnms=expnms, rr=rr, degree=degree, nids=nids, id=id, | |
weights=qdata$weights,MCsize=MCsize, | |
future.seed=TRUE, | |
...) | |
future::plan(strategy = future::transparent) | |
}else{ | |
bootsamps <- lapply(X=seq_len(B), FUN=psi.only,f=f, qdata=qdata, intvals=intvals, | |
expnms=expnms, rr=rr, degree=degree, nids=nids, id=id, | |
weights=weights, MCsize=MCsize, | |
...) | |
} | |
bootsamps = do.call("cbind", bootsamps) | |
# these are the linear predictors | |
hats = t(bootsamps[-c(seq_len(degree)),]) | |
# covariance of the linear predictors | |
cov.yhat = cov(hats) | |
bootsamps = bootsamps[seq_len(degree),, drop=FALSE] | |
seb <- apply(bootsamps, 1, sd) | |
covmat <- cov(t(bootsamps)) | |
colnames(covmat) <- rownames(covmat) <- names(estb) <- c(paste0("psi", seq_len(nrow(bootsamps)))) | |
tstat <- estb / seb | |
df <- nobs - length(attr(terms(f, data = data), "term.labels")) - 1 - degree # df based on obs - gcomp terms - msm terms | |
pval <- 2 - 2 * pt(abs(tstat), df = df) | |
pvalz <- 2 - 2 * pnorm(abs(tstat)) | |
ci <- cbind(estb + seb * qnorm(alpha / 2), estb + seb * qnorm(1 - alpha / 2)) | |
# outcome 'weights' not applicable in this setting, generally (i.e. if using this function for non-linearity, | |
# then weights will vary with level of exposure) | |
if (!is.null(oldq)){ | |
q = oldq | |
} | |
qx <- qdata[, expnms] | |
res <- list( | |
qx = qx, fit = msmfit$fit, msmfit = msmfit$msmfit, | |
psi = estb, var.psi = seb ^ 2, covmat.psi=covmat, ci = ci, | |
coef = estb, var.coef = seb ^ 2, covmat.coef=covmat, ci.coef = ci, | |
expnms=expnms, q=q, breaks=br, degree=degree, | |
pos.psi = NULL, neg.psi = NULL, | |
pos.weights = NULL, neg.weights = NULL, pos.size = NULL,neg.size = NULL, bootstrap=TRUE, | |
y.expected=msmfit$Ya, y.expectedmsm=msmfit$Yamsm, index=msmfit$A, | |
bootsamps = bootsamps, | |
cov.yhat=cov.yhat, | |
alpha=alpha, | |
call=origcall | |
) | |
if(msmfit$fit$family$family=='gaussian'){ | |
res$tstat <- tstat | |
res$df <- df | |
res$pval <- pval | |
} | |
if(msmfit$fit$family$family %in% c('binomial', 'poisson')){ | |
res$zstat <- tstat | |
res$pval <- pvalz | |
} | |
attr(res, "class") <- c("qgcompintfit", "qgcompfit") | |
res | |
} | |
qgcomp.noboot.noint <- function(f, | |
data, | |
expnms=NULL, | |
q=4, | |
breaks=NULL, | |
id=NULL, | |
weights, | |
alpha=0.05, | |
bayes=FALSE, | |
...){ | |
newform <- terms(f, data = data) | |
nobs = nrow(data) | |
origcall <- thecall <- match.call(expand.dots = FALSE) | |
names(thecall) <- gsub("f", "formula", names(thecall)) | |
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L) | |
#m <- match(c("f", "data", "weights", "offset"), names(thecall), 0L) | |
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE) | |
thecall <- thecall[c(1L, m)] | |
thecall$drop.unused.levels <- TRUE | |
thecall[[1L]] <- quote(stats::model.frame) | |
thecalle <- eval(thecall, parent.frame()) | |
if(hasweights){ | |
data$weights <- as.vector(model.weights(thecalle)) | |
} else data$weights = rep(1, nobs) | |
if (is.null(expnms)) { | |
#expnms <- attr(terms(f, data = data), "term.labels") | |
expnms <- attr(newform, "term.labels") | |
message("Including all model terms as exposures of interest\n") | |
} | |
lin = qgcomp:::checknames(expnms) | |
if(!lin) stop("Model appears to be non-linear: use qgcomp.boot instead") | |
if (!is.null(q) | !is.null(breaks)){ | |
ql <- quantize(data, expnms, q, breaks) | |
qdata <- ql$data | |
br <- ql$breaks | |
} else{ | |
qdata <- data | |
br <- breaks | |
} | |
if(is.null(id)) { | |
# not yet implemented | |
id = "id__" | |
qdata$id__ = seq_len(dim(qdata)[1]) | |
} | |
if(!bayes) fit <- glm(newform, data = qdata, | |
weights=weights, | |
...) | |
if(bayes){ | |
requireNamespace("arm") | |
fit <- bayesglm(newform, data = qdata, | |
weights=weights, | |
...) | |
} | |
#if(!bayes) fit <- glm(f, data = qdata[,!(names(qdata) %in% id), drop=FALSE], | |
# #weights=weights, | |
# ...) | |
#if(bayes){ | |
# requireNamespace("arm") | |
# fit <- bayesglm(f, data = qdata[,!(names(qdata) %in% id), drop=FALSE], | |
# #weights=weights, | |
# ...) | |
#} | |
mod <- summary(fit) | |
if(length(setdiff(expnms, rownames(mod$coefficients)))>0){ | |
stop("Model aliasing occurred, likely due to perfectly correlated quantized exposures. | |
Try one of the following: | |
1) set 'bayes' to TRUE in the qgcomp function (recommended) | |
2) set 'q' to a higher value in the qgcomp function (recommended) | |
3) check correlation matrix of exposures, and drop all but one variable in each highly correlated set (not recommended) | |
") | |
} | |
estb <- c(sum(mod$coefficients[expnms,1, drop=TRUE])) | |
seb <- c(qgcomp:::se_comb(expnms, covmat = mod$cov.scaled)) | |
tstat <- estb / seb | |
df <- mod$df.null - length(expnms) | |
pval <- 2 - 2 * pt(abs(tstat), df = df) | |
pvalz <- 2 - 2 * pnorm(abs(tstat)) | |
ci <- cbind(estb + seb * qnorm(alpha / 2), estb + seb * qnorm(1 - alpha / 2)) | |
# 'weights' | |
wcoef <- fit$coefficients[expnms] | |
names(wcoef) <- gsub("_q", "", names(wcoef)) | |
pos.coef <- which(wcoef > 0) | |
neg.coef <- which(wcoef <= 0) | |
pos.weights <- abs(wcoef[pos.coef]) / sum(abs(wcoef[pos.coef])) | |
neg.weights <- abs(wcoef[neg.coef]) / sum(abs(wcoef[neg.coef])) | |
# 'post-hoc' positive and negative estimators | |
# similar to constrained gWQS | |
pos.psi <- sum(wcoef[pos.coef]) | |
neg.psi <- sum(wcoef[neg.coef]) | |
#nmpos <- names(pos.weights) | |
#nmneg <- names(neg.weights) | |
#se.pos.psi <- se_comb(nmpos, covmat = mod$cov.scaled) | |
#se.neg.psi <- se_comb(nmneg, covmat = mod$cov.scaled) | |
qx <- qdata[, expnms] | |
names(qx) <- paste0(names(qx), "_q") | |
names(estb) <- c("psi1") | |
res <- list( | |
qx = qx, fit = fit, | |
psi = estb, var.psi = seb ^ 2, covmat.psi=c('psi1' = seb^2), ci = ci, | |
coef = estb, var.coef = seb ^ 2, | |
#covmat.coef=c('(Intercept)' = seb[1]^2, 'psi1' = seb[2]^2), | |
#covmat.coef=vc_comb(aname="(Intercept)", expnms=expnms, covmat = mod$cov.scaled), | |
covmat.coef=NULL, | |
ci.coef = ci, | |
expnms=expnms, q=q, breaks=br, degree=1, | |
pos.psi = pos.psi, neg.psi = neg.psi, | |
pos.weights = sort(pos.weights, decreasing = TRUE), | |
neg.weights = sort(neg.weights, decreasing = TRUE), | |
pos.size = sum(abs(wcoef[pos.coef])), | |
neg.size = sum(abs(wcoef[neg.coef])), | |
bootstrap=FALSE, | |
cov.yhat=NULL, | |
alpha=alpha, | |
call=origcall | |
) | |
if(fit$family$family=='gaussian'){ | |
res$tstat <- tstat | |
res$df <- df | |
res$pval <- pval | |
} | |
if(fit$family$family %in% c('binomial', 'poisson')){ | |
res$zstat <- tstat | |
res$pval <- pvalz | |
} | |
attr(res, "class") <- c("qgcompintfit", "qgcompfit") | |
res | |
} | |
#coef.qgcompintfit <- coef.qgcompfit | |
print.qgcompintfit <- function (x, showweights = TRUE, ...) | |
{ | |
fam <- x$fit$family$family | |
if (inherits(x, "ziqgcompfit")) { | |
stop("not implemented") | |
} | |
if (!is.null(x$pos.size) & showweights) { | |
cat(paste0("Scaled effect size (positive direction, sum of positive coefficients = ", | |
signif(x$pos.size, 3), ")\n")) | |
if (length(x$pos.weights) > 0) { | |
print(x$pos.weights, digits = 3) | |
} | |
else cat("None\n") | |
cat("\n") | |
} | |
if (!is.null(x$neg.size) & showweights) { | |
cat(paste0("Scaled effect size (negative direction, sum of negative coefficients = ", | |
signif(-x$neg.size, 3), ")\n")) | |
if (length(x$neg.weights) > 0) { | |
print(x$neg.weights, digits = 3) | |
} | |
else cat("None\n") | |
cat("\n") | |
} | |
if (fam == "binomial") { | |
estimand <- "OR" | |
if (x$bootstrap && x$msmfit$family$link == "log") | |
estimand = "RR" | |
cat(paste0("Mixture log(", estimand, ")", ifelse(x$bootstrap, | |
" (bootstrap CI)", " (Delta method CI)"), ":\n\n")) | |
testtype = "Z" | |
rnm = c(c(paste0("psi", 1:max(1, length(coef(x)))))) | |
} | |
if (fam == "poisson") { | |
estimand <- "RR" | |
cat(paste0("Mixture log(", estimand, ")", ifelse(x$bootstrap, | |
" (bootstrap CI)", " (Delta method CI)"), ":\n\n")) | |
testtype = "Z" | |
rnm = c(c(paste0("psi", 1:max(1, length(coef(x)))))) | |
} | |
if (fam == "gaussian") { | |
cat(paste0("Mixture slope parameters", ifelse(x$bootstrap, | |
" (bootstrap CI)", " (Delta method CI)"), ":\n\n")) | |
testtype = "t" | |
x$zstat = x$tstat | |
rnm = c(c(paste0("psi", 1:max(1, length(coef(x)))))) | |
} | |
if (fam == "cox") { | |
stop("not implemented") | |
} | |
if (!(fam %in% c("poisson", "binomial", "cox", "gaussian"))) { | |
warning(paste0("The ", fam, " distribution has not been tested with qgcomp! Please use with extreme caution\n and check results thoroughly with simulated data to ensure it works.")) | |
} | |
plab = ifelse(testtype == "Z", "Pr(>|z|)", "Pr(>|t|)") | |
if (is.null(dim(x$ci.coef))) { | |
pdat <- cbind(Estimate = coef(x), `Std. Error` = sqrt(x$var.coef), | |
`Lower CI` = x$ci.coef[1], `Upper CI` = x$ci.coef[2], | |
test = x$zstat, pval = x$pval) | |
} | |
else { | |
pdat <- cbind(Estimate = coef(x), `Std. Error` = sqrt(x$var.coef), | |
`Lower CI` = x$ci.coef[, 1], `Upper CI` = x$ci.coef[, | |
2], test = x$zstat, pval = x$pval) | |
} | |
colnames(pdat)[which(colnames(pdat) == "test")] = eval(paste(testtype, | |
"value")) | |
colnames(pdat)[which(colnames(pdat) == "pval")] = eval(paste(plab)) | |
rownames(pdat) <- rnm | |
printCoefmat(pdat, has.Pvalue = TRUE, tst.ind = 5L, signif.stars = FALSE, | |
cs.ind = 1L:2) | |
invisible(x) | |
} | |
# quick simulation examples | |
# true intercept is not zero: the methods should give different answers, suppressing intercept yields bias | |
dat <- qgcomp:::.dgm_quantized(N=200, b0=0, coef=c(.1, .1)) | |
dat$y = dat$y + 10 | |
lm(y ~ x1 + x2, data = dat) | |
lm(y ~ -1 + x1 + x2, data = dat) | |
(qf <- qgcomp.noboot(y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4)) | |
#(qfb <- qgcomp.noboot.noint(y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4)) # this will just report estimates without intercepts | |
(qfnoint <- qgcomp.noboot.noint(y ~ -1 + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4)) | |
(qfbnoint <- qgcomp.boot.noint(y ~ -1 + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4)) | |
# now when true intercept is zero, answers will be similar, suppressing intercept will improve precision | |
set.seed(1232) | |
dat <- qgcomp:::.dgm_quantized(N=500, b0=0, coef=c(.1, .1)) | |
lm(y ~ x1 + x2, data = dat) | |
lm(y ~ -1 + x1 + x2, data = dat) | |
(qf <- qgcomp.noboot(y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4)) | |
(qfnoint <- qgcomp.noboot.noint(y ~ -1 + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4)) | |
(qfbnoint <- qgcomp.boot.noint(y ~ -1 + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment