# quantile g-computation with a suppressed intercept
# author: @alexpkeil1 (GitHub gist ff169e6933de2bd2f302f1fd6423e74e, last active July 1, 2021)
library(qgcomp)
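# This file defines no-intercept ("noint") variants of the qgcomp fitting functions:
#   msm.fit.noint()       - underlying fit: conditional GLM plus a marginal structural model (MSM) with its intercept suppressed
#   qgcomp.boot.noint()   - bootstrap-based quantile g-computation built on msm.fit.noint()
#   qgcomp.noboot.noint() - non-bootstrap quantile g-computation (psi = sum of exposure coefficients)
#   print.qgcompintfit()  - print method for the returned "qgcompintfit" objects
# Optional dependencies: 'arm' for bayes=TRUE; 'future' and 'future.apply' for parallel=TRUE.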
msm.fit.noint <- function(f,
qdata,
intvals,
expnms,
rr=TRUE,
main=TRUE,
degree=1,
id=NULL,
weights,
bayes=FALSE,
MCsize=nrow(qdata), ...){
newform <- terms(f, data = qdata)
nobs = nrow(qdata)
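# rebuild the call as a stats::model.frame() call so that any user-supplied
# `weights` argument is evaluated against the data and recovered as a numeric vector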
thecall <- match.call(expand.dots = FALSE)
names(thecall) <- gsub("qdata", "data", names(thecall))
names(thecall) <- gsub("f", "formula", names(thecall))
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L)
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE)
thecall <- thecall[c(1L, m)]
thecall$drop.unused.levels <- TRUE
thecall[[1L]] <- quote(stats::model.frame)
thecalle <- eval(thecall, parent.frame())
if(hasweights){
qdata$weights <- as.vector(model.weights(thecalle))
} else qdata$weights = rep(1, nobs)
if(is.null(id)) {
id <- "id__"
qdata$id__ <- seq_len(dim(qdata)[1])
}
# conditional outcome regression fit
nidx = which(!(names(qdata) %in% id))
if(!bayes) fit <- glm(newform, data = qdata,
weights=weights,
...)
if(bayes){
requireNamespace("arm")
fit <- bayesglm(f, data = qdata[,nidx,drop=FALSE],
weights=weights,
...)
}
if(fit$family$family %in% c("gaussian", "poisson")) rr=FALSE
###
# get predictions (set exposure to 0,1,...,q-1)
if(is.null(intvals)){
intvals <- (seq_len(length(table(qdata[expnms[1]])))) - 1
}
predit <- function(idx, newdata){
#newdata <- qdata
newdata[,expnms] <- idx
suppressWarnings(predict(fit, newdata=newdata, type='response'))
}
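# if MCsize equals the number of rows, use the quantized data as-is; otherwise
# resample ids with replacement to form a Monte Carlo sample of size MCsize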
if(MCsize==nrow(qdata)){
newdata <- qdata
}else{
newids <- data.frame(temp=sort(sample(unique(qdata[,id, drop=TRUE]), MCsize,
#probs=weights, #bootstrap sampling with weights works with fixed weights, but not time-varying weights
replace = TRUE
)))
names(newids) <- id
newdata <- merge(qdata,newids, by=id, all.x=FALSE, all.y=TRUE)[seq_len(MCsize),]
}
predmat = lapply(intvals, predit, newdata=newdata)
# fit MSM using g-computation estimates of expected outcomes under joint
# intervention
#nobs <- dim(qdata)[1]
msmdat <- data.frame(
cbind(
Ya = unlist(predmat),
psi = rep(intvals, each=MCsize),
weights = rep(newdata$weights, times=length(intvals))
#times=length(table(qdata[expnms[1]])))
)
)
# to do: allow functional form variations for the MSM via specifying the model formula
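# note: the `-1` in the MSM formulas below suppresses the intercept, which is the
# change this gist makes relative to the standard qgcomp MSM fit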
if(bayes){
if(!rr) suppressWarnings(msmfit <- bayesglm(Ya ~ -1 + poly(psi, degree=degree, raw=TRUE), data=msmdat,
weights=weights, x=TRUE,
...))
if(rr) suppressWarnings(msmfit <- bayesglm(Ya ~ -1 + poly(psi, degree=degree, raw=TRUE), data=msmdat,
family=binomial(link='log'), start=rep(-0.0001, degree+1),
weights=weights, x=TRUE))
}
if(!bayes){
if(!rr) suppressWarnings(msmfit <- glm(Ya ~ -1 + poly(psi, degree=degree, raw=TRUE), data=msmdat,
weights=weights, x=TRUE,
...))
if(rr) suppressWarnings(msmfit <- glm(Ya ~ -1 + poly(psi, degree=degree, raw=TRUE), data=msmdat,
family=binomial(link='log'), start=rep(-0.0001, degree+1),
weights=weights, x=TRUE))
}
res <- list(fit=fit, msmfit=msmfit)
if(main) {
res$Ya <- msmdat$Ya # expected outcome under joint exposure, by gcomp
res$Yamsm <- predict(msmfit, type='response')
res$Yamsml <- predict(msmfit, type="link")
res$A <- msmdat$psi # joint exposure (0 = all exposures set to the category whose
# upper cut-point is the first quantile)
}
res
}
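# bootstrap-based quantile g-computation with a suppressed MSM intercept;
# analogous to qgcomp::qgcomp.boot(), but it calls msm.fit.noint() and keeps
# every MSM coefficient as a psi parameter (there is no intercept to drop)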
qgcomp.boot.noint <- function(f,
data,
expnms=NULL,
q=4,
breaks=NULL,
id=NULL,
weights,
alpha=0.05,
B=200,
rr=TRUE,
degree=1,
seed=NULL,
bayes=FALSE,
MCsize=nrow(data),
parallel=FALSE, ...){
oldq = NULL
if(is.null(seed)) seed = round(runif(1, min=0, max=1e8))
newform <- terms(f, data = data)
class(newform) <- "formula"
nobs = nrow(data)
origcall <- thecall <- match.call(expand.dots = FALSE)
names(thecall) <- gsub("f", "formula", names(thecall))
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L)
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE)
thecall <- thecall[c(1L, m)]
thecall$drop.unused.levels <- TRUE
thecall[[1L]] <- quote(stats::model.frame)
thecalle <- eval(thecall, parent.frame())
if(hasweights){
data$weights <- as.vector(model.weights(thecalle))
} else data$weights = rep(1, nobs)
if (is.null(expnms)) {
#expnms <- attr(terms(f, data = data), "term.labels")
expnms <- attr(newform, "term.labels")
message("Including all model terms as exposures of interest\n")
}
lin = qgcomp:::checknames(expnms)
if(!lin) stop("Model appears to be non-linear and I'm having trouble parsing it:
please use `expnms` parameter to define the variables making up the exposure")
if (!is.null(q) & !is.null(breaks)){
# if user specifies breaks, prioritize those
oldq = q
q <- NULL
}
if (!is.null(q) | !is.null(breaks)){
ql <- quantize(data, expnms, q, breaks)
qdata <- ql$data
br <- ql$breaks
if(is.null(q)){
# rare scenario with user specified breaks and q is left at NULL
nvals <- length(br[[1]])-1
} else{
nvals <- q
}
intvals <- (seq_len(nvals))-1
} else {
# if( is.null(breaks) & is.null(q)) # also includes NA
qdata <- data
# if no transformation is made (no quantiles, no breaks given)
# then draw distribution values from quantiles of all the exposures
# pooled together
# to do: allow user specification of this
nvals = length(table(unlist(data[,expnms])))
if(nvals < 10){
message("\nNote: using all possible values of exposure as the
intervention values\n")
p = length(expnms)
intvals <- as.numeric(names(table(unlist(data[,expnms]))))
br <- lapply(seq_len(p), function(x) c(-1e16, intvals[2:nvals]-1e-16, 1e16))
}else{
message("\nNote: using quantiles of all exposures combined in order to set
proposed intervention values for overall effect (25th, 50th, 75th %ile)
You can ensure this is valid by scaling all variables in expnms to have similar ranges.")
intvals = as.numeric(quantile(unlist(data[,expnms]), c(.25, .5, .75)))
br <- NULL
}
}
if(is.null(id)) {
id <- "id__"
qdata$id__ <- seq_len(dim(qdata)[1])
}
###
msmfit <- msm.fit.noint(newform, qdata, intvals, expnms, rr, main=TRUE,degree=degree, id=id,
weights,
bayes,
MCsize=MCsize,
...)
# main estimate
#estb <- as.numeric(msmfit$msmfit$coefficients[-1])
estb <- as.numeric(msmfit$msmfit$coefficients)
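# with the intercept suppressed, every MSM coefficient is a psi parameter, so none
# are dropped here (the commented line above is the with-intercept version)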
#bootstrap to get std. error
nobs <- dim(qdata)[1]
nids <- length(unique(qdata[,id, drop=TRUE]))
starttime = Sys.time()
psi.only <- function(i=1, f=f, qdata=qdata, intvals=intvals, expnms=expnms, rr=rr, degree=degree,
nids=nids, id=id,
weights,MCsize=MCsize,
...){
if(i==2 & !parallel){
timeiter = as.numeric(Sys.time() - starttime)
if((timeiter*B/60)>0.5) message(paste0("Expected time to finish: ", round(B*timeiter/60, 2), " minutes \n"))
}
bootids <- data.frame(temp=sort(sample(unique(qdata[,id, drop=TRUE]), nids,
replace = TRUE
)))
names(bootids) <- id
qdata_ <- merge(qdata,bootids, by=id, all.x=FALSE, all.y=TRUE)
ft = msm.fit.noint(f, qdata_, intvals, expnms, rr, main=FALSE, degree, id, weights=weights, bayes, MCsize=MCsize,
...)
yhatty = data.frame(yhat=predict(ft$msmfit), psi=ft$msmfit$data[,"psi"])
as.numeric(
# the yhat estimates will be identical across individuals due to this being a marginal model
c(ft$msmfit$coefficients, with(yhatty, tapply(yhat, psi, mean)))
)
}
set.seed(seed)
if(parallel){
#Sys.setenv(R_FUTURE_SUPPORTSMULTICORE_UNSTABLE="quiet")
future::plan(strategy = future::multisession)
bootsamps <- future.apply::future_lapply(X=seq_len(B), FUN=psi.only,f=f, qdata=qdata, intvals=intvals,
expnms=expnms, rr=rr, degree=degree, nids=nids, id=id,
weights=qdata$weights,MCsize=MCsize,
future.seed=TRUE,
...)
future::plan(strategy = future::transparent)
}else{
bootsamps <- lapply(X=seq_len(B), FUN=psi.only,f=f, qdata=qdata, intvals=intvals,
expnms=expnms, rr=rr, degree=degree, nids=nids, id=id,
weights=weights, MCsize=MCsize,
...)
}
bootsamps = do.call("cbind", bootsamps)
# these are the linear predictors
hats = t(bootsamps[-c(seq_len(degree)),])
# covariance of the linear predictors
cov.yhat = cov(hats)
bootsamps = bootsamps[seq_len(degree),, drop=FALSE]
seb <- apply(bootsamps, 1, sd)
covmat <- cov(t(bootsamps))
colnames(covmat) <- rownames(covmat) <- names(estb) <- c(paste0("psi", seq_len(nrow(bootsamps))))
tstat <- estb / seb
df <- nobs - length(attr(terms(f, data = data), "term.labels")) - 1 - degree # df based on obs - gcomp terms - msm terms
pval <- 2 - 2 * pt(abs(tstat), df = df)
pvalz <- 2 - 2 * pnorm(abs(tstat))
ci <- cbind(estb + seb * qnorm(alpha / 2), estb + seb * qnorm(1 - alpha / 2))
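# confidence intervals are normal-approximation intervals based on the bootstrap
# standard errors; t- or z-statistics are reported below depending on the outcome family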
# outcome 'weights' not applicable in this setting, generally (i.e. if using this function for non-linearity,
# then weights will vary with level of exposure)
if (!is.null(oldq)){
q = oldq
}
qx <- qdata[, expnms]
res <- list(
qx = qx, fit = msmfit$fit, msmfit = msmfit$msmfit,
psi = estb, var.psi = seb ^ 2, covmat.psi=covmat, ci = ci,
coef = estb, var.coef = seb ^ 2, covmat.coef=covmat, ci.coef = ci,
expnms=expnms, q=q, breaks=br, degree=degree,
pos.psi = NULL, neg.psi = NULL,
pos.weights = NULL, neg.weights = NULL, pos.size = NULL,neg.size = NULL, bootstrap=TRUE,
y.expected=msmfit$Ya, y.expectedmsm=msmfit$Yamsm, index=msmfit$A,
bootsamps = bootsamps,
cov.yhat=cov.yhat,
alpha=alpha,
call=origcall
)
if(msmfit$fit$family$family=='gaussian'){
res$tstat <- tstat
res$df <- df
res$pval <- pval
}
if(msmfit$fit$family$family %in% c('binomial', 'poisson')){
res$zstat <- tstat
res$pval <- pvalz
}
attr(res, "class") <- c("qgcompintfit", "qgcompfit")
res
}
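# non-bootstrap quantile g-computation with a suppressed intercept: psi is the sum
# of the exposure coefficients from a single GLM fit to the quantized data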
qgcomp.noboot.noint <- function(f,
data,
expnms=NULL,
q=4,
breaks=NULL,
id=NULL,
weights,
alpha=0.05,
bayes=FALSE,
...){
newform <- terms(f, data = data)
nobs = nrow(data)
origcall <- thecall <- match.call(expand.dots = FALSE)
names(thecall) <- gsub("f", "formula", names(thecall))
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L)
#m <- match(c("f", "data", "weights", "offset"), names(thecall), 0L)
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE)
thecall <- thecall[c(1L, m)]
thecall$drop.unused.levels <- TRUE
thecall[[1L]] <- quote(stats::model.frame)
thecalle <- eval(thecall, parent.frame())
if(hasweights){
data$weights <- as.vector(model.weights(thecalle))
} else data$weights = rep(1, nobs)
if (is.null(expnms)) {
#expnms <- attr(terms(f, data = data), "term.labels")
expnms <- attr(newform, "term.labels")
message("Including all model terms as exposures of interest\n")
}
lin = qgcomp:::checknames(expnms)
if(!lin) stop("Model appears to be non-linear: use qgcomp.boot instead")
if (!is.null(q) | !is.null(breaks)){
ql <- quantize(data, expnms, q, breaks)
qdata <- ql$data
br <- ql$breaks
} else{
qdata <- data
br <- breaks
}
if(is.null(id)) {
# not yet implemented
id = "id__"
qdata$id__ = seq_len(dim(qdata)[1])
}
if(!bayes) fit <- glm(newform, data = qdata,
weights=weights,
...)
if(bayes){
requireNamespace("arm")
fit <- bayesglm(newform, data = qdata,
weights=weights,
...)
}
#if(!bayes) fit <- glm(f, data = qdata[,!(names(qdata) %in% id), drop=FALSE],
# #weights=weights,
# ...)
#if(bayes){
# requireNamespace("arm")
# fit <- bayesglm(f, data = qdata[,!(names(qdata) %in% id), drop=FALSE],
# #weights=weights,
# ...)
#}
mod <- summary(fit)
if(length(setdiff(expnms, rownames(mod$coefficients)))>0){
stop("Model aliasing occurred, likely due to perfectly correlated quantized exposures.
Try one of the following:
1) set 'bayes' to TRUE in the qgcomp function (recommended)
2) set 'q' to a higher value in the qgcomp function (recommended)
3) check correlation matrix of exposures, and drop all but one variable in each highly correlated set (not recommended)
")
}
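# psi is the sum of the exposure coefficients; its standard error comes from
# qgcomp:::se_comb(), a delta-method SE for that sum computed from the scaled
# coefficient covariance matrix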
estb <- c(sum(mod$coefficients[expnms,1, drop=TRUE]))
seb <- c(qgcomp:::se_comb(expnms, covmat = mod$cov.scaled))
tstat <- estb / seb
df <- mod$df.null - length(expnms)
pval <- 2 - 2 * pt(abs(tstat), df = df)
pvalz <- 2 - 2 * pnorm(abs(tstat))
ci <- cbind(estb + seb * qnorm(alpha / 2), estb + seb * qnorm(1 - alpha / 2))
# 'weights'
wcoef <- fit$coefficients[expnms]
names(wcoef) <- gsub("_q", "", names(wcoef))
pos.coef <- which(wcoef > 0)
neg.coef <- which(wcoef <= 0)
pos.weights <- abs(wcoef[pos.coef]) / sum(abs(wcoef[pos.coef]))
neg.weights <- abs(wcoef[neg.coef]) / sum(abs(wcoef[neg.coef]))
# 'post-hoc' positive and negative estimators
# similar to constrained gWQS
pos.psi <- sum(wcoef[pos.coef])
neg.psi <- sum(wcoef[neg.coef])
#nmpos <- names(pos.weights)
#nmneg <- names(neg.weights)
#se.pos.psi <- se_comb(nmpos, covmat = mod$cov.scaled)
#se.neg.psi <- se_comb(nmneg, covmat = mod$cov.scaled)
qx <- qdata[, expnms]
names(qx) <- paste0(names(qx), "_q")
names(estb) <- c("psi1")
res <- list(
qx = qx, fit = fit,
psi = estb, var.psi = seb ^ 2, covmat.psi=c('psi1' = seb^2), ci = ci,
coef = estb, var.coef = seb ^ 2,
#covmat.coef=c('(Intercept)' = seb[1]^2, 'psi1' = seb[2]^2),
#covmat.coef=vc_comb(aname="(Intercept)", expnms=expnms, covmat = mod$cov.scaled),
covmat.coef=NULL,
ci.coef = ci,
expnms=expnms, q=q, breaks=br, degree=1,
pos.psi = pos.psi, neg.psi = neg.psi,
pos.weights = sort(pos.weights, decreasing = TRUE),
neg.weights = sort(neg.weights, decreasing = TRUE),
pos.size = sum(abs(wcoef[pos.coef])),
neg.size = sum(abs(wcoef[neg.coef])),
bootstrap=FALSE,
cov.yhat=NULL,
alpha=alpha,
call=origcall
)
if(fit$family$family=='gaussian'){
res$tstat <- tstat
res$df <- df
res$pval <- pval
}
if(fit$family$family %in% c('binomial', 'poisson')){
res$zstat <- tstat
res$pval <- pvalz
}
attr(res, "class") <- c("qgcompintfit", "qgcompfit")
res
}
#coef.qgcompintfit <- coef.qgcompfit
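# print method for "qgcompintfit" objects, adapted from qgcomp's print method so
# that all coefficients are labeled psi1, psi2, ... with no intercept row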
print.qgcompintfit <- function (x, showweights = TRUE, ...)
{
fam <- x$fit$family$family
if (inherits(x, "ziqgcompfit")) {
stop("not implemented")
}
if (!is.null(x$pos.size) & showweights) {
cat(paste0("Scaled effect size (positive direction, sum of positive coefficients = ",
signif(x$pos.size, 3), ")\n"))
if (length(x$pos.weights) > 0) {
print(x$pos.weights, digits = 3)
}
else cat("None\n")
cat("\n")
}
if (!is.null(x$neg.size) & showweights) {
cat(paste0("Scaled effect size (negative direction, sum of negative coefficients = ",
signif(-x$neg.size, 3), ")\n"))
if (length(x$neg.weights) > 0) {
print(x$neg.weights, digits = 3)
}
else cat("None\n")
cat("\n")
}
if (fam == "binomial") {
estimand <- "OR"
if (x$bootstrap && x$msmfit$family$link == "log")
estimand = "RR"
cat(paste0("Mixture log(", estimand, ")", ifelse(x$bootstrap,
" (bootstrap CI)", " (Delta method CI)"), ":\n\n"))
testtype = "Z"
rnm = c(c(paste0("psi", 1:max(1, length(coef(x))))))
}
if (fam == "poisson") {
estimand <- "RR"
cat(paste0("Mixture log(", estimand, ")", ifelse(x$bootstrap,
" (bootstrap CI)", " (Delta method CI)"), ":\n\n"))
testtype = "Z"
rnm = c(c(paste0("psi", 1:max(1, length(coef(x))))))
}
if (fam == "gaussian") {
cat(paste0("Mixture slope parameters", ifelse(x$bootstrap,
" (bootstrap CI)", " (Delta method CI)"), ":\n\n"))
testtype = "t"
x$zstat = x$tstat
rnm = c(c(paste0("psi", 1:max(1, length(coef(x))))))
}
if (fam == "cox") {
stop("not implemented")
}
if (!(fam %in% c("poisson", "binomial", "cox", "gaussian"))) {
warning(paste0("The ", fam, " distribution has not been tested with qgcomp! Please use with extreme caution\n and check results thoroughly with simulated data to ensure it works."))
}
plab = ifelse(testtype == "Z", "Pr(>|z|)", "Pr(>|t|)")
if (is.null(dim(x$ci.coef))) {
pdat <- cbind(Estimate = coef(x), `Std. Error` = sqrt(x$var.coef),
`Lower CI` = x$ci.coef[1], `Upper CI` = x$ci.coef[2],
test = x$zstat, pval = x$pval)
}
else {
pdat <- cbind(Estimate = coef(x), `Std. Error` = sqrt(x$var.coef),
`Lower CI` = x$ci.coef[, 1], `Upper CI` = x$ci.coef[,
2], test = x$zstat, pval = x$pval)
}
colnames(pdat)[which(colnames(pdat) == "test")] = eval(paste(testtype,
"value"))
colnames(pdat)[which(colnames(pdat) == "pval")] = eval(paste(plab))
rownames(pdat) <- rnm
printCoefmat(pdat, has.Pvalue = TRUE, tst.ind = 5L, signif.stars = FALSE,
cs.ind = 1L:2)
invisible(x)
}
# quick simulation examples
# the true intercept is not zero: the methods should give different answers, and suppressing the intercept yields bias
dat <- qgcomp:::.dgm_quantized(N=200, b0=0, coef=c(.1, .1))
dat$y = dat$y + 10
lm(y ~ x1 + x2, data = dat)
lm(y ~ -1 + x1 + x2, data = dat)
(qf <- qgcomp.noboot(y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4))
#(qfb <- qgcomp.noboot.noint(y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4)) # this will just report estimates without intercepts
(qfnoint <- qgcomp.noboot.noint(y ~ -1 + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4))
(qfbnoint <- qgcomp.boot.noint(y ~ -1 + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4))
# now the true intercept is zero: the answers will be similar, and suppressing the intercept will improve precision
set.seed(1232)
dat <- qgcomp:::.dgm_quantized(N=500, b0=0, coef=c(.1, .1))
lm(y ~ x1 + x2, data = dat)
lm(y ~ -1 + x1 + x2, data = dat)
(qf <- qgcomp.noboot(y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4))
(qfnoint <- qgcomp.noboot.noint(y ~ -1 + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4))
(qfbnoint <- qgcomp.boot.noint(y ~ -1 + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4))
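# a minimal sketch of pulling estimates out of the fitted objects; the field names
# below come from the `res` lists constructed above
qfnoint$psi   # mixture slope (sum of exposure coefficients) from the no-bootstrap fit
qfnoint$ci    # corresponding confidence interval
qfbnoint$psi  # MSM slope estimate(s) from the bootstrap fit
qfbnoint$ci   # normal-approximation bootstrap confidence interval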