Skip to content

Instantly share code, notes, and snippets.

@alexpkeil1
Last active October 26, 2022 15:10
Show Gist options
  • Save alexpkeil1/d0e98db16bbdf2f294b21ea2a00c2f7a to your computer and use it in GitHub Desktop.
Save alexpkeil1/d0e98db16bbdf2f294b21ea2a00c2f7a to your computer and use it in GitHub Desktop.
Comparing overall effect estimates of quantile g-computation and weighted quantile sum, demonstrating that weights from qgcomp need to be handled carefully when compared across simulations
######################################################################################################################
# Author: Alex Keil
# Program: weight_bias.R
# Language: R
# Date: Friday, February 21, 2020 at 3:18:41 PM
# Project: qgcomp
# Description: # addressing a conjecture that WQS will yield better estimates of weights than qgcomp
# when the assumptions of WQS hold in sample sizes of 500 in a scenario in which
# exposures are all uncorrelated
#
# Notes: Data generating mechanism thanks to Drew Day
# Keywords:
# Released under the GNU General Public License: http://www.gnu.org/copyleft/gpl.html
######################################################################################################################
# a special version of gwqs that doesn't give errors when run with lazily evaluated mix_name parameter in gwqs function
devtools::install_github("alexpkeil1/gWQS2b")
library(gWQS2b)
library(qgcomp)
library(future)
library(future.apply)
library(readr)
# simulations
# Let’s say you have a simulated dataset with 500 observations,
# 10 quintile-transformed mixture components with unidirectional
# true weights {0.15, 0.15, 0.15, 0.15, 0.15, 0.05, 0.05, 0.05, 0.05, 0.05}
# adding up to a cumulative mixture coefficient of 0.3 and 10 covariates,
# 5 with random unit normal deviates for coefficients and 5 with null coefficients.
# I give an uncorrelated structure to the mixture components and covariates, a model
# intercept term of 2, and an overall error term for the model that is randomly unit
# normally distributed
dgm_quantized <- function(
N = 100, # sample size
b0=2, # baseline expected outcome (model intercept)
# beta coefficients for X in the outcome model
coefX=c(0.15, 0.15, 0.15, 0.15, 0.15, 0.05, 0.05, 0.05, 0.05, 0.05)*0.3,
coefZ = c(rnorm(5), rep(0, 5)),
q = 5
){
# simulate under data structure where WQS/qgcomp is the truth:
# e.g. a multivariate exposure with multinomial distribution
# and an outcome that is a linear function of exposure scores
px = length(coefX)
pz = length(coefZ)
X = matrix(nrow=N, ncol=px)
Z = matrix(rnorm(N*pz), nrow=N, ncol=pz)
X = sapply(1:px, function(x) sample(rep(0:(q-1), length.out=N), N, replace=FALSE))
#X = vapply(1:px, function(x) sample(rep(0:(q-1), length.out=N), N, replace=FALSE))
mu <- X %*% coefX + Z %*% coefZ
y = rnorm(N) + mu
colnames(X) <- paste0("x", 1:px)
colnames(Z) <- paste0("z", 1:pz)
data.frame(Z,X,y)
}
# comparing weights, method 1
analysis = function(i, ..., nm="weightbias.csv"){
set.seed(i)
dat = dgm_quantized(...)
px = length(grep("x[0-9]", names(dat)))
exposures = paste0("x",1:px)
if(exists("wqsfit")) rm(list=c('wqsfit'))
if(exists("wqsfit.nosplit")) rm(list=c('wqsfit.nosplit'))
# note if you want to use the original version of gwqs version 2.0,
# you can just explicitly replace "exposures" with c("x1", "x2", ...) in the calls to gwqs
set.seed(i)
suppressMessages(qgcfit <- qgcomp.noboot(y ~ ., expnms=exposures, data = dat, q=NULL, family = "gaussian"))
set.seed(i)
try(suppressWarnings(wqsfit <- gWQS2b::gwqs(y ~ wqs + z1+z2+z3+z4+z5+z6+z7+z8+z9+z10, mix_name=exposures, q=NULL, data = dat, family = "gaussian")))
set.seed(i)
try(suppressWarnings(wqsfit.nosplit <- gWQS2b::gwqs(y ~ wqs + + z1+z2+z3+z4+z5+z6+z7+z8+z9+z10, mix_name=exposures, q=NULL, validation = 0.0, data = dat, family = "gaussian")))
out = c(
iter = i,
# overall estimates
est.qgc = as.numeric(qgcfit$psi),
# positive/negative psi from qgcomp (can be used to renormalize and compare weights)
posest.qgc = qgcfit$pos.psi,
negest.qgc = qgcfit$neg.psi,
# p values for overall effect
p.qgc = qgcfit$pval[2],
# weights
wts.qgc = c(qgcfit$pos.weights, -qgcfit$neg.weights)[exposures],
# beta coefficients (or w*psi for was)
coef.qgc = qgcfit$fit$coefficients[exposures]
)
# wqs can fail to produce an estimate, so this just fills in NAs when that's the case
if(exists("wqsfit")){
out = c(out,
est.wqs = as.numeric(wqsfit$fit$coefficients[2]),
p.wqs = summary(wqsfit$fit)$coefficients[2,4],
wts.wqs = wqsfit$final_weights[exposures,2],
coef.wqs = wqsfit$final_weights[exposures,2] * as.numeric(wqsfit$fit$coefficients[2])
)
} else out = c(out, rep(NA, px*2+2))
if(exists("wqsfit.nosplit")){
out = c(out,
est.wqs.nosplit = as.numeric(wqsfit.nosplit$fit$coefficients[2]),
p.wqs.nosplit = summary(wqsfit.nosplit$fit)$coefficients[2,4],
wts.wqs.nosplit = wqsfit.nosplit$final_weights[exposures,2],
coef.wqs.nosplit = wqsfit.nosplit$final_weights[exposures,2] * as.numeric(wqsfit.nosplit$fit$coefficients[2])
)
} else out = c(out, rep(NA, px*2+2))
write_csv(data.frame(t(out)), path = nm, append = TRUE)
out
}
summarize.sims <- function(res, truepsi, trueweights){
# keep only the iterations where wqs produced an estimate
res = t(t(res)[complete.cases(t(res)),])
truecoef = truepsi*trueweights
#indexes
psiindex = grep("^est", rownames(res))
#posindex = grep("^posest", rownames(res))
#negindex = grep("^negest", rownames(res))
pvalindex = grep("^p.", rownames(res))
wt_qgcindex = grep("^wts.qgc", rownames(res))
wt_wqsindex = grep("^wts.wqs[0-9]+", rownames(res))
wt_wqsnsindex = grep("^wts.wqs.nosplit", rownames(res))
coef_qgcindex = grep("^coef.qgc", rownames(res))
coef_wqsindex = grep("^coef.wqs[0-9]+", rownames(res))
coef_wqsnsindex = grep("^coef.wqs.nosplit", rownames(res))
# I wasn't sure how weights were being compared across runs in qgcomp, so I tried a couple
# of ways (there is no guarantee that the weights will be positive in finite samples in qgcomp)
# gqcomp weight method 0: just use the weights (wrong way, some are negative)
qgcc.w0 = abs(res[wt_qgcindex,])
# gqcomp weight method 1: assume zero if negative
# (better - gets at what the weights actually estimate, which is the proportion of the
# effect in the pre-specified direction)
qgcc.w1 = t(apply(res[wt_qgcindex,], 1, function(x) ifelse(x<0, 0, x)))
# gqcomp weight method 2: ? any other schemes to try ?
#wqs weights
wqs.wt = res[wt_wqsindex,]
wqs.wtns = res[wt_wqsnsindex,]
# coefficients
qgc.coef = res[coef_qgcindex,]
wqs.coef = res[coef_wqsindex,]
wqsns.coef = res[coef_wqsnsindex,]
list(
bias_psi = apply(res[psiindex,] - truepsi, 1, mean),
power = apply(res[pvalindex,]<0.05, 1, mean),
# bias: E(E_px(est - truth))
wt.bias = c(
qgc.wrong = mean(apply(qgcc.w0-trueweights, 2, mean)),
qgc.0 = mean(apply(qgcc.w1-trueweights, 2, mean)),
wqs = mean(apply(wqs.wt-trueweights, 2, mean)),
wqsns = mean(apply(wqs.wtns-trueweights, 2, mean))
),
# absolute bias: E(E_px(abs(est - truth)))
wt.absbias = c(
qgc.wrong = mean(apply(abs(qgcc.w0-trueweights), 2, mean)),
qgc.0 = mean(apply(abs(qgcc.w1-trueweights), 2, mean)),
wqs = mean(apply(abs(wqs.wt-trueweights), 2, mean)),
wqsns = mean(apply(abs(wqs.wtns-trueweights), 2, mean))
),
# squared bias, (mse under known variance): E(E_px((est - truth)^2))
wt.sqbias = c(
qgc.wrong = mean(apply(abs(qgcc.w0-trueweights)^2, 2, mean)),
qgc.0 = mean(apply(abs(qgcc.w1-trueweights)^2, 2, mean)),
wqs = mean(apply(abs(wqs.wt-trueweights)^2, 2, mean)),
wqsns = mean(apply(abs(wqs.wtns-trueweights)^2, 2, mean))
),
# bias: effect size
beta.bias = c(
qgc = mean(apply(qgc.coef-truecoef, 2, mean)),
wqs = mean(apply(wqs.coef-truecoef, 2, mean)),
wqsns = mean(apply(wqsns.coef-truecoef, 2, mean))
),
# absolute bias: effect size
beta.absbias = c(
qgc = mean(apply(abs(qgc.coef-truecoef), 2, mean)),
wqs = mean(apply(abs(wqs.coef-truecoef), 2, mean)),
wqsns = mean(apply(abs(wqsns.coef-truecoef), 2, mean))
),
# squared bias, (mse under known variance): E(E_px((est - truth)^2))
beta.sqbias = c(
qgc = mean(apply(abs(qgc.coef-truecoef)^2, 2, mean)),
wqs = mean(apply(abs(wqs.coef-truecoef)^2, 2, mean)),
wqsns = mean(apply(abs(wqsns.coef-truecoef)^2, 2, mean))
)
)
}
# original simulation model
trueweights = c(0.15, 0.15, 0.15, 0.15, 0.15, 0.05, 0.05, 0.05, 0.05, 0.05)
truepsi = 0.3
coefZ = c(rnorm(5), rep(0, 5)) # keep these fixed across simulations
# example data
dat <- dgm_quantized(
N = 100, # sample size
b0=2, # baseline expected outcome (model intercept)
coefX=trueweights*truepsi, # beta coefficients for exposures = weights*overall effect
coefZ = coefZ, # coefficients for covariates
q = 5 # quintiles
)
# there is a slight variation between my simulation and original (I think?)
# DD writes: "only the error term varying each repetition"
# whereas in my simulation the exposures and covariates also vary randomly according
# to the data generating mechanism. This renders the simulation less prone to
# simulation error
plan(multisession)
rr = NULL
set.seed(1232)
# perform simulation 500 times at sample sizes of 500
resnm = "weightbias.csv"# WQS is slow, so output results at each iteration to check progress
file.create(resnm)
res = future_sapply(1:500, analysis, N=500, coefX=trueweights*truepsi, coefZ=coefZ, nm=resnm)
# looking at bias in overall effect, weights, and individual effect sizes
print(summarize.sims(res, truepsi=truepsi, trueweights=trueweights))
# quantile g-computation is unbiased for the overall effect, whereas wqs is biased with and without sample splitting
# $bias_psi
# est.qgc est.wqs est.wqs.nosplit
# -0.006866737 -0.093233670 0.052307952
# quantile g-computation has higher power than vanilla wqs approach for the overall effect, but "no split" has highest power
# $power
# posest.qgc p.qgc p.wqs p.wqs.nosplit
# 0.000 0.822 0.484 0.992
# all methods of estimating the weights are approximately unbiased on average for the weights, with infinitesimal advantage for qgcomp
# $wt.bias
# qgc.wrong qgc.0 wqs wqsns
# 9.000000e-02 1.757662e-18 8.470955e-18 2.799159e-18
# for the weights, quantile g-computation has higher *absolute* bias (MAE) and *squared* bias (MSE) than either wqs approach for the overall effect
# $wt.absbias
# qgc.wrong qgc.0 wqs wqsns
# 0.12699143 0.06341925 0.06176512 0.05183476
# $wt.sqbias
# qgc.wrong qgc.0 wqs wqsns
# 0.054329817 0.006382865 0.006235672 0.004408613
# for the underlying beta coefficients, quantile g-computation has lower bias than either wqs approach for the overall effect
# $beta.bias
# qgc wqs wqsns
# -0.0006866737 -0.0093233670 0.0052307952
# for the underlying beta coefficients, quantile g-computation has higher *absolute* bias (MAE) and *squared* bias (MSE) than either wqs approach for the overall effect
# $beta.absbias
# qgc wqs wqsns
# 0.02548078 0.01941965 0.01875611
# $beta.sqbias
# qgc wqs wqsns
# 0.0010313698 0.0005854840 0.0006037571
@alexpkeil1
Copy link
Author

Note: the original program here had an error where WQS models were not adjusted for covariates. While the covariates are not confounders, in the linear model setting they will reduce some residual error in the estimates and lead to higher precision. This changed findings regarding the ranking of WQS methods and qgcomp methods for this data generating mechanism in terms of mean absolute bias and mean squared bias (but not bias).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment