# Example in which weighted quantile sum (WQS) regression estimates a significant, positive mixture effect
# due purely to confounding, even though no exposure has a positive effect on the outcome
library(qgcomp)
library(gWQS)
dgm <- function(N = 100, b0 = 0, coef = c(1, 0, 0, 0), family = gaussian(),
                ncor = 0, corr = 0.75, umc = FALSE){
  # simulate under a data structure where WQS is the truth: e.g. exposures that are
  # noisy copies of a single 4-level (multinomial) master exposure
  p = length(coef)
  if(ncor >= p) ncor = p - 1
  X = matrix(nrow = N, ncol = p)
  colnames(X) <- paste0("x", 1:p)
  # master exposure: balanced 4-level scores (0-3)
  xmaster = sample(rep(0:3, length.out = N), N, replace = FALSE)
  for(k in 1:p){
    # the first (ncor + 1) exposures are correlated with the master exposure;
    # the remaining exposures are independent permutations of it
    newx = numeric(N)
    c1 = as.logical(rbinom(N, 1, sqrt(corr)))
    newx[which(c1)] = xmaster[which(c1)]
    newx[which(!c1)] = sample(xmaster[which(!c1)])
    if(k <= (ncor + 1)){
      X[, k] = newx
    } else X[, k] = sample(xmaster)
  }
  # optional unmeasured confounder, correlated with x1
  if(!umc) C = 0
  if(umc){
    newx = numeric(N)
    c1 = as.logical(rbinom(N, 1, sqrt(corr)))
    newx[which(c1)] = X[, 1][which(c1)]
    newx[which(!c1)] = sample(X[, 1][which(!c1)])
    C = newx
  }
  # linear predictor and outcome
  py = b0 + X %*% coef + C * 0.5
  if(family$family == 'binomial') y = rbinom(N, 1, 1/(1 + exp(-py)))
  if(family$family == 'gaussian') y = rnorm(N) + py
  list(dat = data.frame(X, y), family = family, C = C)
}
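# quick sanity check of dgm() (illustrative only; `chk` is a throwaway object):
# the exposures are 4-level scores (0-3) built from a shared master variable and y
# is continuous; set.seed() below resets the RNG, so this draw does not change the
# results that follow
chk = dgm(N = 20, coef = c(0, 1, 0, 0, 0), ncor = 1, corr = 0.5)
str(chk$dat)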
set.seed(123)
# simulate so there is a single causal exposure with a negative effect (x2, after the
# reversal below) that is correlated with a single non-causal exposure (x1)
dat = dgm(N = 500, coef = c(0, 1, 0, 0, 0), ncor = 1, corr = 0.5, family = gaussian())
dat$dat$x2 = 3 - dat$dat$x2 # a trick to reverse the direction of x2's effect (y was generated before the flip)
cor(dat$dat) # x2 is negatively correlated with x1 and negatively correlated with y, which induces a positive bivariate correlation between x1 and y
#            x1        x2           x3          x4          x5            y
# x1  1.0000000 -0.478400 -0.075200000 -0.02080000  0.01760000  0.339131386
# x2 -0.4784000  1.000000  0.016000000  0.02880000  0.06080000 -0.730573991
# x3 -0.0752000  0.016000  1.000000000  0.05920000 -0.02080000 -0.002838093
# x4 -0.0208000  0.028800  0.059200000  1.00000000 -0.03680000 -0.013095436
# x5  0.0176000  0.060800 -0.020800000 -0.03680000  1.00000000  0.013773360
# y   0.3391314 -0.730574 -0.002838093 -0.01309544  0.01377336  1.000000000
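# (illustrative check) the confounding in action: a bivariate regression of y on x1
# alone should yield a positive coefficient, even though x1 has no effect on y
# conditional on x2 -- x1 is just a noisy copy of the (reversed) causal exposure x2
lm(y ~ x1, data = dat$dat)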
# analyze these data with a linear model, quantile g-computation, and weighted quantile sum regression
# the linear model gets it right: x2's coefficient is close to its true value of -1,
# and no other coefficient suggests an appreciable positive effect
lm(y ~ x1 + x2 + x3 + x4 + x5, data = dat$dat)
# Coefficients:
# (Intercept)         x1         x2         x3         x4         x5
#     2.92968   -0.04449   -0.97942    0.03914   -0.05154    0.08200
# qgcomp gets it right: the estimated overall mixture effect (psi1) is negative
qgcomp.noboot(y ~ ., data = dat$dat, family = dat$family)
#             Estimate Std. Error Lower CI Upper CI t value  Pr(>|z|)
# (Intercept)  2.92968    0.16978   2.5969  3.26245  17.256 < 2.2e-16
# psi1        -0.95430    0.10901  -1.1680 -0.74064  -8.754 < 2.2e-16
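# qgcomp also reports direction-specific weights; a quick sketch, assuming the fitted
# object stores them in $pos.weights and $neg.weights as described in the qgcomp
# documentation -- the negative weight should fall almost entirely on x2
qcfit = qgcomp.noboot(y ~ ., data = dat$dat, family = dat$family)
qcfit$neg.weights
qcfit$pos.weights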
# WQS conjures a positive effect out of nowhere: by default the index is constrained to
# act in the positive direction, so the weight shifts onto the non-causal exposures
wqsresults = gwqs(y ~ wqs, mix_name = c("x1", "x2", "x3", "x4", "x5"), data = dat$dat,
                  plan_strategy = "multiprocess", validation = 0.6, q = NULL,
                  family = "gaussian", seed = NULL, plots = FALSE)
summary(wqsresults$fit)
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)
# (Intercept)   0.6791     0.2061   3.295   0.0011 **
# wqs           0.5648     0.1292   4.371 1.71e-05 ***
wqsresults$final_weights
#    mix_name mean_weight
# x1       x1   0.4817969
# x3       x3   0.2544034
# x5       x5   0.1608151
# x4       x4   0.1029847
# x2       x2   0.0000000
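# for contrast, a sketch assuming gWQS's b1_pos argument constrains the direction of
# the index as documented: forcing the index to be negative should recover a negative
# mixture effect driven mainly by x2 (the remaining arguments mirror the call above)
wqsneg = gwqs(y ~ wqs, mix_name = c("x1", "x2", "x3", "x4", "x5"), data = dat$dat,
              plan_strategy = "multiprocess", validation = 0.6, q = NULL,
              family = "gaussian", seed = NULL, b1_pos = FALSE, plots = FALSE)
summary(wqsneg$fit)
wqsneg$final_weights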