BioSciEconomist/ex EValues.R

## ex EValues.R
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex EValues.R
# | DATE: 1/6/22
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: explore using e-values for sensitifity to confounding in the context of OLS
# *----------------------------------------------------------------

# references:

# Mathur, M. B., Ding, P., Riddell, C. A., & VanderWeele, T. J. (2018). Web Site and R Package for Computing E-values.
# Epidemiology (Cambridge, Mass.), 29(5), e45–e47. https://doi.org/10.1097/EDE.0000000000000864

# https://cran.r-project.org/web/packages/EValue/vignettes/unmeasured-confounding.html
# https://cran.r-project.org/web/packages/EValue/EValue.pdf
# https://cdn1.sph.harvard.edu/wp-content/uploads/sites/603/2020/05/Evalue_Stata_Article.pdf
# https://biostat.duke.edu/sites/biostat.duke.edu/files/DukeBiostat2018.pdf
# https://www.degruyter.com/document/doi/10.1515/jci-2018-0007/html

options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation
library(dplyr)
library(EValue)

#
# example from: https://cran.r-project.org/web/packages/EValue/vignettes/unmeasured-confounding.html
#

# background
# Hammond and Horn estimated that cigarette smoking increased the risk of lung cancer by more than 10-fold.
# Fisher proposed that genetic confounding may explain all of the apparent association.
# We can calculate the magnitude of confounding that would be necessary to fully explain
# the estimated risk ratio of 10.73 (95% CI 8.02, 14.36):

evalues.RR(est = 10.73, lo = 8.02, hi = 14.36)

# The E-value of 20.95 tells us that a confounder, or set of confounders, such as that proposed by Fisher,
# would have to be associated with a 20-fold increase in the risk of lung cancer, and must be 20 times more
# prevalent in smokers than non-smokers, to explain the observed risk ratio. If the strength of one of these
# relationships were weaker, the other would have to be stronger for the causal effect of smoking on lung
# cancer to be truly null

#
# example OLS from: https://cran.r-project.org/web/packages/EValue/EValue.pdf
#

# note:

# This function is for linear regression with a continuous exposure and outcome.
# Regarding the con- tinuous exposure, the choice of delta defines essentially a
# dichotomization in the exposure between hypothetical groups of subjects with
# exposures equal to an arbitrary value c versus to another hypo- thetical group
# with exposures equal to c + delta. Regarding the continuous outcome, the function
# uses the effect-size conversions in Chinn (2000) and VanderWeele (2017) to approximately
# con- vert the mean difference between these exposure "groups" to the odds ratio that
# would arise from dichotomizing the continuous outcome.

# For example, if resulting E-value is 2, this means that unmeasured confounder(s)
# would need to double the probability of a subject’s having exposure equal to c + delta
# instead of c, and would also need to double the probability of being high versus low
# on the outcome, in which the cutoff for "high" versus "low" is arbitrary subject
# to some distributional assumptions (Chinn, 2000). A true standardized mean difference
# for linear regression would use sd = SD(Y | X, C), where Y is the outcome,
# X is the exposure of interest, and C are any adjusted covariates.
# See Examples for how to extract this from lm. A conservative approximation would
# instead use sd = SD(Y). Regardless, the reported E-value for the confidence
# interval treats sd as known, not estimated.

data ("lead") # load data from package
head(lead)

# first standardizing conservatively by SD(Y) data(lead)
summary(lm(age ~ income, data = lead))
ols = lm(age ~ income, data = lead)

# for a 1-unit increase in income
evalues.OLS(est = ols$coefficients[2], se = summary(ols)$coefficients['income', 'Std. Error'], sd = sd(lead$age))

# for a 0.5-unit increase in income
evalues.OLS(est = ols$coefficients[2], se = summary(ols)$coefficients['income', 'Std. Error'], sd = sd(lead$age),delta = 0.5)

# now use residual SD to avoid conservatism
evalues.OLS(est = ols$coefficients[2],
            se = summary(ols)$coefficients['income', 'Std. Error'], sd = summary(ols)$sigma)


#-----------------------------------
# example with lalond data
#----------------------------------

library(MatchIt)

data("lalonde") # load lalonde data

head(lalonde)

# first standardizing conservatively by SD(Y) data(lead)
summary(lm(re78 ~ treat, data = lalonde))
ols = lm(re78 ~ treat, data = lalonde)

# for a 1-unit change in 'treat' (assuming this is just the contrast between treatment and control for a binary treatment indicator)
evalues.OLS(est = ols$coefficients[2], se = summary(ols)$coefficients['treat', 'Std. Error'], sd = sd(lalonde$re78))

# OUTPUT
#          point  lower upper
# RR       0.9256 0.7914 1.082
# E-values 1.3752     NA 1.000

# Questions and Interpretation:

# ? RR = .9256 ~ converson of treatment effect from OLS (-635) to a RR per documentation

# E-value: with an observed RR = .9256, an unmeasured confounder assocaited with outcome (re78) and exposure (treat) would
# have to have a RR = 1.37 in order to explain away the estimate, but weaker confounding could not.

# ? given the relatively small RR (compared to the observed RR) you might conclude that confounding could plausibly explain
# away this relationship ?

#
# e-values with controls
#

# standardizing conservatively by SD(Y)
summary(lm(re78 ~  treat + age + educ + race + married + nodegree + re74 + re75, data = lalonde))
ols = lm(re78 ~ treat + age + educ + race + married + nodegree + re74 + re75, data = lalonde)

# for a 1-unit change in 'treat' (assuming this is just the contrast between treatment and control for a binary treatment indicator)
evalues.OLS(est = ols$coefficients[2], se = summary(ols)$coefficients['treat', 'Std. Error'], sd = sd(lalonde$re78))
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: ex EValues.R
	# \| DATE: 1/6/22
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: explore using e-values for sensitifity to confounding in the context of OLS
	# *----------------------------------------------------------------

	# references:

	# Mathur, M. B., Ding, P., Riddell, C. A., & VanderWeele, T. J. (2018). Web Site and R Package for Computing E-values.
	# Epidemiology (Cambridge, Mass.), 29(5), e45–e47. https://doi.org/10.1097/EDE.0000000000000864

	# https://cran.r-project.org/web/packages/EValue/vignettes/unmeasured-confounding.html
	# https://cran.r-project.org/web/packages/EValue/EValue.pdf
	# https://cdn1.sph.harvard.edu/wp-content/uploads/sites/603/2020/05/Evalue_Stata_Article.pdf
	# https://biostat.duke.edu/sites/biostat.duke.edu/files/DukeBiostat2018.pdf
	# https://www.degruyter.com/document/doi/10.1515/jci-2018-0007/html

	options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation
	library(dplyr)
	library(EValue)

	#
	# example from: https://cran.r-project.org/web/packages/EValue/vignettes/unmeasured-confounding.html
	#

	# background
	# Hammond and Horn estimated that cigarette smoking increased the risk of lung cancer by more than 10-fold.
	# Fisher proposed that genetic confounding may explain all of the apparent association.
	# We can calculate the magnitude of confounding that would be necessary to fully explain
	# the estimated risk ratio of 10.73 (95% CI 8.02, 14.36):

	evalues.RR(est = 10.73, lo = 8.02, hi = 14.36)

	# The E-value of 20.95 tells us that a confounder, or set of confounders, such as that proposed by Fisher,
	# would have to be associated with a 20-fold increase in the risk of lung cancer, and must be 20 times more
	# prevalent in smokers than non-smokers, to explain the observed risk ratio. If the strength of one of these
	# relationships were weaker, the other would have to be stronger for the causal effect of smoking on lung
	# cancer to be truly null

	#
	# example OLS from: https://cran.r-project.org/web/packages/EValue/EValue.pdf
	#

	# note:

	# This function is for linear regression with a continuous exposure and outcome.
	# Regarding the con- tinuous exposure, the choice of delta defines essentially a
	# dichotomization in the exposure between hypothetical groups of subjects with
	# exposures equal to an arbitrary value c versus to another hypo- thetical group
	# with exposures equal to c + delta. Regarding the continuous outcome, the function
	# uses the effect-size conversions in Chinn (2000) and VanderWeele (2017) to approximately
	# con- vert the mean difference between these exposure "groups" to the odds ratio that
	# would arise from dichotomizing the continuous outcome.

	# For example, if resulting E-value is 2, this means that unmeasured confounder(s)
	# would need to double the probability of a subject’s having exposure equal to c + delta
	# instead of c, and would also need to double the probability of being high versus low
	# on the outcome, in which the cutoff for "high" versus "low" is arbitrary subject
	# to some distributional assumptions (Chinn, 2000). A true standardized mean difference
	# for linear regression would use sd = SD(Y \| X, C), where Y is the outcome,
	# X is the exposure of interest, and C are any adjusted covariates.
	# See Examples for how to extract this from lm. A conservative approximation would
	# instead use sd = SD(Y). Regardless, the reported E-value for the confidence
	# interval treats sd as known, not estimated.

	data ("lead") # load data from package
	head(lead)

	# first standardizing conservatively by SD(Y) data(lead)
	summary(lm(age ~ income, data = lead))
	ols = lm(age ~ income, data = lead)

	# for a 1-unit increase in income
	evalues.OLS(est = ols$coefficients[2], se = summary(ols)$coefficients['income', 'Std. Error'], sd = sd(lead$age))

	# for a 0.5-unit increase in income
	evalues.OLS(est = ols$coefficients[2], se = summary(ols)$coefficients['income', 'Std. Error'], sd = sd(lead$age),delta = 0.5)

	# now use residual SD to avoid conservatism
	evalues.OLS(est = ols$coefficients[2],
	se = summary(ols)$coefficients['income', 'Std. Error'], sd = summary(ols)$sigma)


	#-----------------------------------
	# example with lalond data
	#----------------------------------

	library(MatchIt)

	data("lalonde") # load lalonde data

	head(lalonde)

	# first standardizing conservatively by SD(Y) data(lead)
	summary(lm(re78 ~ treat, data = lalonde))
	ols = lm(re78 ~ treat, data = lalonde)

	# for a 1-unit change in 'treat' (assuming this is just the contrast between treatment and control for a binary treatment indicator)
	evalues.OLS(est = ols$coefficients[2], se = summary(ols)$coefficients['treat', 'Std. Error'], sd = sd(lalonde$re78))

	# OUTPUT
	# point lower upper
	# RR 0.9256 0.7914 1.082
	# E-values 1.3752 NA 1.000

	# Questions and Interpretation:

	# ? RR = .9256 ~ converson of treatment effect from OLS (-635) to a RR per documentation

	# E-value: with an observed RR = .9256, an unmeasured confounder assocaited with outcome (re78) and exposure (treat) would
	# have to have a RR = 1.37 in order to explain away the estimate, but weaker confounding could not.

	# ? given the relatively small RR (compared to the observed RR) you might conclude that confounding could plausibly explain
	# away this relationship ?

	#
	# e-values with controls
	#

	# standardizing conservatively by SD(Y)
	summary(lm(re78 ~ treat + age + educ + race + married + nodegree + re74 + re75, data = lalonde))
	ols = lm(re78 ~ treat + age + educ + race + married + nodegree + re74 + re75, data = lalonde)

	# for a 1-unit change in 'treat' (assuming this is just the contrast between treatment and control for a binary treatment indicator)
	evalues.OLS(est = ols$coefficients[2], se = summary(ols)$coefficients['treat', 'Std. Error'], sd = sd(lalonde$re78))