m-Py/covariate_regression.R

## covariate_regression.R


## This document illustrates that type 1 sum of squares lead to increased alpha
## error rates when a predictive covariate is included in the regression model.


# Estimate p-value for treatment (null) effect via linear regression,
# including a covariate that is predictive of the outcome
#
# param N: sample size, default 100
# param beta_covariate: regression weight for a covariate on an outcome, default 0.5
# param coding: how is the treatment coded, defaults to 0/1
# param AOV: sum of squares method; if `FALSE`, the p-value is just estimated via `summary()`
#    (= default); if 1, the p-value is estimated via `anova()`; if 2 or 3, the p-value
#    is estimated using type 2 or type 3 sum of squares using `car::Anova()`
# return: the p-value associated with the "treatment"
#
# Details:
# Data is generated via a regression model, predicting the outcome from a covariate
# `outcome = beta_covariate * rnorm(N) + error`
# [`+ treatment * 0`, i.e. a null effect of treatment, is implicit]
#
covariate_regression <- function(
  N = 100,
  beta_covariate = 0.5,
  coding = c(0, 1),
  AOV = FALSE
) {
  # Simulate covariate and outcome data
  data <- data.frame(covariate = rnorm(N))
  error <- rnorm(N)
  data$outcome <- beta_covariate * data$covariate + error
  # Insert treatment variable that has no effect
  data$treatment <- sample(rep_len(coding, N))
  # do regression to test for treatment effect
  model <- lm(outcome ~ treatment * covariate, data = data)
  get_p_value(model, "treatment", AOV)
}

get_p_value <- function(model, effect, AOV = 0) {
  stopifnot(AOV %in% 0:3)
  if (AOV == 0) {
    tab <- summary(model)$coefficients
    # extract p-value
    return(tab[rownames(tab) == effect, "Pr(>|t|)"])
  } else if (AOV == 1) {
    tab <- anova(model)
    return(tab[rownames(tab) == effect, "Pr(>F)"])
  }
  tab <- car::Anova(model, type = AOV)
  tab[rownames(tab) == effect, "Pr(>F)"]
}


# Standard regression, using summary to print p-value:
mean(replicate(10000, covariate_regression()) <= .05)
#> 0.0513

# Use `anova()` to print p-value:
mean(replicate(10000, covariate_regression(AOV = 1)) <= .05)
#> 0.0828

# Coding scheme does not seem to matter:
mean(replicate(10000, covariate_regression(AOV = 1, coding = -1:1)) <= .05)
#> 0.0793

# No problem if covariate is not predictive of outcome:
mean(replicate(10000, covariate_regression(AOV = 1, beta_covariate = 0)) <= .05)
#> 0.05

# No problem for type 2 or type 3 sum of squares
mean(replicate(10000, covariate_regression(AOV = 2)) <= .05)
#> 0.0484

mean(replicate(10000, covariate_regression(AOV = 3)) <= .05)
#> 0.0531

### Problem disappears when the covariate comes first in the regression model!
# lm(outcome ~ covariate * treatment, data = data)


	## This document illustrates that type 1 sum of squares lead to increased alpha
	## error rates when a predictive covariate is included in the regression model.


	# Estimate p-value for treatment (null) effect via linear regression,
	# including a covariate that is predictive of the outcome
	#
	# param N: sample size, default 100
	# param beta_covariate: regression weight for a covariate on an outcome, default 0.5
	# param coding: how is the treatment coded, defaults to 0/1
	# param AOV: sum of squares method; if `FALSE`, the p-value is just estimated via `summary()`
	# (= default); if 1, the p-value is estimated via `anova()`; if 2 or 3, the p-value
	# is estimated using type 2 or type 3 sum of squares using `car::Anova()`
	# return: the p-value associated with the "treatment"
	#
	# Details:
	# Data is generated via a regression model, predicting the outcome from a covariate
	# `outcome = beta_covariate * rnorm(N) + error`
	# [`+ treatment * 0`, i.e. a null effect of treatment, is implicit]
	#
	covariate_regression <- function(
	N = 100,
	beta_covariate = 0.5,
	coding = c(0, 1),
	AOV = FALSE
	) {
	# Simulate covariate and outcome data
	data <- data.frame(covariate = rnorm(N))
	error <- rnorm(N)
	data$outcome <- beta_covariate * data$covariate + error
	# Insert treatment variable that has no effect
	data$treatment <- sample(rep_len(coding, N))
	# do regression to test for treatment effect
	model <- lm(outcome ~ treatment * covariate, data = data)
	get_p_value(model, "treatment", AOV)
	}

	get_p_value <- function(model, effect, AOV = 0) {
	stopifnot(AOV %in% 0:3)
	if (AOV == 0) {
	tab <- summary(model)$coefficients
	# extract p-value
	return(tab[rownames(tab) == effect, "Pr(>\|t\|)"])
	} else if (AOV == 1) {
	tab <- anova(model)
	return(tab[rownames(tab) == effect, "Pr(>F)"])
	}
	tab <- car::Anova(model, type = AOV)
	tab[rownames(tab) == effect, "Pr(>F)"]
	}


	# Standard regression, using summary to print p-value:
	mean(replicate(10000, covariate_regression()) <= .05)
	#> 0.0513

	# Use `anova()` to print p-value:
	mean(replicate(10000, covariate_regression(AOV = 1)) <= .05)
	#> 0.0828

	# Coding scheme does not seem to matter:
	mean(replicate(10000, covariate_regression(AOV = 1, coding = -1:1)) <= .05)
	#> 0.0793

	# No problem if covariate is not predictive of outcome:
	mean(replicate(10000, covariate_regression(AOV = 1, beta_covariate = 0)) <= .05)
	#> 0.05

	# No problem for type 2 or type 3 sum of squares
	mean(replicate(10000, covariate_regression(AOV = 2)) <= .05)
	#> 0.0484

	mean(replicate(10000, covariate_regression(AOV = 3)) <= .05)
	#> 0.0531

	### Problem disappears when the covariate comes first in the regression model!
	# lm(outcome ~ covariate * treatment, data = data)