TCornulier/PowerGLM.md

## PowerGLM.md

      
    Raw
  

              PowerGLM.md
            
          
    GLM power simulation template

06/02/2021
Assume


two categorical predictors X1 and X2 (2 and 3 levels respectively)
their interaction
one continuous predictor Z (confounder) with range [-1, 1]
Linear predictor ~ X1*X2 + Z
balanced design for categorical predictors
Poisson response
inference focusses on significance of interaction

Parameter values


Intercept = 3
X1.B - X1.A = 2
X2.b - X2.a = -1.5
X2.c - X2.a = 0.5
Z = 2
X1B:X2b - X1A:X2a = -0.7
X1B:X2c - X1A:X2a = -0.3
type 1 error = 0.05


# assumed coef values (must correspond to columns of model matrix X)
# adjust as needed to reflect expected effect sizes
# (could be made random within the simulation loop
# to introduce uncertainty in effect sizes)
coef.values<- c(3, 2, -1.5, 0.5, 2, -0.7, -0.3)
alpha<- 0.05
Functions to simulate data

This is a basic implementation with the model hard-written in the
function. You will need to adjust the function to your model or data
structure.
dat.sim.pois<- function(Nrep, coef.val){
    # create all factor combinations
    factor.comb<- expand.grid(X1= LETTERS[1:2], X2= letters[1:3])
    # now, combine factor.comb with variable Z, 
    # and let R automatically recycle it Nrep times 
    # to achieve total sample size (given by the length of Z)
    dat<- data.frame(factor.comb, Z= runif(nrow(factor.comb)*Nrep, -1, 1))
    # create the predictor matrix corresponding to the desired model
    X<- model.matrix( ~ X1*X2 + Z, data= dat)
    # compute linear predictor
    LP<- X %*% coef.val
    # simulate Poisson observations
    Y<- rpois(n= length(LP), lambda= exp(LP))
    # and use as Poisson mean:
    output<- data.frame(Y, dat)
    output
}

# dat.sim.norm<- function(N, coef.val, sd.resp){
#   fac.comb<- expand.grid(X1= LETTERS[1:2], X2= letters[1:3])
#   dat<- data.frame(fac.comb, Z= runif(nrow(fac.comb)*N, -1, 1))
#   X<- model.matrix( ~ X1*X2 + Z, data= dat)
#   Y<- rnorm(n= nrow(X), mean= X %*% coef.val, sd= sd.resp)
#   dat<- data.frame(Y, X)
# }
Set up loops with 1000 simulations for each replication level

N.rep<- 2:30 # replication level per (X1, X2) combination
N.sim<- 1000 # the larger the more precise the simulation
sim.out<- matrix(NA, N.sim, length(N.rep))

# set a seed for reproducibility (optional)
set.seed(7)

# set a progress bar
pb<- txtProgressBar(min = 0, max = N.sim, width = 50, style = 3)

# run the simulations
for(i in 1:N.sim){
    for(j in 1:length(N.rep)){
        sim.dat<- dat.sim.pois(N= N.rep[j], coef.val= coef.values)
        sim.GLM<- glm(Y ~ X1*X2 + Z, data= sim.dat)
        sim.out[i, j]<- anova(sim.GLM, test= "Chisq")["X1:X2", "Pr(>Chi)"]
    }
    setTxtProgressBar(pb, value= i)
}
close(pb)
Plot output

# compute proportion significant per replication level:
out.pow<- apply(sim.out < alpha, 2, mean) 

plot(out.pow ~ N.rep, xlab= "Replication", ylab= "Proportion significant interactions", type= "l", lwd= 3, col= 2, ylim= c(0, 1))
abline(h= 0.8, lty= 3, col= grey(0.5), lwd= 2)

# what is the minimum replication achieving >= 0.8 power?
min.ok<- min(which(out.pow > 0.8))
N.rep[min.ok]
## [1] 10

abline(v= N.rep[min.ok])


10 or more replicates per factor combination yield a power over 80%


## PowerGLM.rmd
---
title: "GLM power simulation template"
date: "`r format(Sys.time(), '%d/%m/%Y')`"
editor_options:
  chunk_output_type: console
output: github_document
---

## Assume
* two categorical predictors X1 and X2 (2 and 3 levels respectively)
* their interaction
* one continuous predictor Z (confounder) with range [-1, 1]
* Linear predictor ~ X1*X2 + Z
* balanced design for categorical predictors
* Poisson response
* inference focusses on significance of interaction

## Parameter values
* Intercept = 3
* X1.B - X1.A = 2
* X2.b - X2.a = -1.5
* X2.c - X2.a = 0.5
* Z = 2
* X1B:X2b - X1A:X2a = -0.7
* X1B:X2c - X1A:X2a = -0.3
* type 1 error = 0.05

```{r}
# assumed coef values (must correspond to columns of model matrix X)
# adjust as needed to reflect expected effect sizes
# (could be made random within the simulation loop
# to introduce uncertainty in effect sizes)
coef.values<- c(3, 2, -1.5, 0.5, 2, -0.7, -0.3)
alpha<- 0.05

```

## Functions to simulate data

This is a basic implementation with the model hard-written in the function. You will need to adjust the function to your model or data structure.

```{r}
dat.sim.pois<- function(Nrep, coef.val){
	# create all factor combinations
	factor.comb<- expand.grid(X1= LETTERS[1:2], X2= letters[1:3])
	# now, combine factor.comb with variable Z,
	# and let R automatically recycle it Nrep times
	# to achieve total sample size (given by the length of Z)
	dat<- data.frame(factor.comb, Z= runif(nrow(factor.comb)*Nrep, -1, 1))
	# create the predictor matrix corresponding to the desired model
	X<- model.matrix( ~ X1*X2 + Z, data= dat)
	# compute linear predictor
	LP<- X %*% coef.val
	# simulate Poisson observations
	Y<- rpois(n= length(LP), lambda= exp(LP))
	# and use as Poisson mean:
	output<- data.frame(Y, dat)
	output
}

# dat.sim.norm<- function(N, coef.val, sd.resp){
# 	fac.comb<- expand.grid(X1= LETTERS[1:2], X2= letters[1:3])
# 	dat<- data.frame(fac.comb, Z= runif(nrow(fac.comb)*N, -1, 1))
# 	X<- model.matrix( ~ X1*X2 + Z, data= dat)
# 	Y<- rnorm(n= nrow(X), mean= X %*% coef.val, sd= sd.resp)
# 	dat<- data.frame(Y, X)
# }

```


## Set up loops with 1000 simulations for each replication level

```{r, results= F}
N.rep<- 2:30 # replication level per (X1, X2) combination
N.sim<- 1000 # the larger the more precise the simulation
sim.out<- matrix(NA, N.sim, length(N.rep))

# set a seed for reproducibility (optional)
set.seed(7)

# set a progress bar
pb<- txtProgressBar(min = 0, max = N.sim, width = 50, style = 3)

# run the simulations
for(i in 1:N.sim){
	for(j in 1:length(N.rep)){
		sim.dat<- dat.sim.pois(N= N.rep[j], coef.val= coef.values)
		sim.GLM<- glm(Y ~ X1*X2 + Z, data= sim.dat)
		sim.out[i, j]<- anova(sim.GLM, test= "Chisq")["X1:X2", "Pr(>Chi)"]
	}
	setTxtProgressBar(pb, value= i)
}
close(pb)

```

## Plot output

```{r}
# compute proportion significant per replication level:
out.pow<- apply(sim.out < alpha, 2, mean)

plot(out.pow ~ N.rep, xlab= "Replication", ylab= "Proportion significant interactions", type= "l", lwd= 3, col= 2, ylim= c(0, 1))
abline(h= 0.8, lty= 3, col= grey(0.5), lwd= 2)

# what is the minimum replication achieving >= 0.8 power?
min.ok<- min(which(out.pow > 0.8))
N.rep[min.ok]

abline(v= N.rep[min.ok])

```

> `r N.rep[min.ok]` or more replicates per factor combination yield a power over 80%

## unnamed-chunk-4-1.png

      
    Raw
  

              unnamed-chunk-4-1.png
	---
	title: "GLM power simulation template"
	date: "`r format(Sys.time(), '%d/%m/%Y')`"
	editor_options:
	chunk_output_type: console
	output: github_document
	---

	## Assume
	* two categorical predictors X1 and X2 (2 and 3 levels respectively)
	* their interaction
	* one continuous predictor Z (confounder) with range [-1, 1]
	* Linear predictor ~ X1*X2 + Z
	* balanced design for categorical predictors
	* Poisson response
	* inference focusses on significance of interaction

	## Parameter values
	* Intercept = 3
	* X1.B - X1.A = 2
	* X2.b - X2.a = -1.5
	* X2.c - X2.a = 0.5
	* Z = 2
	* X1B:X2b - X1A:X2a = -0.7
	* X1B:X2c - X1A:X2a = -0.3
	* type 1 error = 0.05

	```{r}
	# assumed coef values (must correspond to columns of model matrix X)
	# adjust as needed to reflect expected effect sizes
	# (could be made random within the simulation loop
	# to introduce uncertainty in effect sizes)
	coef.values<- c(3, 2, -1.5, 0.5, 2, -0.7, -0.3)
	alpha<- 0.05

	```

	## Functions to simulate data

	This is a basic implementation with the model hard-written in the function. You will need to adjust the function to your model or data structure.

	```{r}
	dat.sim.pois<- function(Nrep, coef.val){
	# create all factor combinations
	factor.comb<- expand.grid(X1= LETTERS[1:2], X2= letters[1:3])
	# now, combine factor.comb with variable Z,
	# and let R automatically recycle it Nrep times
	# to achieve total sample size (given by the length of Z)
	dat<- data.frame(factor.comb, Z= runif(nrow(factor.comb)*Nrep, -1, 1))
	# create the predictor matrix corresponding to the desired model
	X<- model.matrix( ~ X1*X2 + Z, data= dat)
	# compute linear predictor
	LP<- X %*% coef.val
	# simulate Poisson observations
	Y<- rpois(n= length(LP), lambda= exp(LP))
	# and use as Poisson mean:
	output<- data.frame(Y, dat)
	output
	}

	# dat.sim.norm<- function(N, coef.val, sd.resp){
	# fac.comb<- expand.grid(X1= LETTERS[1:2], X2= letters[1:3])
	# dat<- data.frame(fac.comb, Z= runif(nrow(fac.comb)*N, -1, 1))
	# X<- model.matrix( ~ X1*X2 + Z, data= dat)
	# Y<- rnorm(n= nrow(X), mean= X %*% coef.val, sd= sd.resp)
	# dat<- data.frame(Y, X)
	# }

	```


	## Set up loops with 1000 simulations for each replication level

	```{r, results= F}
	N.rep<- 2:30 # replication level per (X1, X2) combination
	N.sim<- 1000 # the larger the more precise the simulation
	sim.out<- matrix(NA, N.sim, length(N.rep))

	# set a seed for reproducibility (optional)
	set.seed(7)

	# set a progress bar
	pb<- txtProgressBar(min = 0, max = N.sim, width = 50, style = 3)

	# run the simulations
	for(i in 1:N.sim){
	for(j in 1:length(N.rep)){
	sim.dat<- dat.sim.pois(N= N.rep[j], coef.val= coef.values)
	sim.GLM<- glm(Y ~ X1*X2 + Z, data= sim.dat)
	sim.out[i, j]<- anova(sim.GLM, test= "Chisq")["X1:X2", "Pr(>Chi)"]
	}
	setTxtProgressBar(pb, value= i)
	}
	close(pb)

	```

	## Plot output

	```{r}
	# compute proportion significant per replication level:
	out.pow<- apply(sim.out < alpha, 2, mean)

	plot(out.pow ~ N.rep, xlab= "Replication", ylab= "Proportion significant interactions", type= "l", lwd= 3, col= 2, ylim= c(0, 1))
	abline(h= 0.8, lty= 3, col= grey(0.5), lwd= 2)

	# what is the minimum replication achieving >= 0.8 power?
	min.ok<- min(which(out.pow > 0.8))
	N.rep[min.ok]

	abline(v= N.rep[min.ok])

	```

	> `r N.rep[min.ok]` or more replicates per factor combination yield a power over 80%