mike-lawrence/2x2.r

## 2x2.r
library(tidyverse)
library(broom)
library(lme4)

#set default contrasts to halfsum (this matters!)
halfsum_contrasts = function (...) contr.sum(...) * 0.5
options(contrasts = c('halfsum_contrasts','contr.poly'))


run_sim = function(N,K,seed,do_checks=F){
	set.seed(seed)
	coefs = c(0,0,0,1)
	sds = c(3,3,3,3)
	noise = 4

	subj_coefs = matrix(
		rnorm(N*length(coefs),coefs,sds)
		, nrow = N
		, byrow = T
	)

	if(do_checks){
		#check that matrix came out right
		print(apply(subj_coefs,2,mean)) #should be close to coef
		print(apply(subj_coefs,2,sd)) #should be close to sds
		print(cor(subj_coefs)) #should be close to identity
	}

	#create the condition tibble and contrasts
	conditions = tidyr::expand_grid(
		a = factor(c('a_pre','a_post'))
		, b = factor(c('b_pre','b_post'))
	)
	contrasts = model.matrix(
		data = conditions
		, object = ~ a*b
	)

	#compute each subject's value-per-cell
	subj_vals = list()
	for(i in 1:N){
		subj_vals[[i]] = conditions
		subj_vals[[i]]$subj = i
		subj_vals[[i]]$value  = as.numeric(as.matrix(contrasts) %*% subj_coefs[i,])
	}
	subj_vals = dplyr::bind_rows(subj_vals)
	subj_vals$subj = factor(subj_vals$subj)

	if(do_checks){
		#compute difference between conditions
		# (this is why we use halfsum contrasts)
		subj_vals %>%
			dplyr::group_by(
				subj
			) %>%
			dplyr::summarise(
				intercept = mean(value)
				, ab = (
						mean(value[(a=='a_post')&(b=='b_post')]) -
						mean(value[(a=='a_pre')&(b=='b_post')])
					) - (
						mean(value[(a=='a_post')&(b=='b_pre')]) -
						mean(value[(a=='a_pre')&(b=='b_pre')])
					)
				, a = mean(value[a=='a_post']) - mean(value[a=='a_pre'])
				, b = mean(value[b=='b_post']) - mean(value[b=='b_pre'])
			) %>%
			tidyr::gather(
				key = key
				, value = value
				, -subj
			) %>%
			dplyr::group_by(
				key
			) %>%
			dplyr::summarise(
				mean = mean(value)
				, sd = sd(value)
			) #should roughly match coefs and sds
	}


	if(do_checks){
		#compute an lm per participant then check the mean/sd of resulting distributions
		subj_vals %>%
			dplyr::group_by(
				subj
			) %>%
			dplyr::summarise(
				broom::tidy(lm(value~a*b))
			) %>%
			dplyr::group_by(
				term
			) %>%
			dplyr::summarise(
				mean = mean(estimate)
				, sd = sd(estimate)
			) #should roughly match coefs and sds
	}

	#sample from each subject-by-condition mean with measurement noise
	subj_vals %>%
		dplyr::group_by(
			subj
			, a
			, b
		) %>%
		dplyr::summarise(
			value = rnorm(K,value,noise)
		) ->
		obs

	if(do_checks){
		#compute difference between conditions
		# (this is why we use halfsum contrasts)
		obs %>%
			dplyr::group_by(
				subj
			) %>%
			dplyr::summarise(
				intercept = mean(value)
				, ab = (
					mean(value[(a=='a_post')&(b=='b_post')]) -
					mean(value[(a=='a_pre')&(b=='b_post')])
				) - (
					mean(value[(a=='a_post')&(b=='b_pre')]) -
					mean(value[(a=='a_pre')&(b=='b_pre')])
				)
				, a = mean(value[a=='a_post']) - mean(value[a=='a_pre'])
				, b = mean(value[b=='b_post']) - mean(value[b=='b_pre'])
			) %>%
			tidyr::gather(
				key = key
				, value = value
				, -subj
			) %>%
			dplyr::group_by(
				key
			) %>%
			dplyr::summarise(
				mean = mean(value)
				, sd = sd(value)
			) #should roughly match coefs and sds
	}

	if(do_checks){
		#compute an lm per participant then check the mean/sd of resulting distributions
		obs %>%
			dplyr::group_by(
				subj
			) %>%
			dplyr::summarise(
				broom::tidy(lm(value~a*b))
			) %>%
			dplyr::group_by(
				term
			) %>%
			dplyr::summarise(
				mean = mean(estimate)
				, sd = sd(estimate)
			) #should roughly match coefs and sds
	}

	#fit and obtain intervals
	fit = lme4::lmer(
		formula = value ~ a*b + (a*b|subj)
		, data = obs
	)
	ci = confint(fit,method='Wald')

	if(do_checks){
		print(
			tibble(
				N = N
				, K = K
				, seed = seed
				, lo = ci[nrow(ci),1]
				, hi = ci[nrow(ci),2]
			)
		)
	}

	return(tibble(
		bound = c('lo','hi')
		, value = ci[nrow(ci),]
	))

}

#running once
run_sim(N=100,K=5,seed=3)

#running across a grid
tidyr::expand_grid(
	N = c(,20,40,80,160)
	, K = c(5,10,20)
	, iteration = 1:1e3
) %>%
	dplyr::mutate(
		seed=1:n()
	) %>%
	dplyr::group_by(
		N
		, K
		, iteration
	) %>%
	dplyr::summarize(
		run_sim(
			N = N
			, K = K
			, seed = seed
		)
	) ->
	out


out %>%
	tidyr::spread(
		key = bound
		, value = value
	) %>%
	dplyr::group_by(
		N
		, K
	) %>%
	dplyr::summarise(
		hi_less_than_zero = mean(hi<0)
		, hi_greater_than_zero = mean(hi>0)
		, lo_less_than_zero = mean(lo<0)
		, lo_greater_than_zero = mean(lo>0)
	)

## single_predictor.r
library(tidyverse)
library(broom)
library(lme4)

#set default contrasts to halfsum (this matters!)
halfsum_contrasts = function (...) contr.sum(...) * 0.5
options(contrasts = c('halfsum_contrasts','contr.poly'))


run_sim = function(N,K,seed,do_checks=F){
	set.seed(seed)
	coefs = c(0,1)
	sds = c(1,3)
	noise = 4

	subj_coefs = matrix(
		rnorm(N*length(coefs),coefs,sds)
		, nrow = N
		, byrow = T
	)

	if(do_checks){
		#check that matrix came out right
		print(apply(subj_coefs,2,mean)) #should be close to coef
		print(apply(subj_coefs,2,sd)) #should be close to sds
		print(cor(subj_coefs)) #should be close to identity
	}

	#create the condition tibble and contrasts
	conditions = tidyr::expand_grid(
		a = factor(c('a_pre','a_post'))
	)
	contrasts = model.matrix(
		data = conditions
		, object = ~ a
	)

	#compute each subject's value-per-cell
	subj_vals = list()
	for(i in 1:N){
		subj_vals[[i]] = conditions
		subj_vals[[i]]$subj = i
		subj_vals[[i]]$value  = as.numeric(as.matrix(contrasts) %*% subj_coefs[i,])
	}
	subj_vals = dplyr::bind_rows(subj_vals)
	subj_vals$subj = factor(subj_vals$subj)

	if(do_checks){
		#compute difference between conditions
		# (this is why we use halfsum contrasts)
		subj_vals %>%
			dplyr::group_by(
				subj
			) %>%
			dplyr::summarise(
				intercept = mean(value)
				, a = mean(value[a=='a_post']) - mean(value[a=='a_pre'])
			) %>%
			tidyr::gather(
				key = key
				, value = value
				, -subj
			) %>%
			dplyr::group_by(
				key
			) %>%
			dplyr::summarise(
				mean = mean(value)
				, sd = sd(value)
			) #should roughly match coefs and sds
	}


	if(do_checks){
		#compute an lm per participant then check the mean/sd of resulting distributions
		subj_vals %>%
			dplyr::group_by(
				subj
			) %>%
			dplyr::summarise(
				broom::tidy(lm(value~a))
			) %>%
			dplyr::group_by(
				term
			) %>%
			dplyr::summarise(
				mean = mean(estimate)
				, sd = sd(estimate)
			) #should roughly match coefs and sds
	}

	#sample from each subject-by-condition mean with measurement noise
	subj_vals %>%
		dplyr::group_by(
			subj
			, a
		) %>%
		dplyr::summarise(
			value = rnorm(K,value,noise)
		) ->
		obs

	if(do_checks){
		#compute difference between conditions
		# (this is why we use halfsum contrasts)
		obs %>%
			dplyr::group_by(
				subj
			) %>%
			dplyr::summarise(
				intercept = mean(value)
				, a = mean(value[a=='a_post']) - mean(value[a=='a_pre'])
			) %>%
			tidyr::gather(
				key = key
				, value = value
				, -subj
			) %>%
			dplyr::group_by(
				key
			) %>%
			dplyr::summarise(
				mean = mean(value)
				, sd = sd(value)
			) #should roughly match coefs and sds
	}

	if(do_checks){
		#compute an lm per participant then check the mean/sd of resulting distributions
		obs %>%
			dplyr::group_by(
				subj
			) %>%
			dplyr::summarise(
				broom::tidy(lm(value~a))
			) %>%
			dplyr::group_by(
				term
			) %>%
			dplyr::summarise(
				mean = mean(estimate)
				, sd = sd(estimate)
			) #should roughly match coefs and sds
	}

	#fit and obtain intervals
	fit = lme4::lmer(
		formula = value ~ a + (a|subj)
		, data = obs
	)
	ci = confint(fit,method='Wald')

	if(do_checks){
		print(
			tibble(
				N = N
				, K = K
				, seed = seed
				, lo = ci[nrow(ci),1]
				, hi = ci[nrow(ci),2]
			)
		)
	}

	return(tibble(
		bound = c('lo','hi')
		, value = ci[nrow(ci),]
	))

}

#running once
run_sim(N=100,K=5,seed=3)

#running across a grid
tidyr::expand_grid(
	N = c(20,40,80,160)
	, K = c(5,10,20)
	, iteration = 1:1e3
) %>%
	dplyr::mutate(
		seed=1:n()
	) %>%
	dplyr::group_by(
		N
		, K
		, iteration
	) %>%
	dplyr::summarize(
		run_sim(
			N = N
			, K = K
			, seed = seed
		)
	) ->
	out


out %>%
	tidyr::spread(
		key = bound
		, value = value
	) %>%
	dplyr::group_by(
		N
		, K
	) %>%
	dplyr::summarise(
		hi_less_than_zero = mean(hi<0)
		, hi_greater_than_zero = mean(hi>0)
		, lo_less_than_zero = mean(lo<0)
		, lo_greater_than_zero = mean(lo>0)
	)
	library(tidyverse)
	library(broom)
	library(lme4)

	#set default contrasts to halfsum (this matters!)
	halfsum_contrasts = function (...) contr.sum(...) * 0.5
	options(contrasts = c('halfsum_contrasts','contr.poly'))


	run_sim = function(N,K,seed,do_checks=F){
	set.seed(seed)
	coefs = c(0,0,0,1)
	sds = c(3,3,3,3)
	noise = 4

	subj_coefs = matrix(
	rnorm(N*length(coefs),coefs,sds)
	, nrow = N
	, byrow = T
	)

	if(do_checks){
	#check that matrix came out right
	print(apply(subj_coefs,2,mean)) #should be close to coef
	print(apply(subj_coefs,2,sd)) #should be close to sds
	print(cor(subj_coefs)) #should be close to identity
	}

	#create the condition tibble and contrasts
	conditions = tidyr::expand_grid(
	a = factor(c('a_pre','a_post'))
	, b = factor(c('b_pre','b_post'))
	)
	contrasts = model.matrix(
	data = conditions
	, object = ~ a*b
	)

	#compute each subject's value-per-cell
	subj_vals = list()
	for(i in 1:N){
	subj_vals[[i]] = conditions
	subj_vals[[i]]$subj = i
	subj_vals[[i]]$value = as.numeric(as.matrix(contrasts) %*% subj_coefs[i,])
	}
	subj_vals = dplyr::bind_rows(subj_vals)
	subj_vals$subj = factor(subj_vals$subj)

	if(do_checks){
	#compute difference between conditions
	# (this is why we use halfsum contrasts)
	subj_vals %>%
	dplyr::group_by(
	subj
	) %>%
	dplyr::summarise(
	intercept = mean(value)
	, ab = (
	mean(value[(a=='a_post')&(b=='b_post')]) -
	mean(value[(a=='a_pre')&(b=='b_post')])
	) - (
	mean(value[(a=='a_post')&(b=='b_pre')]) -
	mean(value[(a=='a_pre')&(b=='b_pre')])
	)
	, a = mean(value[a=='a_post']) - mean(value[a=='a_pre'])
	, b = mean(value[b=='b_post']) - mean(value[b=='b_pre'])
	) %>%
	tidyr::gather(
	key = key
	, value = value
	, -subj
	) %>%
	dplyr::group_by(
	key
	) %>%
	dplyr::summarise(
	mean = mean(value)
	, sd = sd(value)
	) #should roughly match coefs and sds
	}


	if(do_checks){
	#compute an lm per participant then check the mean/sd of resulting distributions
	subj_vals %>%
	dplyr::group_by(
	subj
	) %>%
	dplyr::summarise(
	broom::tidy(lm(value~a*b))
	) %>%
	dplyr::group_by(
	term
	) %>%
	dplyr::summarise(
	mean = mean(estimate)
	, sd = sd(estimate)
	) #should roughly match coefs and sds
	}

	#sample from each subject-by-condition mean with measurement noise
	subj_vals %>%
	dplyr::group_by(
	subj
	, a
	, b
	) %>%
	dplyr::summarise(
	value = rnorm(K,value,noise)
	) ->
	obs

	if(do_checks){
	#compute difference between conditions
	# (this is why we use halfsum contrasts)
	obs %>%
	dplyr::group_by(
	subj
	) %>%
	dplyr::summarise(
	intercept = mean(value)
	, ab = (
	mean(value[(a=='a_post')&(b=='b_post')]) -
	mean(value[(a=='a_pre')&(b=='b_post')])
	) - (
	mean(value[(a=='a_post')&(b=='b_pre')]) -
	mean(value[(a=='a_pre')&(b=='b_pre')])
	)
	, a = mean(value[a=='a_post']) - mean(value[a=='a_pre'])
	, b = mean(value[b=='b_post']) - mean(value[b=='b_pre'])
	) %>%
	tidyr::gather(
	key = key
	, value = value
	, -subj
	) %>%
	dplyr::group_by(
	key
	) %>%
	dplyr::summarise(
	mean = mean(value)
	, sd = sd(value)
	) #should roughly match coefs and sds
	}

	if(do_checks){
	#compute an lm per participant then check the mean/sd of resulting distributions
	obs %>%
	dplyr::group_by(
	subj
	) %>%
	dplyr::summarise(
	broom::tidy(lm(value~a*b))
	) %>%
	dplyr::group_by(
	term
	) %>%
	dplyr::summarise(
	mean = mean(estimate)
	, sd = sd(estimate)
	) #should roughly match coefs and sds
	}

	#fit and obtain intervals
	fit = lme4::lmer(
	formula = value ~ ab + (ab\|subj)
	, data = obs
	)
	ci = confint(fit,method='Wald')

	if(do_checks){
	print(
	tibble(
	N = N
	, K = K
	, seed = seed
	, lo = ci[nrow(ci),1]
	, hi = ci[nrow(ci),2]
	)
	)
	}

	return(tibble(
	bound = c('lo','hi')
	, value = ci[nrow(ci),]
	))

	}

	#running once
	run_sim(N=100,K=5,seed=3)

	#running across a grid
	tidyr::expand_grid(
	N = c(,20,40,80,160)
	, K = c(5,10,20)
	, iteration = 1:1e3
	) %>%
	dplyr::mutate(
	seed=1:n()
	) %>%
	dplyr::group_by(
	N
	, K
	, iteration
	) %>%
	dplyr::summarize(
	run_sim(
	N = N
	, K = K
	, seed = seed
	)
	) ->
	out


	out %>%
	tidyr::spread(
	key = bound
	, value = value
	) %>%
	dplyr::group_by(
	N
	, K
	) %>%
	dplyr::summarise(
	hi_less_than_zero = mean(hi<0)
	, hi_greater_than_zero = mean(hi>0)
	, lo_less_than_zero = mean(lo<0)
	, lo_greater_than_zero = mean(lo>0)
	)