mike-lawrence/main.r

## main.r
# load packages used ----
library(tidyverse)
library(broom)

options(dplyr.show_progress=TRUE)

# define a function to simulate an experiment ----
simulate_experiment = function(
	# simulation parameters ----
	  within_subject_noise = 1
	, between_subject_variability = 1
	, num_obs_per_subject = 10
	, num_subjects_per_group = 10
){
	# start with a tibble of subject true means ----
	tibble(
		subj_id = 1:(num_subjects_per_group*2)
		, subj_true_val = rnorm(
			num_subjects_per_group*2
			, 0
			, between_subject_variability
		)
	) %>%
		dplyr::group_by(subj_id) %>%
		#for each subject, generate observed measurements
		dplyr::summarise(
			trial_id = 1:num_obs_per_subject
			, value = rnorm(
				num_obs_per_subject
				, subj_true_val
				, within_subject_noise
			)
			, .groups = 'keep'
		) ->
		obs_data

	# prep for computing split-half reliability ----
	obs_data %>%
		#add split label
		dplyr::mutate(
			split = case_when(
				trial_id %% 2 == 1 ~ 'A'
				, trial_id %% 2 == 0 ~ 'B'
			)
		) %>%
		dplyr::group_by(
			subj_id
			, split
		) %>%
		#collapse to a mean
		dplyr::summarise(
			subj_split_obs_mean = mean(value)
			, .groups = 'drop'
		) %>%
		#spread
		tidyr::spread(
			key = split
			, value = subj_split_obs_mean
		) %>%
		#compute correlation
		dplyr::summarise(
			broom::tidy(cor.test(A,B))
		) ->
		cor_test_result

	#get the t-test result
	obs_data %>%
		#collapse to a mean
		dplyr::summarise(
			subj_obs_mean = mean(value)
			, .groups = 'drop'
		) %>%
		#add a group identifier
		dplyr::mutate(
			group = case_when(
				subj_id %% 2 == 1 ~ 'A'
				, subj_id %% 2 == 0 ~ 'B'
			)
			, group = factor(group)
		) %>%
		#now run a t-test
		dplyr::summarise(
			broom::tidy(t.test(subj_obs_mean~group))
		) ->
		t_test_result

	tibble(
		cor_test_result = list(cor_test_result)
		, t_test_result = list(t_test_result)
	) %>%
		return()
}

#run lots of simulated experiments with a variety of values for the experiment parameters
expand_grid(
	  iteration = 1:1e4 #10e4 gives stable estimates but takes a couple hours to run
	, within_subject_noise = c(.1,1,10)
	, between_subject_variability = 1
	, num_obs_per_subject = c(2,10,100)
	, num_subjects_per_group = c(2,10,100)
) %>%
	dplyr::group_by_all() %>%
	dplyr::summarise(
		simulate_experiment(
			  within_subject_noise
			, between_subject_variability
			, num_obs_per_subject
			, num_subjects_per_group
		)
		, .groups = 'keep'
	) ->
	all_experiment_results


all_experiment_results %>%
	# dplyr::ungroup() %>%
	# dplyr::slice(1) ->
	# .
	dplyr::group_by(
		 within_subject_noise
		, between_subject_variability
		, num_obs_per_subject
		, num_subjects_per_group
		, iteration
	) %>%
	dplyr::summarise(
		value = cor_test_result[[1]]$estimate
		, .groups = 'drop_last'
	) %>%
	dplyr::summarise(
		lo = quantile(value,.025)
		, hi = quantile(value,.975)
		, med = median(value)
		, .groups = 'drop'
	) %>%
	dplyr::mutate(
		num_subjects_per_group = factor(num_subjects_per_group)
	) %>%
	ggplot(
		mapping = aes(
			y = med
			, ymin = lo
			, ymax = hi
			, x = within_subject_noise
			, colour = num_subjects_per_group
			, shape = num_subjects_per_group
		)
	)+
	facet_wrap(
		~ num_obs_per_subject
		, labeller = label_both
	)+
	geom_point(alpha=.5)+
	geom_line(alpha=.5)+
	geom_errorbar(alpha=.5)


all_experiment_results %>%
	# dplyr::ungroup() %>%
	# dplyr::slice(1) ->
	# .
	dplyr::group_by(
		within_subject_noise
		, between_subject_variability
		, num_obs_per_subject
		, num_subjects_per_group
		, iteration
	) %>%
	dplyr::summarise(
		value = t_test_result[[1]]$p.value<.05
		, .groups = 'drop_last'
	) %>%
	dplyr::summarise(
		lo = broom::tidy(prop.test(sum(value),n()))$conf.low
		, hi = broom::tidy(prop.test(sum(value),n()))$conf.high
		, value = mean(value)
		, .groups = 'drop'
	) %>%
	dplyr::mutate(
		num_subjects_per_group = factor(num_subjects_per_group)
	) %>%
	ggplot(
		mapping = aes(
			y = value
			, ymin = lo
			, ymax = hi
			, x = within_subject_noise
			, colour = num_subjects_per_group
			, shape = num_subjects_per_group
		)
	)+
	facet_wrap(
		~ num_obs_per_subject
		, labeller = label_both
	)+
	geom_point(alpha=.5)+
	geom_line(alpha=.5)+
	geom_errorbar(alpha=.5)
	# load packages used ----
	library(tidyverse)
	library(broom)

	options(dplyr.show_progress=TRUE)

	# define a function to simulate an experiment ----
	simulate_experiment = function(
	# simulation parameters ----
	within_subject_noise = 1
	, between_subject_variability = 1
	, num_obs_per_subject = 10
	, num_subjects_per_group = 10
	){
	# start with a tibble of subject true means ----
	tibble(
	subj_id = 1:(num_subjects_per_group*2)
	, subj_true_val = rnorm(
	num_subjects_per_group*2
	, 0
	, between_subject_variability
	)
	) %>%
	dplyr::group_by(subj_id) %>%
	#for each subject, generate observed measurements
	dplyr::summarise(
	trial_id = 1:num_obs_per_subject
	, value = rnorm(
	num_obs_per_subject
	, subj_true_val
	, within_subject_noise
	)
	, .groups = 'keep'
	) ->
	obs_data

	# prep for computing split-half reliability ----
	obs_data %>%
	#add split label
	dplyr::mutate(
	split = case_when(
	trial_id %% 2 == 1 ~ 'A'
	, trial_id %% 2 == 0 ~ 'B'
	)
	) %>%
	dplyr::group_by(
	subj_id
	, split
	) %>%
	#collapse to a mean
	dplyr::summarise(
	subj_split_obs_mean = mean(value)
	, .groups = 'drop'
	) %>%
	#spread
	tidyr::spread(
	key = split
	, value = subj_split_obs_mean
	) %>%
	#compute correlation
	dplyr::summarise(
	broom::tidy(cor.test(A,B))
	) ->
	cor_test_result

	#get the t-test result
	obs_data %>%
	#collapse to a mean
	dplyr::summarise(
	subj_obs_mean = mean(value)
	, .groups = 'drop'
	) %>%
	#add a group identifier
	dplyr::mutate(
	group = case_when(
	subj_id %% 2 == 1 ~ 'A'
	, subj_id %% 2 == 0 ~ 'B'
	)
	, group = factor(group)
	) %>%
	#now run a t-test
	dplyr::summarise(
	broom::tidy(t.test(subj_obs_mean~group))
	) ->
	t_test_result

	tibble(
	cor_test_result = list(cor_test_result)
	, t_test_result = list(t_test_result)
	) %>%
	return()
	}

	#run lots of simulated experiments with a variety of values for the experiment parameters
	expand_grid(
	iteration = 1:1e4 #10e4 gives stable estimates but takes a couple hours to run
	, within_subject_noise = c(.1,1,10)
	, between_subject_variability = 1
	, num_obs_per_subject = c(2,10,100)
	, num_subjects_per_group = c(2,10,100)
	) %>%
	dplyr::group_by_all() %>%
	dplyr::summarise(
	simulate_experiment(
	within_subject_noise
	, between_subject_variability
	, num_obs_per_subject
	, num_subjects_per_group
	)
	, .groups = 'keep'
	) ->
	all_experiment_results


	all_experiment_results %>%
	# dplyr::ungroup() %>%
	# dplyr::slice(1) ->
	# .
	dplyr::group_by(
	within_subject_noise
	, between_subject_variability
	, num_obs_per_subject
	, num_subjects_per_group
	, iteration
	) %>%
	dplyr::summarise(
	value = cor_test_result[[1]]$estimate
	, .groups = 'drop_last'
	) %>%
	dplyr::summarise(
	lo = quantile(value,.025)
	, hi = quantile(value,.975)
	, med = median(value)
	, .groups = 'drop'
	) %>%
	dplyr::mutate(
	num_subjects_per_group = factor(num_subjects_per_group)
	) %>%
	ggplot(
	mapping = aes(
	y = med
	, ymin = lo
	, ymax = hi
	, x = within_subject_noise
	, colour = num_subjects_per_group
	, shape = num_subjects_per_group
	)
	)+
	facet_wrap(
	~ num_obs_per_subject
	, labeller = label_both
	)+
	geom_point(alpha=.5)+
	geom_line(alpha=.5)+
	geom_errorbar(alpha=.5)


	all_experiment_results %>%
	# dplyr::ungroup() %>%
	# dplyr::slice(1) ->
	# .
	dplyr::group_by(
	within_subject_noise
	, between_subject_variability
	, num_obs_per_subject
	, num_subjects_per_group
	, iteration
	) %>%
	dplyr::summarise(
	value = t_test_result[[1]]$p.value<.05
	, .groups = 'drop_last'
	) %>%
	dplyr::summarise(
	lo = broom::tidy(prop.test(sum(value),n()))$conf.low
	, hi = broom::tidy(prop.test(sum(value),n()))$conf.high
	, value = mean(value)
	, .groups = 'drop'
	) %>%
	dplyr::mutate(
	num_subjects_per_group = factor(num_subjects_per_group)
	) %>%
	ggplot(
	mapping = aes(
	y = value
	, ymin = lo
	, ymax = hi
	, x = within_subject_noise
	, colour = num_subjects_per_group
	, shape = num_subjects_per_group
	)
	)+
	facet_wrap(
	~ num_obs_per_subject
	, labeller = label_both
	)+
	geom_point(alpha=.5)+
	geom_line(alpha=.5)+
	geom_errorbar(alpha=.5)