Skip to content

Instantly share code, notes, and snippets.

@steveharoz
Last active November 13, 2023 03:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save steveharoz/3181de58d75689400ebb64cd373dcb45 to your computer and use it in GitHub Desktop.
Save steveharoz/3181de58d75689400ebb64cd373dcb45 to your computer and use it in GitHub Desktop.
cohen's d by replicate count
# Simulate different experiment designs and approaches to calculating Cohen's D
# References
# https://journalofcognition.org/articles/10.5334/joc.10 (h/t Aaron Caldwell @arcstats.bsky.social)
# https://jakewestfall.org/blog/index.php/2016/03/25/five-different-cohens-d-statistics-for-within-subject-designs/
library(tidyverse)
library(lmerTest)
library(multidplyr) # not necessary, but helps performance
library(ggtext) # markdown labels
library(glue)
# reproducible
set.seed(8675309)
BASELINE = 0
BASELINE_SD = 0.2
RESPONSE_SD = .5
DIFFERENCE = .1
translation = tribble(
~d_type, ~d_type_label,
"da_between", "Between-subject Cohen's d<sub>a</sub>",
"dr_between", "Between-subject Cohen's d<sub>r</sub>",
"dz", "Within-subject Cohen's d<sub>z</sub>",
"dr", "Within-subject Cohen's d<sub>r</sub>"
)
#### Simulations ####
# simulate one between-subject experiment
simulate_between = function(count_a = ceiling(10/2), count_b = ceiling(10/2), replicate_count = 1) {
count_total = count_a + count_b
# each subject does one condition [replicate_count] times
data = tibble(
subjectID = paste0("S", 1:count_total),
baseline = BASELINE + rnorm(count_total, 0, BASELINE_SD),
condition = c(rep("A", count_a), rep("B", count_b))
) %>%
expand_grid(rep = 1:replicate_count)
# simulate responses
data %>% mutate(response = rnorm(
n = n(),
mean = baseline + ifelse(condition=="A", 0, DIFFERENCE),
sd = RESPONSE_SD)
)
}
# simulate one within-subject experiment
simulate_within = function(count = 10, replicate_count = 1) {
# each subject does each condition [replicate_count] times
data = tibble(
subjectID = paste0("S", 1:count),
baseline = BASELINE + rnorm(count, 0, BASELINE_SD)
) %>%
expand_grid(
condition = c("A", "B"),
rep = 1:replicate_count
)
# simulate responses
data %>% mutate(response = rnorm(
n = n(),
mean = baseline + ifelse(condition=="A", 0, DIFFERENCE),
sd = RESPONSE_SD)
)
}
#### Cohen's d calculations ####
# d_a
# aggregate replicates, ignore within
cohens_d_subject_means = function(data) {
# collapse replicates to get mean per subject*condition
subject_means = data %>%
summarise(.by = c(subjectID, condition), response = mean(response))
numerator = subject_means %>%
summarise(.by = c(condition), response = mean(response)) %>%
pull(response)
numerator = sum(numerator * c(-1, 1))
denominator = subject_means %>%
summarise(.by = c(condition), variance = var(response)) %>%
pull(variance)
denominator = sqrt(mean(denominator))
numerator / denominator # mean diff over sd
}
# d_z
# aggregate replicates, use per-subject diffs
cohens_d_subject_diffs = function(data) {
# collapse replicates to get mean per subject*condition
subject_means = data %>%
summarise(.by = c(subjectID, condition), response = mean(response))
data = subject_means %>%
pivot_wider(names_from = condition, values_from = response) %>%
mutate(diff = B - A)
mean(data$diff) / sd(data$diff)
}
# d_r
# include residual variance in the denominator
cohens_d_residual_variance = function(data) {
if(data %>% count(subjectID) %>% pull(n) %>% max() < 2)
return(cohens_d_subject_means_between(data))
fit = lmer(response ~ condition + (1|subjectID), data) %>% summary()
# cohen's d from fit
denominator = fit$varcor %>% as_tibble() %>% pull(vcov) %>% sum() %>% sqrt()
numerator = data %>% group_by(condition) %>% summarise(response = mean(response)) %>% pull(response)
numerator = sum(numerator * c(-1, 1))
numerator / denominator # mean diff over sd
}
# d_a (for between-subject)
# aggregate replicates
cohens_d_subject_means_between = function(data) {
# collapse replicates to get mean per subject*condition
subject_means = data %>%
summarise(.by = c(subjectID, condition), response = mean(response))
numerator = subject_means %>% summarise(.by = condition, mu=mean(response)) %>% pull(mu)
denominator = sqrt( subject_means %>% summarise(.by = condition, var=var(response)) %>% pull(var) %>% mean() )
sum(numerator * c(-1, 1)) / denominator
}
#### plot ###########
plot_aggregated = function(data) {
# collapse replicates to get mean per subject*condition
subject_means = data %>%
summarise(.by = c(subjectID, condition), response = mean(response))
subject_means = subject_means %>% left_join(
subject_means %>% summarize(.by=subjectID, baseline = mean(response))
) %>%
mutate(response_relative = response - baseline)
ggplot(subject_means) +
aes(x=condition, y=response_relative, group=subjectID) +
geom_point(size = 4) +
geom_line()
}
plot_aggregated_between = function(data) {
# collapse replicates to get mean per subject*condition
subject_means = data %>%
summarise(.by = c(subjectID, condition), response = mean(response))
subject_means = subject_means %>%
mutate(.by = condition, response_relative = response-mean(response))
ggplot(subject_means) +
aes(x=condition, y=response_relative) +
geom_point(size = 4)
}
#### test ####
(simulate_within(10, 2) %>% plot_aggregated()) +
(simulate_within(10, 20) %>% plot_aggregated()) +
(simulate_within(10, 200) %>% plot_aggregated())
simulate_between() %>% plot_aggregated_between()
simulate_between(10, 10, 1000) %>% cohens_d_subject_means_between()
simulate_within(10, 30) %>% plot_aggregated()
simulate_within(10, 30) %>% cohens_d_residual_variance()
simulate_within(10, 30) %>% cohens_d_subject_means_between()
#### simulate ####
run_simulations = function(subject_count, replicates) {
within_data = simulate_within(subject_count, replicates)
between_data = simulate_between(subject_count/2, subject_count/2, replicates)
list(
dz = within_data %>% cohens_d_subject_diffs(),
dr = within_data %>% cohens_d_residual_variance(),
#da_within = within_data %>% cohens_d_subject_means_between(),
da_between = between_data %>% cohens_d_subject_means_between(),
dr_between = between_data %>% cohens_d_residual_variance()
)
}
cluster = new_cluster(parallel::detectCores() - 1)
cluster_library(cluster, c("dplyr", "tidyr", "magrittr", "lmerTest"))
cluster_copy(cluster, c(
"BASELINE", "BASELINE_SD", "RESPONSE_SD", "DIFFERENCE",
"simulate_between", "simulate_within",
"cohens_d_subject_diffs", "cohens_d_residual_variance", "cohens_d_subject_means_between",
"run_simulations"))
SIMULATION_COUNT = 5000
data2 = expand_grid(
index = 1:SIMULATION_COUNT,
subject_count = c(10, 40),
replicates = 5^(0:4)
) %>%
rowwise() %>%
partition(cluster) %>% # improves performance, but not necessary
mutate(ds = list(run_simulations(subject_count, replicates))) %>%
collect() %>% # improves performance, but not necessary
ungroup() %>%
unnest_longer(ds, "d", "d_type")
# save it
#data %>% write_csv("~/statistics/cohens_d_simulations.csv")
beepr::beep(5)
#### plot ####
# confidence intervals
quantile_df <- function(x, probs = c(0.25, 0.5, 0.75)) {
tibble(
val = quantile(x, probs, na.rm = TRUE),
quant = probs
)
}
data %>%
reframe(.by = c(replicates, d_type, subject_count), quantile_df(d, c(0.025, 0.5, 0.975))) %>%
pivot_wider(names_from = quant, values_from = val) %>%
mutate(d_type = factor(d_type, levels=c("dz", "dr", "da_between", "dr_between"), labels=translation$d_type_label)) %>%
ggplot() +
aes(xmin = `0.025`, x = `0.5`, xmax = `0.975`, color = d_type, linetype = factor(subject_count),
y = as.numeric(factor(replicates)) - 0.15*scale(as.numeric(subject_count))) +
geom_vline(xintercept = 0) +
geom_pointrange(linewidth = 1, size = .5, key_glyph = draw_key_linerange) +
scale_x_continuous(breaks = -1:12, minor_breaks = FALSE) +
scale_y_continuous( minor_breaks = FALSE,
breaks = data$replicates %>% unique() %>% seq_along(),
labels = data$replicates %>% unique() %>% sort()) +
scale_color_brewer(palette = "Set1") +
guides(color = "none") +
facet_wrap(facets = vars(d_type), ncol = 1) +
theme_minimal(12) + theme(
strip.text = ggtext::element_markdown(),
legend.text = ggtext::element_markdown(),
) +
labs(x = "Cohen's d (95% CI)", y = "Number of replicate trials")
# dot histogram
library(ggdist)
translation %>%
expand_grid(subject_count = data$subject_count %>% unique()) %>%
mutate(design = glue("{dtype} (N = {subject_count})"))
p = data %>%
left_join(
translation %>%
expand_grid(subject_count = data$subject_count %>% unique()) %>%
mutate(design = glue("{d_type_label} (N = {subject_count})")) %>%
mutate(design = factor(design, levels = design))
) %>%
mutate(d_type = factor(d_type, levels=translation$d_type, labels=translation$d_type_label)) %>%
#filter(index < 50) %>%
ggplot() +
aes(x = d, y = factor(replicates), fill = d_type) +
#geom_vline(xintercept = DIFFERENCE/sqrt(RESPONSE_SD^2+BASELINE_SD^2), color = "grey92") +
geom_vline(xintercept = 0, color = "grey30") +
ggdist::geom_dots(color = NA, slab_linewidth = 0) +
scale_x_continuous(breaks=seq(-12,12,by=2), limits=c(-3, 5), expand=c(0,0), minor_breaks=FALSE) +
scale_fill_manual(values = c("#267CBF", "#335E7F", "#E41A1C", "#993D3E")) +
guides(fill = "none") +
facet_wrap(facets = vars(design), ncol=2, scales = "free_x") +
theme_minimal(12) + theme(
axis.title = element_text(hjust = 0, size = rel(0.8), color = "gray30"),
plot.caption = ggtext::element_markdown(color = "gray50", lineheight = 1.2, margin = margin(20,0,0,0)),
plot.subtitle = ggtext::element_markdown(size = 10, lineheight = 1.2),
strip.text = ggtext::element_markdown(hjust = 0, size = rel(1), margin = margin(20,0,0,0)),
plot.margin = margin(20, 20, 20, 20, "pt"),
panel.spacing.x = unit(10, "pt")) +
labs(x = "Cohen's d 🠚", y = "Replicate trial count 🠚",
title = "Within/between-subject, replicate trial count, and Cohen's d variant impact effect size and consistency",
subtitle = glue(
"Each scenario was simulated {SIMULATION_COUNT} times: <br>
• 2 conditions (A and B) and a continuous response<br>
• Individual differences in baseline: 𝑁(0, 0.2) <br>
• Response: subject_baseline + 0.1*(condition == B) + 𝑁(0, 0.5)"),
caption =
'Cohen\'s d definitions from "Five Different Cohen\'s d Statistics for Within-subject Designs" by Jake Westfall<br>
Simulation and chart by Steve Haroz')
# • Subjects: 10 <br>
ragg::agg_png("~/cohens d and replicate counts subj.png", 7200, 6000, scaling = 9)
p
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment