Created
March 21, 2024 10:20
-
-
Save rubenarslan/70ae6296ee7208daf623ae19569c2c85 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## dumbed-down simulation of selection bias with noise (aka selecting on fielding of scale before fielding) | |
# set seed for reproducibility | |
set.seed(123) | |
# parameters | |
n_studies <- 100000 | |
noise_dist <- seq(0.01, 0.07, 0.01) | |
## draw distribution of true alpha values | |
## we use a beta distribution | |
# I just chose some parameters that seem realistic to me given that developers | |
# have subject matter expertise | |
true_alpha <- rbeta(n_studies, 4, 2, 5) | |
## we use this data frame to store simulation results | |
obs_count <- data.frame(row.names = noise_dist) | |
for (noise in noise_dist) { | |
# now, developers can observe alpha only wiht sampling error | |
pre_alpha <- sampling_noise(true_alpha, noise) | |
## they select on scales that meet alpha >= 0.7 | |
fielded <- ifelse(pre_alpha >= 0.7, true_alpha, NA) | |
noisy_sample <- data.frame(row.names = 1:n_studies) | |
# but they don't want to overfit, so they collect a fresh sample | |
noisy_sample[, as.character(noise)] <- sampling_noise(fielded, noise) | |
# this is the alpha that gets published | |
noisy_sample <- floor(noisy_sample * 100) / 100 | |
noisy_sample_dist <- table(noisy_sample) | |
alphas <- data.frame(alpha = noisy_sample[[as.character(noise)]]) | |
alphas$cutoff <- if_else(alphas$alpha == 0.7, "y", "n") | |
print(ggplot(alphas, aes(alpha, fill = cutoff, group = cutoff)) + geom_histogram(binwidth = 0.01) + ggtitle(paste("Noise ",noise))) | |
obs_count[as.character(noise), "lower"] <- noisy_sample_dist["0.69"] | |
obs_count[as.character(noise), "upper"] <- noisy_sample_dist["0.7"] | |
} | |
## plot differential .69-bin vs .70-bin as a function of noise | |
obs_count["noise"] <- noise_dist | |
obs_count["difference"] <- obs_count["upper"] - obs_count["lower"] | |
obs_count["difference"] <- obs_count["difference"] / n_studies | |
library(tidyverse) | |
obs_count %>% | |
ggplot(aes(noise, difference)) + | |
geom_point() + | |
geom_smooth() + | |
theme_minimal() + | |
xlab("SD Delta Post - Pre alpha") + | |
ylab("N Obs (.7, .71] - N Obs (.69, .7]") | |
ggsave("plots/difference_by_noise.png", width = 6, height = 4) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment