Skip to content

Instantly share code, notes, and snippets.

@rubenarslan
Created March 21, 2024 10:20
Show Gist options
  • Save rubenarslan/70ae6296ee7208daf623ae19569c2c85 to your computer and use it in GitHub Desktop.
Save rubenarslan/70ae6296ee7208daf623ae19569c2c85 to your computer and use it in GitHub Desktop.
## dumbed-down simulation of selection bias with noise (aka selecting on fielding of scale before fielding)
# set seed for reproducibility
set.seed(123)
# parameters
n_studies <- 100000
noise_dist <- seq(0.01, 0.07, 0.01)
## draw distribution of true alpha values
## we use a beta distribution
# I just chose some parameters that seem realistic to me given that developers
# have subject matter expertise
true_alpha <- rbeta(n_studies, 4, 2, 5)
## we use this data frame to store simulation results
obs_count <- data.frame(row.names = noise_dist)
for (noise in noise_dist) {
# now, developers can observe alpha only wiht sampling error
pre_alpha <- sampling_noise(true_alpha, noise)
## they select on scales that meet alpha >= 0.7
fielded <- ifelse(pre_alpha >= 0.7, true_alpha, NA)
noisy_sample <- data.frame(row.names = 1:n_studies)
# but they don't want to overfit, so they collect a fresh sample
noisy_sample[, as.character(noise)] <- sampling_noise(fielded, noise)
# this is the alpha that gets published
noisy_sample <- floor(noisy_sample * 100) / 100
noisy_sample_dist <- table(noisy_sample)
alphas <- data.frame(alpha = noisy_sample[[as.character(noise)]])
alphas$cutoff <- if_else(alphas$alpha == 0.7, "y", "n")
print(ggplot(alphas, aes(alpha, fill = cutoff, group = cutoff)) + geom_histogram(binwidth = 0.01) + ggtitle(paste("Noise ",noise)))
obs_count[as.character(noise), "lower"] <- noisy_sample_dist["0.69"]
obs_count[as.character(noise), "upper"] <- noisy_sample_dist["0.7"]
}
## plot differential .69-bin vs .70-bin as a function of noise
obs_count["noise"] <- noise_dist
obs_count["difference"] <- obs_count["upper"] - obs_count["lower"]
obs_count["difference"] <- obs_count["difference"] / n_studies
library(tidyverse)
obs_count %>%
ggplot(aes(noise, difference)) +
geom_point() +
geom_smooth() +
theme_minimal() +
xlab("SD Delta Post - Pre alpha") +
ylab("N Obs (.7, .71] - N Obs (.69, .7]")
ggsave("plots/difference_by_noise.png", width = 6, height = 4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment