Created
June 9, 2025 13:41
-
-
Save Koalha/d0063156323d71703b09b82e2c0b3067 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ## Berkson's paradox or collider bias in surveys | |
| library(tidyverse) | |
| library(ggdist) | |
| library(ggview) | |
| library(dagitty) | |
| library(ggdag) | |
| theme_set(theme_minimal(base_family = "Karla")) | |
| ## A dag of the scenario | |
| dag_scen <- 'dag { | |
| bb="-0.5,-0.5,0.5,0.5" | |
| I1 [pos="-0.206,-0.143"] | |
| I2 [pos="-0.206,-0.092"] | |
| O1 [pos="-0.250,-0.143"] | |
| O2 [pos="-0.250,-0.092"] | |
| S [pos="-0.175,-0.116"] | |
| I1 -> S | |
| I2 -> S | |
| O1 -> I1 | |
| O2 -> I2 | |
| } | |
| ' | |
| dag_scen <- dag_scen |> dagitty() |> tidy_dagitty() | |
| labs <- tibble( | |
| x = mean(dag_scen$data$x) - 0.02, | |
| y = mean(dag_scen$data$y), | |
| xend = mean(dag_scen$data$xend), | |
| yend = mean(dag_scen$data$yend), | |
| label = "O = Opinion about topic\nI = Interest in topic\nS = Selection into sample" | |
| ) | |
| dag_scen |> | |
| ggdag() + | |
| theme_dag_blank(base_family = "Karla") + | |
| geom_label(aes(label = label), data = labs, hjust = 0) + | |
| canvas(width = 136, height = 80, units = "mm") | |
| ## Set up simulation | |
| ## Infinite superpopulation | |
| ## Invitations sent out to N people | |
| ## Attrition is a function of interest in the topics of the survey | |
| ## Assuming perfect measurement of true opinions | |
| sim0 <- function(N, topic_effect) { | |
| ## Opinions are uncorrelated in the population | |
| answer_topic_1 <- rnorm(N) | |
| answer_topic_2 <- rnorm(N) | |
| ## Interest in topic is correlated with opinions | |
| interest_topic_1 <- answer_topic_1 * topic_effect + rnorm(N) | |
| interest_topic_2 <- answer_topic_2 * topic_effect + rnorm(N) | |
| # cor(interest_topic_1, answer_topic_1) | |
| # cor(interest_topic_2, answer_topic_2) | |
| ## Probability of being in the sample (answering the survey) is a function of interest in the two | |
| ## topics. | |
| p_in_sample <- pnorm((interest_topic_1 + interest_topic_2)) | |
| in_sample <- rbinom(N, 1, p_in_sample) |> as.logical() | |
| # cor(data.frame(in_sample, interest_topic_1, interest_topic_2)) | |
| # plot(data.frame(p_in_sample, interest_topic_1, interest_topic_2)) | |
| ## Opinions are uncorrelated in the population but negatively correlated in the sample | |
| # cor(answer_topic_1, answer_topic_2) | |
| # cor(answer_topic_1[in_sample], answer_topic_2[in_sample]) | |
| # plot(answer_topic_1[in_sample], answer_topic_2[in_sample]) | |
| out <- data.frame(answer_topic_1, answer_topic_2, in_sample) | |
| out | |
| } | |
| sim <- function(N, topic_effect) { | |
| out <- sim0(N, topic_effect) | |
| cor_out <- cor(out[out$in_sample, 1:2])[1, 2] | |
| n <- sum(out$in_sample) | |
| c(cor_out, n) | |
| } | |
| ## Replicate 1000 times | |
| set.seed(1984) | |
| cors <- replicate(n = 10000, sim(N = 1000, topic_effect = 1), simplify = TRUE) | |
| ## Plot results | |
| cors <- t(cors) |> as_tibble() | |
| names(cors) <- c("Correlation", "Sample size") | |
| ## Histogram of Correlations | |
| cors |> | |
| ggplot(aes(Correlation)) + | |
| geom_vline(xintercept = 0, linetype = "dashed") + | |
| stat_halfeye(fill = "hotpink") + | |
| labs( | |
| title = "Correlation between opinions in the sample", | |
| subtitle = "Point is the mean, line shows 66 % and 95 % intervals\n10000 simulations, true correlation = 0", | |
| x = NULL, | |
| y = NULL | |
| ) + | |
| theme( | |
| panel.grid.major.y = element_blank(), | |
| panel.grid.minor.y = element_blank(), | |
| axis.text.y = element_blank() | |
| ) + | |
| canvas(width = 136, height = 80, units = "mm") | |
| # A more accurate dissection of one simulation | |
| set.seed(2025) | |
| s1 <- sim0(N = 1000, topic_effect = 1) | |
| s1 |> summarize(corr = cor(answer_topic_1, answer_topic_2)) | |
| s1 |> summarize(corr = cor(answer_topic_1, answer_topic_2), .by = in_sample) | |
| s1 |> | |
| ggplot(aes(answer_topic_1, answer_topic_2)) + | |
| geom_point(aes(colour = in_sample), alpha = 0.4) + | |
| stat_smooth(method = "lm", color = "black", level = 0) + | |
| stat_smooth( | |
| aes(color = in_sample), | |
| method = "lm", | |
| data = s1 |> filter(in_sample), | |
| level = 0, | |
| show.legend = FALSE | |
| ) + | |
| scale_color_manual( | |
| values = c("peachpuff", "salmon"), | |
| guide = guide_legend(), | |
| labels = c("Not in sample", "In sample") | |
| ) + | |
| coord_fixed() + | |
| labs( | |
| y = "Opinion on topic 2", | |
| x = "Opinion on topic 1", | |
| title = "Selection process causes opinions to correlate negatively in the sample" |> | |
| str_wrap(50), | |
| color = NULL | |
| ) + | |
| theme(legend.position = "bottom") + | |
| canvas(width = 136, height = 160, units = "mm") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment