Skip to content

Instantly share code, notes, and snippets.

@Koalha
Created June 9, 2025 13:41
Show Gist options
  • Select an option

  • Save Koalha/d0063156323d71703b09b82e2c0b3067 to your computer and use it in GitHub Desktop.

Select an option

Save Koalha/d0063156323d71703b09b82e2c0b3067 to your computer and use it in GitHub Desktop.
## Berkson's paradox or collider bias in surveys
library(tidyverse)
library(ggdist)
library(ggview)
library(dagitty)
library(ggdag)
theme_set(theme_minimal(base_family = "Karla"))
## A dag of the scenario
dag_scen <- 'dag {
bb="-0.5,-0.5,0.5,0.5"
I1 [pos="-0.206,-0.143"]
I2 [pos="-0.206,-0.092"]
O1 [pos="-0.250,-0.143"]
O2 [pos="-0.250,-0.092"]
S [pos="-0.175,-0.116"]
I1 -> S
I2 -> S
O1 -> I1
O2 -> I2
}
'
dag_scen <- dag_scen |> dagitty() |> tidy_dagitty()
labs <- tibble(
x = mean(dag_scen$data$x) - 0.02,
y = mean(dag_scen$data$y),
xend = mean(dag_scen$data$xend),
yend = mean(dag_scen$data$yend),
label = "O = Opinion about topic\nI = Interest in topic\nS = Selection into sample"
)
dag_scen |>
ggdag() +
theme_dag_blank(base_family = "Karla") +
geom_label(aes(label = label), data = labs, hjust = 0) +
canvas(width = 136, height = 80, units = "mm")
## Set up simulation
## Infinite superpopulation
## Invitations sent out to N people
## Attrition is a function of interest in the topics of the survey
## Assuming perfect measurement of true opinions
sim0 <- function(N, topic_effect) {
## Opinions are uncorrelated in the population
answer_topic_1 <- rnorm(N)
answer_topic_2 <- rnorm(N)
## Interest in topic is correlated with opinions
interest_topic_1 <- answer_topic_1 * topic_effect + rnorm(N)
interest_topic_2 <- answer_topic_2 * topic_effect + rnorm(N)
# cor(interest_topic_1, answer_topic_1)
# cor(interest_topic_2, answer_topic_2)
## Probability of being in the sample (answering the survey) is a function of interest in the two
## topics.
p_in_sample <- pnorm((interest_topic_1 + interest_topic_2))
in_sample <- rbinom(N, 1, p_in_sample) |> as.logical()
# cor(data.frame(in_sample, interest_topic_1, interest_topic_2))
# plot(data.frame(p_in_sample, interest_topic_1, interest_topic_2))
## Opinions are uncorrelated in the population but negatively correlated in the sample
# cor(answer_topic_1, answer_topic_2)
# cor(answer_topic_1[in_sample], answer_topic_2[in_sample])
# plot(answer_topic_1[in_sample], answer_topic_2[in_sample])
out <- data.frame(answer_topic_1, answer_topic_2, in_sample)
out
}
sim <- function(N, topic_effect) {
out <- sim0(N, topic_effect)
cor_out <- cor(out[out$in_sample, 1:2])[1, 2]
n <- sum(out$in_sample)
c(cor_out, n)
}
## Replicate 1000 times
set.seed(1984)
cors <- replicate(n = 10000, sim(N = 1000, topic_effect = 1), simplify = TRUE)
## Plot results
cors <- t(cors) |> as_tibble()
names(cors) <- c("Correlation", "Sample size")
## Histogram of Correlations
cors |>
ggplot(aes(Correlation)) +
geom_vline(xintercept = 0, linetype = "dashed") +
stat_halfeye(fill = "hotpink") +
labs(
title = "Correlation between opinions in the sample",
subtitle = "Point is the mean, line shows 66 % and 95 % intervals\n10000 simulations, true correlation = 0",
x = NULL,
y = NULL
) +
theme(
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
axis.text.y = element_blank()
) +
canvas(width = 136, height = 80, units = "mm")
# A more accurate dissection of one simulation
set.seed(2025)
s1 <- sim0(N = 1000, topic_effect = 1)
s1 |> summarize(corr = cor(answer_topic_1, answer_topic_2))
s1 |> summarize(corr = cor(answer_topic_1, answer_topic_2), .by = in_sample)
s1 |>
ggplot(aes(answer_topic_1, answer_topic_2)) +
geom_point(aes(colour = in_sample), alpha = 0.4) +
stat_smooth(method = "lm", color = "black", level = 0) +
stat_smooth(
aes(color = in_sample),
method = "lm",
data = s1 |> filter(in_sample),
level = 0,
show.legend = FALSE
) +
scale_color_manual(
values = c("peachpuff", "salmon"),
guide = guide_legend(),
labels = c("Not in sample", "In sample")
) +
coord_fixed() +
labs(
y = "Opinion on topic 2",
x = "Opinion on topic 1",
title = "Selection process causes opinions to correlate negatively in the sample" |>
str_wrap(50),
color = NULL
) +
theme(legend.position = "bottom") +
canvas(width = 136, height = 160, units = "mm")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment