Simulating traffic interactions as described in https://vista.io/blog/interaction-effects-in-online-experimentation to see which of two types of test is better suited to detecting them
library(tidyverse)

# Simulation and test function --------------------------------------------
#' n = number of user IDs to simulate
#' p = proportion of Experiment 1 Treatment traffic to exclude from Experiment 2
simulate_and_test <- function(n = 1000, p = 0) {
  # message("n = ", n, " and p = ", p)
  # Simulate a random data set of users in or out of two experiments
  d <- tibble(
    user_id = seq_len(n),
    exp_1 = sample(c("Control", "Treatment", NA), n, replace = TRUE),
    exp_2 = sample(c("Control", "Treatment", NA), n, replace = TRUE)
  )
  # If desired, remove users in the Experiment 1 Treatment from Experiment 2
  # with probability p, to simulate the relevant traffic interaction
  if (p > 0) {
    # Exclude a proportion p of Experiment 1 Treatment users from Experiment 2.
    # The !is.na(exp_1) guard keeps users outside Experiment 1 from being dropped
    # (NA == "Treatment" is NA, and if_else() would otherwise return NA for them).
    d <- d %>%
      mutate(exp_2 = if_else(!is.na(exp_1) & exp_1 == "Treatment" & !is.na(exp_2) & runif(n) < p,
                             NA_character_, exp_2))
  }
  # Count exposure combinations
  combo_count <- d %>%
    count(exp_1, exp_2)
  # First test: chi-square test of independence of Experiment 1 variant and being in/out of Experiment 2
  cell_values <- combo_count %>%
    filter(!is.na(exp_1)) %>%
    mutate(exp_2 = if_else(is.na(exp_2), "Out", "In")) %>%
    group_by(exp_1, exp_2) %>%
    summarise(n = sum(n), .groups = "drop")
  user_flow_matrix <- matrix(cell_values$n, nrow = 2)
  dimnames(user_flow_matrix) <- list(exp_2 = unique(cell_values$exp_2),
                                     exp_1 = unique(cell_values$exp_1))
  # user_flow_matrix  # inspect interactively if needed (a bare name won't print inside a function)
  p_value_independence <- chisq.test(user_flow_matrix)$p.value
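  # For reference, user_flow_matrix has the 2 x 2 layout sketched below (the n11..n22
  # labels are placeholders; row/column order follows the alphabetical grouping above):
  #            exp_1
  # exp_2       Control  Treatment
  #   In            n11        n12
  #   Out           n21        n22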
  # Second test: chi-square goodness-of-fit test of users falling into each combination
  # of Experiment 1 and Experiment 2 variants.
  # Note we're assuming a 50/50 split in both experiments, but other splits could be
  # accommodated (see the sketch below).
  cell_values <- combo_count %>%
    filter(!is.na(exp_1), !is.na(exp_2))
  combo_ns <- cell_values$n
  p_value_goodness <- chisq.test(x = combo_ns, p = c(.5*.5, .5*.5, .5*.5, .5*.5))$p.value
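  # (Illustrative sketch, not in the original: for unequal splits, replace the expected
  # probabilities. E.g., if Experiment 1 were split 70/30 and Experiment 2 were 50/50,
  # the cells ordered Control/Control, Control/Treatment, Treatment/Control,
  # Treatment/Treatment would use:)
  # expected_p <- c(.7 * .5, .7 * .5, .3 * .5, .3 * .5)
  # p_value_goodness <- chisq.test(x = combo_ns, p = expected_p)$p.value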
  # Return the p-values of each test
  c(independence_p = p_value_independence,
    goodness_p = p_value_goodness)
}
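#' (Quick illustrative usage, not part of the original script: a single call returns
#' a named vector of the two p-values; exact values depend on the RNG state.)
# simulate_and_test(n = 1000, p = 0.2)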
#' Assume we have two experiments: 1 and 2.
#' Both split traffic 50/50 between two variants, Control and Treatment.
#' Let's start with a case where everything is fine (no traffic interaction).
#' We'll run 1000 iterations of simulated data and compare the results.
i <- 1000
set.seed(20240603)
sims_unbiased <- map_df(seq_len(i), ~ simulate_and_test())

sims_unbiased %>%
  ggplot(aes(independence_p, goodness_p)) +
  geom_point()

sims_unbiased %>%
  summarise(independence_fpr = sum(independence_p < .05) / n(),
            goodness_fpr = sum(goodness_p < .05) / n())
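#' (Optional extra check, not in the original gist: with no interaction, both
#' p-value distributions should look roughly uniform.)
sims_unbiased %>%
  pivot_longer(everything(), names_to = "test", values_to = "p_value") %>%
  ggplot(aes(p_value)) +
  geom_histogram(bins = 20) +
  facet_wrap(~ test)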
# Both tests hold the nominal ~5% false positive rate, but their p-values are essentially uncorrelated.
#' Now assume the same setup BUT with an interaction problem:
#' traffic in the Treatment of Experiment 1 is less likely to reach Experiment 2.
#' We'll run 1000 iterations of simulated data and compare the results.
#' Across iterations, we'll vary the proportion of excluded traffic (p) from .05 to .3.
i <- 1000
set.seed(20240603)
p <- runif(i, min = .05, max = .3)
sims_biased <- map2_df(seq_len(i), p, ~ simulate_and_test(p = .y))

sims_biased %>%
  ggplot(aes(independence_p, goodness_p)) +
  geom_point()

sims_biased %>%
  ggplot(aes(independence_p)) +
  geom_histogram()

sims_biased %>%
  ggplot(aes(goodness_p)) +
  geom_histogram()

sims_biased %>%
  summarise(independence_fnr = sum(independence_p >= .05) / n(),
            goodness_fnr = sum(goodness_p >= .05) / n())
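#' (Illustrative addition, not in the original gist: binding the simulated exclusion
#' proportion back onto the results shows how detection depends on p. This relies on
#' map2_df() preserving iteration order; `excluded_p` and `p_bin` are helper columns
#' introduced here.)
bind_cols(sims_biased, tibble(excluded_p = p)) %>%
  mutate(p_bin = cut(excluded_p, breaks = seq(.05, .3, by = .05), include.lowest = TRUE)) %>%
  group_by(p_bin) %>%
  summarise(independence_detection = mean(independence_p < .05),
            goodness_detection = mean(goodness_p < .05),
            .groups = "drop")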
# While both tests detect the bias, there's a clear increase in false negatives
# for the goodness-of-fit test, suggesting the test of independence is more
# effective for this case.
# NOTE, however, that the test of independence naturally works with a larger sample:
# it uses all Experiment 1 users (including those excluded from Experiment 2),
# whereas the goodness-of-fit test only uses users who reach both experiments.
# If you bump the sample size up in the simulation (e.g., n = 100,000), the tests
# perform equally well over the current range of values for p.
# So, at very high sample sizes, the difference is likely less of a concern.
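#' (Sketch of that larger-n check, left commented out because it is slow to run;
#' 100 iterations is an assumption, used here only to illustrate the idea.)
# set.seed(20240603)
# p_large <- runif(100, min = .05, max = .3)
# sims_biased_large <- map2_df(seq_len(100), p_large,
#                              ~ simulate_and_test(n = 100000, p = .y))
# sims_biased_large %>%
#   summarise(independence_fnr = mean(independence_p >= .05),
#             goodness_fnr = mean(goodness_p >= .05))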