Simulating traffic interactions as described in https://vista.io/blog/interaction-effects-in-online-experimentation to see which of two types of test is better suited to detecting them
library(tidyverse)

# Simulation and test function --------------------------------------------
#' n = number of user IDs to simulate
#' p = proportion of Experiment 1 Treatment traffic to exclude from Experiment 2
simulate_and_test <- function(n = 1000, p = 0) {
  # message("n = ", n, " and p = ", p)
  # Simulate a random data set of users in or out of two experiments
  d <- tibble(
    user_id = seq_len(n),
    exp_1 = sample(c("Control", "Treatment", NA), n, replace = TRUE),
    exp_2 = sample(c("Control", "Treatment", NA), n, replace = TRUE)
  )
  # If desired, remove users in the Experiment 1 Treatment from Experiment 2
  # with probability p, to simulate the relevant traffic interaction
  if (p > 0) {
    # Exclude a proportion p of Experiment 1 Treatment users from Experiment 2.
    # The !is.na(exp_1) guard keeps users outside Experiment 1 from being dropped
    # (NA == "Treatment" is NA, and if_else() would otherwise return NA for them).
    d <- d %>%
      mutate(exp_2 = if_else(!is.na(exp_1) & exp_1 == "Treatment" & !is.na(exp_2) & runif(n) < p,
                             NA_character_, exp_2))
  }
  # Count exposure combinations
  combo_count <- d %>%
    count(exp_1, exp_2)
  # First test: chi-square test of independence of Experiment 1 variant and being in/out of Experiment 2
  cell_values <- combo_count %>%
    filter(!is.na(exp_1)) %>%
    mutate(exp_2 = if_else(is.na(exp_2), "Out", "In")) %>%
    group_by(exp_1, exp_2) %>%
    summarise(n = sum(n), .groups = "drop")
  user_flow_matrix <- matrix(cell_values$n, nrow = 2)
  dimnames(user_flow_matrix) <- list(exp_2 = unique(cell_values$exp_2),
                                     exp_1 = unique(cell_values$exp_1))
  # user_flow_matrix  # inspect interactively if needed (a bare name won't print inside a function)
  p_value_independence <- chisq.test(user_flow_matrix)$p.value
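  # For reference, user_flow_matrix has the 2 x 2 layout sketched below (the n11..n22
  # labels are placeholders; row/column order follows the alphabetical grouping above):
  #            exp_1
  # exp_2       Control  Treatment
  #   In            n11        n12
  #   Out           n21        n22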
  # Second test: chi-square goodness-of-fit test of users falling into each combination
  # of Experiment 1 and Experiment 2 variants.
  # Note we're assuming a 50/50 split in both experiments, but other splits could be
  # accommodated (see the sketch below).
  cell_values <- combo_count %>%
    filter(!is.na(exp_1), !is.na(exp_2))
  combo_ns <- cell_values$n
  p_value_goodness <- chisq.test(x = combo_ns, p = c(.5*.5, .5*.5, .5*.5, .5*.5))$p.value
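  # (Illustrative sketch, not in the original: for unequal splits, replace the expected
  # probabilities. E.g., if Experiment 1 were split 70/30 and Experiment 2 were 50/50,
  # the cells ordered Control/Control, Control/Treatment, Treatment/Control,
  # Treatment/Treatment would use:)
  # expected_p <- c(.7 * .5, .7 * .5, .3 * .5, .3 * .5)
  # p_value_goodness <- chisq.test(x = combo_ns, p = expected_p)$p.value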
  # Return the p-values of each test
  c(independence_p = p_value_independence,
    goodness_p = p_value_goodness)
}
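#' (Quick illustrative usage, not part of the original script: a single call returns
#' a named vector of the two p-values; exact values depend on the RNG state.)
# simulate_and_test(n = 1000, p = 0.2)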
#' Assume we have two experiments: 1 and 2.
#' Both split traffic 50/50 between two variants, Control and Treatment.
#' Let's start with a case where everything is fine (no traffic interaction).
#' We'll run 1000 iterations of simulated data and compare the results.
i <- 1000
set.seed(20240603)
sims_unbiased <- map_df(seq_len(i), ~ simulate_and_test())

sims_unbiased %>%
  ggplot(aes(independence_p, goodness_p)) +
  geom_point()

sims_unbiased %>%
  summarise(independence_fpr = sum(independence_p < .05) / n(),
            goodness_fpr = sum(goodness_p < .05) / n())
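#' (Optional extra check, not in the original gist: with no interaction, both
#' p-value distributions should look roughly uniform.)
sims_unbiased %>%
  pivot_longer(everything(), names_to = "test", values_to = "p_value") %>%
  ggplot(aes(p_value)) +
  geom_histogram(bins = 20) +
  facet_wrap(~ test)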
# Both tests hold the nominal ~5% false positive rate, but their p-values are essentially uncorrelated.
#' Now assume the same setup BUT with an interaction problem:
#' traffic in the Treatment of Experiment 1 is less likely to reach Experiment 2.
#' We'll run 1000 iterations of simulated data and compare the results.
#' Across iterations, we'll vary the proportion of excluded traffic (p) from .05 to .3.
i <- 1000
set.seed(20240603)
p <- runif(i, min = .05, max = .3)
sims_biased <- map2_df(seq_len(i), p, ~ simulate_and_test(p = .y))

sims_biased %>%
  ggplot(aes(independence_p, goodness_p)) +
  geom_point()

sims_biased %>%
  ggplot(aes(independence_p)) +
  geom_histogram()

sims_biased %>%
  ggplot(aes(goodness_p)) +
  geom_histogram()

sims_biased %>%
  summarise(independence_fnr = sum(independence_p >= .05) / n(),
            goodness_fnr = sum(goodness_p >= .05) / n())
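#' (Illustrative addition, not in the original gist: binding the simulated exclusion
#' proportion back onto the results shows how detection depends on p. This relies on
#' map2_df() preserving iteration order; `excluded_p` and `p_bin` are helper columns
#' introduced here.)
bind_cols(sims_biased, tibble(excluded_p = p)) %>%
  mutate(p_bin = cut(excluded_p, breaks = seq(.05, .3, by = .05), include.lowest = TRUE)) %>%
  group_by(p_bin) %>%
  summarise(independence_detection = mean(independence_p < .05),
            goodness_detection = mean(goodness_p < .05),
            .groups = "drop")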
# While both tests detect the bias, there's a clear increase in false negatives
# for the goodness-of-fit test, suggesting the test of independence is more
# effective for this case.
# NOTE, however, that the test of independence naturally works with a larger sample:
# it uses all Experiment 1 users (including those excluded from Experiment 2),
# whereas the goodness-of-fit test only uses users who reach both experiments.
# If you bump the sample size up in the simulation (e.g., n = 100,000), the tests
# perform equally well over the current range of values for p.
# So, at very high sample sizes, the difference is likely less of a concern.
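#' (Sketch of that larger-n check, left commented out because it is slow to run;
#' 100 iterations is an assumption, used here only to illustrate the idea.)
# set.seed(20240603)
# p_large <- runif(100, min = .05, max = .3)
# sims_biased_large <- map2_df(seq_len(100), p_large,
#                              ~ simulate_and_test(n = 100000, p = .y))
# sims_biased_large %>%
#   summarise(independence_fnr = mean(independence_p >= .05),
#             goodness_fnr = mean(goodness_p >= .05))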