nstrayer/simulate_mnar_data.R

## simulate_mnar_data.R
library(tidyverse)

n <-  150
sensitivity_threshold <- 5
data <- tibble(
  a = rgamma(n = n, shape = 5, rate = 0.5),
  b = rgamma(n = n, shape = a/2, rate = 0.5)
)

generate_missing_data <- function(i){
  data_w_missing <- data %>%
    mutate(obs = 1:n()) %>%
    pivot_longer(c(a, b), names_to = "variable") %>%
    mutate(
      distance_from_threshold = sensitivity_threshold - value,
      prob_of_missing = ifelse(distance_from_threshold > 0,
                               distance_from_threshold/sensitivity_threshold,
                               0),
      missing = as.logical(rbinom(n = n(), size = 1, prob = prob_of_missing)),
      value = ifelse(missing, NA, value)
    ) %>%
    pivot_wider(
        id_cols = obs,
        names_from = variable,
        values_from = value
    ) %>%
    mutate(type = "no impute")

  bind_rows(
    data_w_missing,
      mutate(data_w_missing,
             a = ifelse(is.na(a), 0, a),
             b = ifelse(is.na(b), 0, b),
             type = "impute zero"),
      mutate(data_w_missing,
             a = ifelse(is.na(a), mean(a, na.rm = TRUE), a),
             b = ifelse(is.na(b), mean(b, na.rm = TRUE), b),
             type = "impute mean"),
      mutate(data_w_missing,
             a = ifelse(is.na(a), median(a, na.rm = TRUE), a),
             b = ifelse(is.na(b), median(b, na.rm = TRUE), b),
             type = "impute median"),
      mutate(data_w_missing,
             a = ifelse(is.na(a), min(a, na.rm = TRUE), a),
             b = ifelse(is.na(b), min(b, na.rm = TRUE), b),
             type = "impute min")
    ) %>%
    mutate(i = i)
}

num_sims <- 100

correlations <- 1:num_sims %>%
  map_dfr(generate_missing_data) %>%
  group_by(type, i) %>%
  summarise(
    pearson = cor(a,b, use = "complete.obs"),
    spearman = cor(a,b, method = "spearman", use = "complete.obs")
  ) %>%
  pivot_longer(c(pearson, spearman), names_to = "correlation_type")

true_correlations <- tibble(
  correlation_type = c("pearson", "spearman"),
  truth = c(cor(data$a, data$b), cor(data$a, data$b, method = "spearman")))

ggplot(correlations, aes(x = value)) +
  geom_density(fill = "steelblue", color = "steelblue") +
  facet_grid(type~correlation_type) +
  geom_vline(
    data = true_correlations,
    aes(xintercept = truth),
    color = "black"
  ) +
  labs(title = "Distribution of correlations by imputation type",
       subtitle = "Missingness structure is related to distance from minimum threashold\nVertical line is true correlation value",
       x = "correlation")
	library(tidyverse)

	n <- 150
	sensitivity_threshold <- 5
	data <- tibble(
	a = rgamma(n = n, shape = 5, rate = 0.5),
	b = rgamma(n = n, shape = a/2, rate = 0.5)
	)

	generate_missing_data <- function(i){
	data_w_missing <- data %>%
	mutate(obs = 1:n()) %>%
	pivot_longer(c(a, b), names_to = "variable") %>%
	mutate(
	distance_from_threshold = sensitivity_threshold - value,
	prob_of_missing = ifelse(distance_from_threshold > 0,
	distance_from_threshold/sensitivity_threshold,
	0),
	missing = as.logical(rbinom(n = n(), size = 1, prob = prob_of_missing)),
	value = ifelse(missing, NA, value)
	) %>%
	pivot_wider(
	id_cols = obs,
	names_from = variable,
	values_from = value
	) %>%
	mutate(type = "no impute")

	bind_rows(
	data_w_missing,
	mutate(data_w_missing,
	a = ifelse(is.na(a), 0, a),
	b = ifelse(is.na(b), 0, b),
	type = "impute zero"),
	mutate(data_w_missing,
	a = ifelse(is.na(a), mean(a, na.rm = TRUE), a),
	b = ifelse(is.na(b), mean(b, na.rm = TRUE), b),
	type = "impute mean"),
	mutate(data_w_missing,
	a = ifelse(is.na(a), median(a, na.rm = TRUE), a),
	b = ifelse(is.na(b), median(b, na.rm = TRUE), b),
	type = "impute median"),
	mutate(data_w_missing,
	a = ifelse(is.na(a), min(a, na.rm = TRUE), a),
	b = ifelse(is.na(b), min(b, na.rm = TRUE), b),
	type = "impute min")
	) %>%
	mutate(i = i)
	}

	num_sims <- 100

	correlations <- 1:num_sims %>%
	map_dfr(generate_missing_data) %>%
	group_by(type, i) %>%
	summarise(
	pearson = cor(a,b, use = "complete.obs"),
	spearman = cor(a,b, method = "spearman", use = "complete.obs")
	) %>%
	pivot_longer(c(pearson, spearman), names_to = "correlation_type")

	true_correlations <- tibble(
	correlation_type = c("pearson", "spearman"),
	truth = c(cor(data$a, data$b), cor(data$a, data$b, method = "spearman")))

	ggplot(correlations, aes(x = value)) +
	geom_density(fill = "steelblue", color = "steelblue") +
	facet_grid(type~correlation_type) +
	geom_vline(
	data = true_correlations,
	aes(xintercept = truth),
	color = "black"
	) +
	labs(title = "Distribution of correlations by imputation type",
	subtitle = "Missingness structure is related to distance from minimum threashold\nVertical line is true correlation value",
	x = "correlation")