# statwonk/massive_logistic.R

## massive_logistic.R
library(tidyverse)
# Replicate settings: N Bernoulli(p) draws per replicate. The same p is reused
# below as the probability of retaining a y == 0 observation in the subsample.
N <- 1e4
p <- 0.03
# author: twitter.com/statwonk
# showing how cases can be discarded in logistic regression while preserving an unbiased estimator
seq_len(1e3) %>%
  map_dbl(function(replicate_id) {
    # replicate_id only drives the iteration; each replicate draws fresh data.
    outcome <- rbinom(N, 1, p)

    # Reference estimate: intercept-only logistic fit on all N draws.
    full_fit <- glm(y ~ 1, "binomial", tibble(y = outcome))

    # Subsample: every event is kept, non-events survive with probability p.
    retained <- outcome == 1 | runif(N) <= p
    subsample <- tibble(y = outcome[retained]) %>%
      # Events get weight 1; retained non-events are up-weighted by 1 / p.
      mutate(weights = if_else(y == 1, 1, 1 / p))
    weighted_fit <- glm(
      y ~ 1,
      "binomial",
      subsample,
      weights = subsample$weights
    )

    # Back-transform both intercepts to probabilities and return the gap
    # (subsample estimate minus full-data estimate).
    plogis(coef(weighted_fit)) - plogis(coef(full_fit))
  }) %>%
  ecdf() %>%
  plot(main = "The difference in estimated p\np% sample - all data")
# Vertical reference line at zero difference.
abline(v = 0)
	library(tidyverse)
	# Simulation settings: N Bernoulli draws per replicate with true event rate p.
	# p is also used below as the probability of keeping a y == 0 row.
	# NOTE(review): this tab-indented section repeats the simulation that appears
	# earlier in the file, so sourcing the file runs it twice; confirm the copy
	# is intended.
	N <- 1e4
	p <- 0.03
	# author: twitter.com/statwonk
	# showing how cases can be discarded in logistic regression while preserving an unbiased estimator
	seq_len(1e3) %>%
	  map_dbl(function(x) {
	    # x is the replicate index; it is not used inside the body.
	    y <- rbinom(N, 1, p)
	    tibble(
	      # Intercept-only logistic fit on every draw, mapped back to a probability.
	      all_data = tibble(y = y) %>% glm(y ~ 1, "binomial", .) %>% coef() %>% plogis(),
	      # Keep every y == 1 row and each y == 0 row with probability p.
	      # The mask uses the elementwise `|`; the backslash-escaped form that
	      # stood here before is not valid R and stopped the script from parsing.
	      sampled_data = tibble(y = y[y == 1 | runif(N) <= p]) %>%
	        # Weight 1 for events, 1 / p for the retained non-events.
	        mutate(weights = case_when(y == 1 ~ y * 1.0, TRUE ~ 1 / p)) %>%
	        glm(y ~ 1, "binomial", ., weights = .$weights) %>%
	        coef() %>%
	        plogis()
	    ) %>%
	      # Per-replicate gap: subsample estimate minus full-data estimate.
	      mutate(diff = sampled_data - all_data) %>%
	      pull(diff)
	  }) %>%
	  # Empirical CDF of the 1e3 per-replicate differences.
	  ecdf() %>%
	  plot(main = "The difference in estimated p\np% sample - all data")
	# Vertical reference line at zero difference.
	abline(v = 0)