markhwhiteii/weighting_exercise.R

## weighting_exercise.R
# define functions -------------------------------------------------------------
inv_logit <- function(x) exp(x) / (exp(x) + 1)

sim_pop <- function(N = 5000, b_min = 0, b_max = 1.5) {
  X <- cbind(1, rbinom(N, 1, .5), rbinom(N, 1, .5))
  B <- c(0, runif(1, b_min, b_max), runif(1, b_min, b_max))
  y <- rbinom(N, 1, inv_logit(X %*% B))
  return(as.data.frame(cbind(X[, -1], y)))
}

get_sample <- function(pop, p = c(.02, .02, .02, .02)) {
  p <- ifelse(rowSums(pop[, 1:2]) == 2, p[1],
         ifelse(rowSums(pop[, 1:2]) == 0, p[2],
                ifelse(pop[, 1] == 1, p[3], p[4])))
  sampled <- as.logical(rbinom(nrow(pop), 1, p))
  return(pop[sampled, ])
}

get_weights <- function(dat, pop) {
  tmp1 <- as.data.frame(prop.table(table(pop$V1)))
  tmp1$Freq <- tmp1$Freq * nrow(dat)
  names(tmp1)[1] <- "V1"
  tmp2 <- as.data.frame(prop.table(table(pop$V2)))
  tmp2$Freq <- tmp2$Freq * nrow(dat)
  names(tmp2)[1] <- "V2"

  result <- invisible(suppressWarnings(survey::svydesign(~ 1, data = dat)))
  result <- survey::rake(result, list(~V1, ~V2), list(tmp1, tmp2))

  return(weights(result))

}

run_iter <- function(pop, non_rand_p = c(.065, .005, .005, .005)) {
  pop_summary <- summary(glm(y ~ 1, binomial, pop))$coef[, 1:2]
  rnd_summary <- summary(glm(y ~ 1, binomial, get_sample(pop)))$coef[, 1:2]

  tmp <- get_sample(pop, non_rand_p)
  nrd_summary <- summary(glm(y ~ 1, binomial, tmp))$coef[, 1:2]
  wts <- get_weights(tmp, pop)
  wtd_summary <- summary(glm(y ~ 1, binomial, tmp, wts))$coef[, 1:2]

  return(list(
    population = pop_summary,
    rand_sample = rnd_summary,
    nonrand_sample = nrd_summary,
    weighted_sample = wtd_summary
  ))
}

# get results ------------------------------------------------------------------
library(tidyverse)
set.seed(1839)
iter <- 5000
results <- lapply(seq_len(iter), function(i) {
  do.call(rbind, run_iter(sim_pop())) %>%
    as.data.frame() %>%
    rownames_to_column("type") %>%
    gather("stat", "est", -type) %>%
    mutate(iter = i)
})
results <- do.call(bind_rows, results)

# estimates --------------------------------------------------------------------
(dist_ests <- results %>%
  filter(stat == "Estimate") %>%
  group_by(type) %>%
  summarise(mean = mean(est)))

results %>%
  filter(stat == "Estimate") %>%
  ggplot(aes(x = est)) +
  geom_density(alpha = .5) +
  facet_wrap(~ type) +
  geom_vline(aes(xintercept = dist_ests$mean[dist_ests$type == "population"]),
             linetype = "dotted")

# standard errors --------------------------------------------------------------
results %>%
  filter(stat == "Std. Error" & type != "population") %>%
  group_by(type) %>%
  summarise(avg_se = mean(est))
	# define functions -------------------------------------------------------------
	inv_logit <- function(x) exp(x) / (exp(x) + 1)

	sim_pop <- function(N = 5000, b_min = 0, b_max = 1.5) {
	X <- cbind(1, rbinom(N, 1, .5), rbinom(N, 1, .5))
	B <- c(0, runif(1, b_min, b_max), runif(1, b_min, b_max))
	y <- rbinom(N, 1, inv_logit(X %*% B))
	return(as.data.frame(cbind(X[, -1], y)))
	}

	get_sample <- function(pop, p = c(.02, .02, .02, .02)) {
	p <- ifelse(rowSums(pop[, 1:2]) == 2, p[1],
	ifelse(rowSums(pop[, 1:2]) == 0, p[2],
	ifelse(pop[, 1] == 1, p[3], p[4])))
	sampled <- as.logical(rbinom(nrow(pop), 1, p))
	return(pop[sampled, ])
	}

	get_weights <- function(dat, pop) {
	tmp1 <- as.data.frame(prop.table(table(pop$V1)))
	tmp1$Freq <- tmp1$Freq * nrow(dat)
	names(tmp1)[1] <- "V1"
	tmp2 <- as.data.frame(prop.table(table(pop$V2)))
	tmp2$Freq <- tmp2$Freq * nrow(dat)
	names(tmp2)[1] <- "V2"

	result <- invisible(suppressWarnings(survey::svydesign(~ 1, data = dat)))
	result <- survey::rake(result, list(~V1, ~V2), list(tmp1, tmp2))

	return(weights(result))

	}

	run_iter <- function(pop, non_rand_p = c(.065, .005, .005, .005)) {
	pop_summary <- summary(glm(y ~ 1, binomial, pop))$coef[, 1:2]
	rnd_summary <- summary(glm(y ~ 1, binomial, get_sample(pop)))$coef[, 1:2]

	tmp <- get_sample(pop, non_rand_p)
	nrd_summary <- summary(glm(y ~ 1, binomial, tmp))$coef[, 1:2]
	wts <- get_weights(tmp, pop)
	wtd_summary <- summary(glm(y ~ 1, binomial, tmp, wts))$coef[, 1:2]

	return(list(
	population = pop_summary,
	rand_sample = rnd_summary,
	nonrand_sample = nrd_summary,
	weighted_sample = wtd_summary
	))
	}

	# get results ------------------------------------------------------------------
	library(tidyverse)
	set.seed(1839)
	iter <- 5000
	results <- lapply(seq_len(iter), function(i) {
	do.call(rbind, run_iter(sim_pop())) %>%
	as.data.frame() %>%
	rownames_to_column("type") %>%
	gather("stat", "est", -type) %>%
	mutate(iter = i)
	})
	results <- do.call(bind_rows, results)

	# estimates --------------------------------------------------------------------
	(dist_ests <- results %>%
	filter(stat == "Estimate") %>%
	group_by(type) %>%
	summarise(mean = mean(est)))

	results %>%
	filter(stat == "Estimate") %>%
	ggplot(aes(x = est)) +
	geom_density(alpha = .5) +
	facet_wrap(~ type) +
	geom_vline(aes(xintercept = dist_ests$mean[dist_ests$type == "population"]),
	linetype = "dotted")

	# standard errors --------------------------------------------------------------
	results %>%
	filter(stat == "Std. Error" & type != "population") %>%
	group_by(type) %>%
	summarise(avg_se = mean(est))