ericpgreen/mrp_example.R

## mrp_example.R
# MRP for non-representative samples
# http://research.nivi.io/posts/2020-12-08-mister-p-helps-us-understand-vaccine-hesitancy/
# Eric Green (adapted from https://www.monicaalexander.com/posts/2019-08-07-mrp/)

# load packages
  library(tidyverse)
  library(ggtext)
  library(brms)
  library(tidybayes)

  set.seed(123456)


# notes
# -----------------------------------------------------------------------------

# you need two datasets:
# 1. population weights from a census or representative survey (pop_df)
# 2. data from your non-representative sample (sample_obs)

# important!!: the datasets must share in common the variables you will use to
#              post-stratify


# generate example data (you'll bring your own data and will skip this)
# -----------------------------------------------------------------------------

# get the population data and custom ggplot function
  load(url("https://www.dropbox.com/s/0u3lblqfv85hch3/pop_df.RData?dl=1"))

  pop_df <- pop_df %>%
    mutate(weight=1)

# calculate response frequency in the population
  overall_pop <- pop_df %>%
    group_by(dv) %>%
    count() %>%
    ungroup() %>%
    mutate(p = n/sum(n)) %>%
    mutate(source = "true population") %>%
    dplyr::select(source, dv, p)

# draw a convenience sample that is mostly young men
# ----------------------------------------------------
# 50 older men
  sample_obs_m1 <- pop_df %>%
    filter(gender=="male") %>%
    filter(age>35) %>%
    sample_n(50)

# 400 younger men
  sample_obs_m2 <- pop_df %>%
    filter(gender=="male") %>%
    filter(age<=35) %>%
    sample_n(400)

# 45 women
  sample_obs_f <- pop_df %>%
    filter(gender=="female") %>%
    sample_n(45)

# 5 individuals who identified as non-binary
  sample_obs_n <- pop_df %>%
    filter(gender=="non-binary") %>%
    sample_n(5)

# combine to make a sample of 500
  sample_obs <- sample_obs_f %>%
    bind_rows(sample_obs_m1) %>%
    bind_rows(sample_obs_m2) %>%
    bind_rows(sample_obs_n)


# your work starts here
# -----------------------------------------------------------------------------

# calculate cell proportions in the population data
  overall_prop <- pop_df %>%
    group_by(gender, ageCat) %>%         # modify for your example
    summarise(n = sum(weight))%>%        # use the weight var provided to you
    ungroup() %>%
    dplyr::select(gender, ageCat,        # modify for your example
                  n) %>%
    mutate(overall="overall") %>%
    group_by(overall) %>%
    mutate(prop = n/sum(n))  %>%
    ungroup()

# fit oridinal regression
# if your outcome is binary follow
# https://www.monicaalexander.com/posts/2019-08-07-mrp/

  fit <- brm(dv ~ (1|gender) + (1|ageCat), # modify for your example
             data = sample_obs,
             family = cumulative("probit"),
             control = list(adapt_delta = 0.99),
             cores = 4, # depends on your machine
             backend = "cmdstanr") # or rstan

# get point estimates and credible intervals for ordinal outcome
  overall_sample_est <- fit %>%
    add_predicted_draws(newdata = overall_prop,
                        allow_new_levels=TRUE) %>%
    mutate(
      dv = .prediction,
      p = 1
    ) %>%
    mutate(.prediction_prop = p*prop) %>%
    group_by(.draw, dv) %>%
    summarise(p = sum(.prediction_prop)) %>%
    group_by(dv) %>%
    mean_qi(p) %>%
  # modify for your example
    mutate(source = "biased sample, adjusted") %>%
    dplyr::select(source, dv, p) %>%
    mutate(dv = factor(dv,
                       levels=c("never", "rarely", "sometimes", "often"),
                       labels=c("never", "rarely", "sometimes", "often"),
                       ordered = TRUE))
	# MRP for non-representative samples
	# http://research.nivi.io/posts/2020-12-08-mister-p-helps-us-understand-vaccine-hesitancy/
	# Eric Green (adapted from https://www.monicaalexander.com/posts/2019-08-07-mrp/)

	# load packages
	library(tidyverse)
	library(ggtext)
	library(brms)
	library(tidybayes)

	set.seed(123456)


	# notes
	# -----------------------------------------------------------------------------

	# you need two datasets:
	# 1. population weights from a census or representative survey (pop_df)
	# 2. data from your non-representative sample (sample_obs)

	# important!!: the datasets must share in common the variables you will use to
	# post-stratify


	# generate example data (you'll bring your own data and will skip this)
	# -----------------------------------------------------------------------------

	# get the population data and custom ggplot function
	load(url("https://www.dropbox.com/s/0u3lblqfv85hch3/pop_df.RData?dl=1"))

	pop_df <- pop_df %>%
	mutate(weight=1)

	# calculate response frequency in the population
	overall_pop <- pop_df %>%
	group_by(dv) %>%
	count() %>%
	ungroup() %>%
	mutate(p = n/sum(n)) %>%
	mutate(source = "true population") %>%
	dplyr::select(source, dv, p)

	# draw a convenience sample that is mostly young men
	# ----------------------------------------------------
	# 50 older men
	sample_obs_m1 <- pop_df %>%
	filter(gender=="male") %>%
	filter(age>35) %>%
	sample_n(50)

	# 400 younger men
	sample_obs_m2 <- pop_df %>%
	filter(gender=="male") %>%
	filter(age<=35) %>%
	sample_n(400)

	# 45 women
	sample_obs_f <- pop_df %>%
	filter(gender=="female") %>%
	sample_n(45)

	# 5 individuals who identified as non-binary
	sample_obs_n <- pop_df %>%
	filter(gender=="non-binary") %>%
	sample_n(5)

	# combine to make a sample of 500
	sample_obs <- sample_obs_f %>%
	bind_rows(sample_obs_m1) %>%
	bind_rows(sample_obs_m2) %>%
	bind_rows(sample_obs_n)


	# your work starts here
	# -----------------------------------------------------------------------------

	# calculate cell proportions in the population data
	overall_prop <- pop_df %>%
	group_by(gender, ageCat) %>% # modify for your example
	summarise(n = sum(weight))%>% # use the weight var provided to you
	ungroup() %>%
	dplyr::select(gender, ageCat, # modify for your example
	n) %>%
	mutate(overall="overall") %>%
	group_by(overall) %>%
	mutate(prop = n/sum(n)) %>%
	ungroup()

	# fit oridinal regression
	# if your outcome is binary follow
	# https://www.monicaalexander.com/posts/2019-08-07-mrp/

	fit <- brm(dv ~ (1\|gender) + (1\|ageCat), # modify for your example
	data = sample_obs,
	family = cumulative("probit"),
	control = list(adapt_delta = 0.99),
	cores = 4, # depends on your machine
	backend = "cmdstanr") # or rstan

	# get point estimates and credible intervals for ordinal outcome
	overall_sample_est <- fit %>%
	add_predicted_draws(newdata = overall_prop,
	allow_new_levels=TRUE) %>%
	mutate(
	dv = .prediction,
	p = 1
	) %>%
	mutate(.prediction_prop = p*prop) %>%
	group_by(.draw, dv) %>%
	summarise(p = sum(.prediction_prop)) %>%
	group_by(dv) %>%
	mean_qi(p) %>%
	# modify for your example
	mutate(source = "biased sample, adjusted") %>%
	dplyr::select(source, dv, p) %>%
	mutate(dv = factor(dv,
	levels=c("never", "rarely", "sometimes", "often"),
	labels=c("never", "rarely", "sometimes", "often"),
	ordered = TRUE))