herbps10/primary_polling.R

## primary_polling.R
library(tidyverse)
library(extrafont)
library(rstan)

options(mc.cores = 4)

# Color scheme (flatuicolors.com)
flat_colors <- c(
  gray = "#7f8c8d",
  purple = "#8e44ad"
)

custom_theme <-
  cowplot::theme_cowplot() +
  theme(strip.background = element_rect(fill = "transparent"),
        strip.text = element_text(color = "black", family = "Open Sans"),
        axis.line = element_line(color = flat_colors['gray']),
        axis.ticks = element_line(color = flat_colors['gray']),
        axis.text = element_text(color = flat_colors['gray'], family = "Open Sans"),
        axis.title = element_text(color = flat_colors['gray']),
        legend.position = "bottom")

# Create a numeric index for each candidate
candidates <- tibble(candidate = c(
  "Bernard Sanders",
  "Beto O'Rourke",
  "Elizabeth Warren",
  "Joseph R. Biden Jr.",
  "Kamala D. Harris",
  "Pete Buttigieg"
)) %>% mutate(c = 1:n())

data_url <-
  "https://projects.fivethirtyeight.com/polls-page/president_primary_polls.csv"

# Read poll data from FiveThirtyEight
polls <- read_csv(data_url, guess_max = 10000) %>%
  rename(candidate = candidate_name) %>%

  # Filter for national polls of 2020 Democraticprimary
  filter(cycle == "2020", party == "DEM", is.na(state)) %>%

  # Filter for subset of candidates
  filter(candidate %in% candidates$candidate) %>%

  # Parse poll start time into date
  mutate(start_date = lubridate::mdy(start_date)) %>%

  # Join in candidate index
  left_join(candidates, by = c(candidate = "candidate")) %>%

  # Create time point index
  mutate(t = as.numeric(start_date - min(start_date)) + 1)

debates <- tribble(
  ~date,
  "6/26/19",
  "7/30/19"
) %>%
  mutate(date = lubridate::mdy(date),
         t = as.numeric(date - min(polls$start_date)) + 1)

ggplot(polls, aes(x = start_date, y = pct)) +
  geom_point(size = 0.5) +
  scale_x_date() +
  geom_vline(data = debates, aes(xintercept = date), lty = 2) +
  facet_wrap(~candidate) +
  labs(x = "Date", y = "Poll result", caption = "Data: FiveThirtyEight", color = "") +
  scale_y_continuous(labels = function(x) paste0(x, "%")) +
  scale_x_date() +
  scale_color_manual(values = c("#8e44ad")) +
  scale_fill_manual(values = c("#8e44ad")) +
  custom_theme

stan_data <- list(
  C = max(polls$c),
  T = max(polls$t),
  N = nrow(polls),
  y = round((polls$pct / 100) * polls$sample_size),
  sample_size = polls$sample_size,
  get_t_i = polls$t,
  get_c_i = polls$c,
  get_p_i = polls$p
)

fit <- sampling(model, stan_data, iter = 1000, control = list(adapt_delta = 0.8, max_treedepth = 8))

delta_post <- tidybayes::spread_draws(fit, delta[c, t]) %>%
  left_join(candidates)

delta_post_summarized <- delta_post %>%
  group_by(c, candidate, t) %>%
  summarize(posterior_median = quantile(delta, 0.5),
            posterior_2.5    = quantile(delta, 0.025),
            posterior_25     = quantile(delta, 0.25),
            posterior_75     = quantile(delta, 0.75),
            posterior_97.5   = quantile(delta, 0.975))


delta_post_summarized %>%
  # Convert time index back into date
  mutate(date = min(polls$start_date) + t - 1) %>%
  ggplot() +

  # Mark the democratic primary dates
  geom_vline(data = debates, aes(xintercept = date, color = "Democratic Debates"), lty = 2) +

  # 75% and 95% credibility intervals
  geom_ribbon(aes(x = date, y = posterior_median * 100, ymin = posterior_2.5 * 100, ymax = posterior_97.5 * 100),
              alpha = 0.25,
              fill = "#3498db") +
  geom_ribbon(aes(x = date, y = posterior_median * 100, ymin = posterior_25 * 100, ymax = posterior_75 * 100),
              alpha = 0.25,
              fill = "#3498db") +

  # Observed data points
  geom_point(data = polls, aes(x = start_date, y = pct), color = "#2c3e50", alpha = 1, shape = 19, size = 0.5) +

  # Posterior median
  geom_line(aes(x = date, y = posterior_median * 100), color = "#2980b9", size = 1) +

  # Separate panel for each candidate
  facet_wrap(~candidate, scales = "free") +

  # Formatting
  labs(x = "Date", y = "Poll result", caption = "Data: FiveThirtyEight", color = "") +
  scale_y_continuous(labels = function(x) paste0(x, "%")) +
  scale_x_date() +
  scale_color_manual(values = c("#8e44ad")) +
  scale_fill_manual(values = c("#8e44ad"))

## random_walk.stan
data {
  int<lower=0> T; // Number of timepoints
  int<lower=0> C; // Number of candidates
  int<lower=0> N; // Number of poll yervations

  int sample_size[N]; // Sample size of each poll
  int y[N]; // Number of respondents in poll for candidate (approximate)
  int<lower=1, upper=T> get_t_i[N]; // timepoint for ith observation
  int<lower=1, upper=C> get_c_i[N]; // candidate for ith observation
}
parameters {
  matrix[C, T] delta_logit; // Percent for candidate c at time t
  real<lower=0, upper=1> phi[N]; // Percent of participants in poll for candidate
  real<lower=0> tau; // Random walk variance
  real<lower=0,upper=0.5> sigma; // Overdispersion of observations
}
model {
  // Priors
  tau ~ normal(0, 0.2);
  sigma ~ normal(0, 1);

  // Random walk
  for(c in 1:C) {
    delta_logit[c, 2:T] ~ normal(delta_logit[c, 1:(T - 1)], tau);
  }

  // Observed data
  y ~ binomial(sample_size, phi);
  for(i in 1:N) {
    // Overdispersion
    delta_logit[get_c_i[i], get_t_i[i]] ~ normal(logit(phi[i]), sigma);
  }
}
generated quantities {
  matrix[C, T] delta = inv_logit(delta_logit);
}
	library(tidyverse)
	library(extrafont)
	library(rstan)

	options(mc.cores = 4)

	# Color scheme (flatuicolors.com)
	flat_colors <- c(
	gray = "#7f8c8d",
	purple = "#8e44ad"
	)

	custom_theme <-
	cowplot::theme_cowplot() +
	theme(strip.background = element_rect(fill = "transparent"),
	strip.text = element_text(color = "black", family = "Open Sans"),
	axis.line = element_line(color = flat_colors['gray']),
	axis.ticks = element_line(color = flat_colors['gray']),
	axis.text = element_text(color = flat_colors['gray'], family = "Open Sans"),
	axis.title = element_text(color = flat_colors['gray']),
	legend.position = "bottom")

	# Create a numeric index for each candidate
	candidates <- tibble(candidate = c(
	"Bernard Sanders",
	"Beto O'Rourke",
	"Elizabeth Warren",
	"Joseph R. Biden Jr.",
	"Kamala D. Harris",
	"Pete Buttigieg"
	)) %>% mutate(c = 1:n())

	data_url <-
	"https://projects.fivethirtyeight.com/polls-page/president_primary_polls.csv"

	# Read poll data from FiveThirtyEight
	polls <- read_csv(data_url, guess_max = 10000) %>%
	rename(candidate = candidate_name) %>%

	# Filter for national polls of 2020 Democraticprimary
	filter(cycle == "2020", party == "DEM", is.na(state)) %>%

	# Filter for subset of candidates
	filter(candidate %in% candidates$candidate) %>%

	# Parse poll start time into date
	mutate(start_date = lubridate::mdy(start_date)) %>%

	# Join in candidate index
	left_join(candidates, by = c(candidate = "candidate")) %>%

	# Create time point index
	mutate(t = as.numeric(start_date - min(start_date)) + 1)

	debates <- tribble(
	~date,
	"6/26/19",
	"7/30/19"
	) %>%
	mutate(date = lubridate::mdy(date),
	t = as.numeric(date - min(polls$start_date)) + 1)

	ggplot(polls, aes(x = start_date, y = pct)) +
	geom_point(size = 0.5) +
	scale_x_date() +
	geom_vline(data = debates, aes(xintercept = date), lty = 2) +
	facet_wrap(~candidate) +
	labs(x = "Date", y = "Poll result", caption = "Data: FiveThirtyEight", color = "") +
	scale_y_continuous(labels = function(x) paste0(x, "%")) +
	scale_x_date() +
	scale_color_manual(values = c("#8e44ad")) +
	scale_fill_manual(values = c("#8e44ad")) +
	custom_theme

	stan_data <- list(
	C = max(polls$c),
	T = max(polls$t),
	N = nrow(polls),
	y = round((polls$pct / 100) * polls$sample_size),
	sample_size = polls$sample_size,
	get_t_i = polls$t,
	get_c_i = polls$c,
	get_p_i = polls$p
	)

	fit <- sampling(model, stan_data, iter = 1000, control = list(adapt_delta = 0.8, max_treedepth = 8))

	delta_post <- tidybayes::spread_draws(fit, delta[c, t]) %>%
	left_join(candidates)

	delta_post_summarized <- delta_post %>%
	group_by(c, candidate, t) %>%
	summarize(posterior_median = quantile(delta, 0.5),
	posterior_2.5 = quantile(delta, 0.025),
	posterior_25 = quantile(delta, 0.25),
	posterior_75 = quantile(delta, 0.75),
	posterior_97.5 = quantile(delta, 0.975))



	delta_post_summarized %>%
	# Convert time index back into date
	mutate(date = min(polls$start_date) + t - 1) %>%
	ggplot() +

	# Mark the democratic primary dates
	geom_vline(data = debates, aes(xintercept = date, color = "Democratic Debates"), lty = 2) +

	# 75% and 95% credibility intervals
	geom_ribbon(aes(x = date, y = posterior_median * 100, ymin = posterior_2.5 * 100, ymax = posterior_97.5 * 100),
	alpha = 0.25,
	fill = "#3498db") +
	geom_ribbon(aes(x = date, y = posterior_median * 100, ymin = posterior_25 * 100, ymax = posterior_75 * 100),
	alpha = 0.25,
	fill = "#3498db") +

	# Observed data points
	geom_point(data = polls, aes(x = start_date, y = pct), color = "#2c3e50", alpha = 1, shape = 19, size = 0.5) +

	# Posterior median
	geom_line(aes(x = date, y = posterior_median * 100), color = "#2980b9", size = 1) +

	# Separate panel for each candidate
	facet_wrap(~candidate, scales = "free") +

	# Formatting
	labs(x = "Date", y = "Poll result", caption = "Data: FiveThirtyEight", color = "") +
	scale_y_continuous(labels = function(x) paste0(x, "%")) +
	scale_x_date() +
	scale_color_manual(values = c("#8e44ad")) +
	scale_fill_manual(values = c("#8e44ad"))
	data {
	int<lower=0> T; // Number of timepoints
	int<lower=0> C; // Number of candidates
	int<lower=0> N; // Number of poll yervations

	int sample_size[N]; // Sample size of each poll
	int y[N]; // Number of respondents in poll for candidate (approximate)
	int<lower=1, upper=T> get_t_i[N]; // timepoint for ith observation
	int<lower=1, upper=C> get_c_i[N]; // candidate for ith observation
	}
	parameters {
	matrix[C, T] delta_logit; // Percent for candidate c at time t
	real<lower=0, upper=1> phi[N]; // Percent of participants in poll for candidate
	real<lower=0> tau; // Random walk variance
	real<lower=0,upper=0.5> sigma; // Overdispersion of observations
	}
	model {
	// Priors
	tau ~ normal(0, 0.2);
	sigma ~ normal(0, 1);

	// Random walk
	for(c in 1:C) {
	delta_logit[c, 2:T] ~ normal(delta_logit[c, 1:(T - 1)], tau);
	}

	// Observed data
	y ~ binomial(sample_size, phi);
	for(i in 1:N) {
	// Overdispersion
	delta_logit[get_c_i[i], get_t_i[i]] ~ normal(logit(phi[i]), sigma);
	}
	}
	generated quantities {
	matrix[C, T] delta = inv_logit(delta_logit);
	}