herbps10/primary_polls.R

## primary_polls.R
library(tidyverse)
library(rstan)
library(extrafont)

options(mc.cores = 4)


flat_colors <- c(
  asbestos = "#7f8c8d",
  purple = "#8e44ad"
)

my_theme <- theme(
  strip.background = element_rect(fill = "transparent"),
  strip.text = element_text(color = "black", family = "Open Sans"),
  axis.line = element_line(color = flat_colors['asbestos']),
  axis.ticks = element_line(color = flat_colors['asbestos']),
  axis.text = element_text(color = flat_colors['asbestos'], family = "Open Sans"),
  axis.title = element_text(color = flat_colors['asbestos']),
  legend.position = "bottom"
)

candidates <- tibble(candidate = c(
  #"Amy Klobuchar",
  #"Andrew Yang",
  "Bernard Sanders",
  "Beto O'Rourke",
  #"Bill de Blasio",
  #"Cory A. Booker",
  "Elizabeth Warren",
  #"Eric Swalwell",
  #"Jay Robert Inslee",
  #"John Hickenlooper",
  #"John K. Delaney",
  "Joseph R. Biden Jr.",
  #"Juli?n Castro",
  "Kamala D. Harris",
  #"Kirsten E. Gillibrand",
  #"Marianne Williamson",
  #"Michael F. Bennet",
  "Pete Buttigieg"
  #"Tim Ryan",
  #"Tulsi Gabbard"
)) %>% mutate(c = 1:n())


polls <- read_csv("https://projects.fivethirtyeight.com/polls-page/president_primary_polls.csv", guess_max = 100000) %>%
  filter(cycle == "2020", party == "DEM", is.na(state)) %>%
  dplyr::select(question_id, poll_id, start_date, sample_size, candidate_name, pct, pollster, fte_grade) %>%
  mutate(start_date = lubridate::mdy(start_date)) %>%
  rename(candidate = candidate_name) %>%
  filter(candidate %in% candidates$candidate) %>%
  #filter(lubridate::year(start_date) > 2018, lubridate::month(start_date) > 6) %>%
  left_join(candidates, by = c(candidate = "candidate")) %>%
  mutate(t = as.numeric(start_date - min(start_date)) + 1)

polls <- polls %>%
  left_join(pollsters)

debates <- tribble(
  ~date,
  "6/26/19",
  #"6/27/19",
  "7/30/19"
  #"7/31/19"
) %>%
  mutate(date = lubridate::mdy(date),
         t = as.numeric(date - min(polls$start_date)) + 1)

#
# Vectorized with shocks
#

shocks <- tribble(
  ~date,
  "6/27/19",
  "7/31/19"
) %>%
  mutate(date = lubridate::mdy(date),
         t = as.numeric(date - min(polls$start_date)) + 1)

C = max(polls$c)
T = max(polls$t)
N = nrow(polls)
obs = round((polls$pct / 100) * polls$sample_size)
sample_size = polls$sample_size

# Each timepoint should be correlated with the timepoint before it,
# by candidates
index1 <- matrix(1:(T*C), ncol = T, byrow = TRUE) %>%
  # Remove the shock timepoints
  .[, -(shocks$t + 1)] %>%
  # Remove the first timepoint
  .[, -1] %>%
  t() %>%
  as.vector()

index2 <- matrix(1:(T * C), ncol = T, byrow = TRUE) %>%
  # Remove the last timepoint
  .[, -T] %>%
  # Remove the shock timepoints
  .[, -shocks$t] %>%
  t() %>%
  as.vector()

# Y_logit[index1] should be correlated with Y_logit[index2]

obs_index <- polls$t + (polls$c - 1) * T

stan_data <- list(
  C = C,
  T = T,
  N = N,
  obs_index = obs_index,
  J = length(index1),
  index1 = index1,
  index2 = index2,
  sample_size = polls$sample_size,
  obs = round((polls$pct / 100) * polls$sample_size)
)

start.time2 <- Sys.time()
fit2 <- sampling(model2, stan_data, iter = 1000)
end.time2 <- Sys.time()

Y_post <- tidybayes::spread_draws(fit2, Y[i]) %>%
  mutate(t = ((i - 1) %% T + 1), c = ceiling(i / T))

Y_post_summarized <- Y_post %>%
  group_by(c, t) %>%
  summarize(posterior_median = quantile(Y, 0.5),
            posterior_2.5    = quantile(Y, 0.025),
            posterior_25    = quantile(Y, 0.25),
            posterior_75   = quantile(Y, 0.75),
            posterior_97.5   = quantile(Y, 0.975)) %>%
  mutate(date = min(polls$start_date) + t - 1) %>%
  left_join(candidates)

ggplot(Y_post_summarized, aes(x = date, y = posterior_median)) +
  geom_vline(data = debates, aes(xintercept = date, color = "Democratic Debates"), lty = 2) +
  geom_ribbon(aes(ymin = posterior_2.5, ymax = posterior_97.5), alpha = 0.2, fill = "#3498db") +
  geom_ribbon(aes(ymin = posterior_25, ymax = posterior_75), alpha = 0.2, fill = "#3498db") +
  geom_point(data = polls, aes(x = start_date, y = pct / 100), size = 0.5, color = "#2c3e50") +
  geom_line(color = "#2980b9", size = 1) +
  facet_wrap(~candidate) +
  labs(x = "Date", y = "Poll result", caption = "Data: FiveThirtyEight", color = "") +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  scale_x_date() +
  scale_color_manual(values = c("#8e44ad")) +
  scale_fill_manual(values = c("#8e44ad")) +
  cowplot::theme_cowplot() +
  my_theme

## random_walk_vectorized.stan
data {
  int<lower=0> T; // Number of timepoints
  int<lower=0> C; // Number of candidates
  int<lower=0> N; // Number of poll observations

  int sample_size[N]; // Sample size of each poll
  int obs[N]; // Number of respondents in poll for candidate (approximate)

  int obs_index[N];

  int J;
  int index1[J];
  int index2[J];
}
parameters {
  real Y_logit[T * C]; // Percent for candidate c at time t
  real<lower=0, upper=1> pct[N]; // Percent of participants in poll for candidate
  real<lower=0> tau; // Random walk variance
  real<lower=0,upper=0.5> sigma; // Overdispersion of observations
}
model {
  // Priors
  tau ~ normal(0, 0.2);
  sigma ~ normal(0, 1);

  // Random walk
  Y_logit[index1] ~ normal(Y_logit[index2], tau);

  // Observed data
  obs ~ binomial(sample_size, pct);
  Y_logit[obs_index] ~ normal(logit(pct), sigma);
}
generated quantities {
  real Y[C * T] = inv_logit(Y_logit);
}
	library(tidyverse)
	library(rstan)
	library(extrafont)

	options(mc.cores = 4)


	flat_colors <- c(
	asbestos = "#7f8c8d",
	purple = "#8e44ad"
	)

	my_theme <- theme(
	strip.background = element_rect(fill = "transparent"),
	strip.text = element_text(color = "black", family = "Open Sans"),
	axis.line = element_line(color = flat_colors['asbestos']),
	axis.ticks = element_line(color = flat_colors['asbestos']),
	axis.text = element_text(color = flat_colors['asbestos'], family = "Open Sans"),
	axis.title = element_text(color = flat_colors['asbestos']),
	legend.position = "bottom"
	)

	candidates <- tibble(candidate = c(
	#"Amy Klobuchar",
	#"Andrew Yang",
	"Bernard Sanders",
	"Beto O'Rourke",
	#"Bill de Blasio",
	#"Cory A. Booker",
	"Elizabeth Warren",
	#"Eric Swalwell",
	#"Jay Robert Inslee",
	#"John Hickenlooper",
	#"John K. Delaney",
	"Joseph R. Biden Jr.",
	#"Juli?n Castro",
	"Kamala D. Harris",
	#"Kirsten E. Gillibrand",
	#"Marianne Williamson",
	#"Michael F. Bennet",
	"Pete Buttigieg"
	#"Tim Ryan",
	#"Tulsi Gabbard"
	)) %>% mutate(c = 1:n())


	polls <- read_csv("https://projects.fivethirtyeight.com/polls-page/president_primary_polls.csv", guess_max = 100000) %>%
	filter(cycle == "2020", party == "DEM", is.na(state)) %>%
	dplyr::select(question_id, poll_id, start_date, sample_size, candidate_name, pct, pollster, fte_grade) %>%
	mutate(start_date = lubridate::mdy(start_date)) %>%
	rename(candidate = candidate_name) %>%
	filter(candidate %in% candidates$candidate) %>%
	#filter(lubridate::year(start_date) > 2018, lubridate::month(start_date) > 6) %>%
	left_join(candidates, by = c(candidate = "candidate")) %>%
	mutate(t = as.numeric(start_date - min(start_date)) + 1)

	polls <- polls %>%
	left_join(pollsters)

	debates <- tribble(
	~date,
	"6/26/19",
	#"6/27/19",
	"7/30/19"
	#"7/31/19"
	) %>%
	mutate(date = lubridate::mdy(date),
	t = as.numeric(date - min(polls$start_date)) + 1)

	#
	# Vectorized with shocks
	#

	shocks <- tribble(
	~date,
	"6/27/19",
	"7/31/19"
	) %>%
	mutate(date = lubridate::mdy(date),
	t = as.numeric(date - min(polls$start_date)) + 1)

	C = max(polls$c)
	T = max(polls$t)
	N = nrow(polls)
	obs = round((polls$pct / 100) * polls$sample_size)
	sample_size = polls$sample_size

	# Each timepoint should be correlated with the timepoint before it,
	# by candidates
	index1 <- matrix(1:(T*C), ncol = T, byrow = TRUE) %>%
	# Remove the shock timepoints
	.[, -(shocks$t + 1)] %>%
	# Remove the first timepoint
	.[, -1] %>%
	t() %>%
	as.vector()

	index2 <- matrix(1:(T * C), ncol = T, byrow = TRUE) %>%
	# Remove the last timepoint
	.[, -T] %>%
	# Remove the shock timepoints
	.[, -shocks$t] %>%
	t() %>%
	as.vector()

	# Y_logit[index1] should be correlated with Y_logit[index2]

	obs_index <- polls$t + (polls$c - 1) * T

	stan_data <- list(
	C = C,
	T = T,
	N = N,
	obs_index = obs_index,
	J = length(index1),
	index1 = index1,
	index2 = index2,
	sample_size = polls$sample_size,
	obs = round((polls$pct / 100) * polls$sample_size)
	)

	start.time2 <- Sys.time()
	fit2 <- sampling(model2, stan_data, iter = 1000)
	end.time2 <- Sys.time()

	Y_post <- tidybayes::spread_draws(fit2, Y[i]) %>%
	mutate(t = ((i - 1) %% T + 1), c = ceiling(i / T))

	Y_post_summarized <- Y_post %>%
	group_by(c, t) %>%
	summarize(posterior_median = quantile(Y, 0.5),
	posterior_2.5 = quantile(Y, 0.025),
	posterior_25 = quantile(Y, 0.25),
	posterior_75 = quantile(Y, 0.75),
	posterior_97.5 = quantile(Y, 0.975)) %>%
	mutate(date = min(polls$start_date) + t - 1) %>%
	left_join(candidates)

	ggplot(Y_post_summarized, aes(x = date, y = posterior_median)) +
	geom_vline(data = debates, aes(xintercept = date, color = "Democratic Debates"), lty = 2) +
	geom_ribbon(aes(ymin = posterior_2.5, ymax = posterior_97.5), alpha = 0.2, fill = "#3498db") +
	geom_ribbon(aes(ymin = posterior_25, ymax = posterior_75), alpha = 0.2, fill = "#3498db") +
	geom_point(data = polls, aes(x = start_date, y = pct / 100), size = 0.5, color = "#2c3e50") +
	geom_line(color = "#2980b9", size = 1) +
	facet_wrap(~candidate) +
	labs(x = "Date", y = "Poll result", caption = "Data: FiveThirtyEight", color = "") +
	scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
	scale_x_date() +
	scale_color_manual(values = c("#8e44ad")) +
	scale_fill_manual(values = c("#8e44ad")) +
	cowplot::theme_cowplot() +
	my_theme
	data {
	int<lower=0> T; // Number of timepoints
	int<lower=0> C; // Number of candidates
	int<lower=0> N; // Number of poll observations

	int sample_size[N]; // Sample size of each poll
	int obs[N]; // Number of respondents in poll for candidate (approximate)

	int obs_index[N];

	int J;
	int index1[J];
	int index2[J];
	}
	parameters {
	real Y_logit[T * C]; // Percent for candidate c at time t
	real<lower=0, upper=1> pct[N]; // Percent of participants in poll for candidate
	real<lower=0> tau; // Random walk variance
	real<lower=0,upper=0.5> sigma; // Overdispersion of observations
	}
	model {
	// Priors
	tau ~ normal(0, 0.2);
	sigma ~ normal(0, 1);

	// Random walk
	Y_logit[index1] ~ normal(Y_logit[index2], tau);

	// Observed data
	obs ~ binomial(sample_size, pct);
	Y_logit[obs_index] ~ normal(logit(pct), sigma);
	}
	generated quantities {
	real Y[C * T] = inv_logit(Y_logit);
	}