Skip to content

Instantly share code, notes, and snippets.

@herbps10 herbps10/primary_polling.R
Last active Aug 10, 2019

Embed
What would you like to do?
library(tidyverse)
library(extrafont)
library(rstan)
options(mc.cores = 4)
# Color scheme (flatuicolors.com)
flat_colors <- c(
gray = "#7f8c8d",
purple = "#8e44ad"
)
custom_theme <-
cowplot::theme_cowplot() +
theme(strip.background = element_rect(fill = "transparent"),
strip.text = element_text(color = "black", family = "Open Sans"),
axis.line = element_line(color = flat_colors['gray']),
axis.ticks = element_line(color = flat_colors['gray']),
axis.text = element_text(color = flat_colors['gray'], family = "Open Sans"),
axis.title = element_text(color = flat_colors['gray']),
legend.position = "bottom")
# Create a numeric index for each candidate
candidates <- tibble(candidate = c(
"Bernard Sanders",
"Beto O'Rourke",
"Elizabeth Warren",
"Joseph R. Biden Jr.",
"Kamala D. Harris",
"Pete Buttigieg"
)) %>% mutate(c = 1:n())
data_url <-
"https://projects.fivethirtyeight.com/polls-page/president_primary_polls.csv"
# Read poll data from FiveThirtyEight
polls <- read_csv(data_url, guess_max = 10000) %>%
rename(candidate = candidate_name) %>%
# Filter for national polls of 2020 Democraticprimary
filter(cycle == "2020", party == "DEM", is.na(state)) %>%
# Filter for subset of candidates
filter(candidate %in% candidates$candidate) %>%
# Parse poll start time into date
mutate(start_date = lubridate::mdy(start_date)) %>%
# Join in candidate index
left_join(candidates, by = c(candidate = "candidate")) %>%
# Create time point index
mutate(t = as.numeric(start_date - min(start_date)) + 1)
debates <- tribble(
~date,
"6/26/19",
"7/30/19"
) %>%
mutate(date = lubridate::mdy(date),
t = as.numeric(date - min(polls$start_date)) + 1)
ggplot(polls, aes(x = start_date, y = pct)) +
geom_point(size = 0.5) +
scale_x_date() +
geom_vline(data = debates, aes(xintercept = date), lty = 2) +
facet_wrap(~candidate) +
labs(x = "Date", y = "Poll result", caption = "Data: FiveThirtyEight", color = "") +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
scale_x_date() +
scale_color_manual(values = c("#8e44ad")) +
scale_fill_manual(values = c("#8e44ad")) +
custom_theme
stan_data <- list(
C = max(polls$c),
T = max(polls$t),
N = nrow(polls),
y = round((polls$pct / 100) * polls$sample_size),
sample_size = polls$sample_size,
get_t_i = polls$t,
get_c_i = polls$c,
get_p_i = polls$p
)
fit <- sampling(model, stan_data, iter = 1000, control = list(adapt_delta = 0.8, max_treedepth = 8))
delta_post <- tidybayes::spread_draws(fit, delta[c, t]) %>%
left_join(candidates)
delta_post_summarized <- delta_post %>%
group_by(c, candidate, t) %>%
summarize(posterior_median = quantile(delta, 0.5),
posterior_2.5 = quantile(delta, 0.025),
posterior_25 = quantile(delta, 0.25),
posterior_75 = quantile(delta, 0.75),
posterior_97.5 = quantile(delta, 0.975))
delta_post_summarized %>%
# Convert time index back into date
mutate(date = min(polls$start_date) + t - 1) %>%
ggplot() +
# Mark the democratic primary dates
geom_vline(data = debates, aes(xintercept = date, color = "Democratic Debates"), lty = 2) +
# 75% and 95% credibility intervals
geom_ribbon(aes(x = date, y = posterior_median * 100, ymin = posterior_2.5 * 100, ymax = posterior_97.5 * 100),
alpha = 0.25,
fill = "#3498db") +
geom_ribbon(aes(x = date, y = posterior_median * 100, ymin = posterior_25 * 100, ymax = posterior_75 * 100),
alpha = 0.25,
fill = "#3498db") +
# Observed data points
geom_point(data = polls, aes(x = start_date, y = pct), color = "#2c3e50", alpha = 1, shape = 19, size = 0.5) +
# Posterior median
geom_line(aes(x = date, y = posterior_median * 100), color = "#2980b9", size = 1) +
# Separate panel for each candidate
facet_wrap(~candidate, scales = "free") +
# Formatting
labs(x = "Date", y = "Poll result", caption = "Data: FiveThirtyEight", color = "") +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
scale_x_date() +
scale_color_manual(values = c("#8e44ad")) +
scale_fill_manual(values = c("#8e44ad"))
data {
int<lower=0> T; // Number of timepoints
int<lower=0> C; // Number of candidates
int<lower=0> N; // Number of poll yervations
int sample_size[N]; // Sample size of each poll
int y[N]; // Number of respondents in poll for candidate (approximate)
int<lower=1, upper=T> get_t_i[N]; // timepoint for ith observation
int<lower=1, upper=C> get_c_i[N]; // candidate for ith observation
}
parameters {
matrix[C, T] delta_logit; // Percent for candidate c at time t
real<lower=0, upper=1> phi[N]; // Percent of participants in poll for candidate
real<lower=0> tau; // Random walk variance
real<lower=0,upper=0.5> sigma; // Overdispersion of observations
}
model {
// Priors
tau ~ normal(0, 0.2);
sigma ~ normal(0, 1);
// Random walk
for(c in 1:C) {
delta_logit[c, 2:T] ~ normal(delta_logit[c, 1:(T - 1)], tau);
}
// Observed data
y ~ binomial(sample_size, phi);
for(i in 1:N) {
// Overdispersion
delta_logit[get_c_i[i], get_t_i[i]] ~ normal(logit(phi[i]), sigma);
}
}
generated quantities {
matrix[C, T] delta = inv_logit(delta_logit);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.