Skip to content

Instantly share code, notes, and snippets.

@herbps10
Created August 16, 2019 19:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save herbps10/09d41d67a6839c966a7aba0a5f343b58 to your computer and use it in GitHub Desktop.
Save herbps10/09d41d67a6839c966a7aba0a5f343b58 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(rstan)
library(extrafont)
options(mc.cores = 4)
flat_colors <- c(
asbestos = "#7f8c8d",
purple = "#8e44ad"
)
my_theme <- theme(
strip.background = element_rect(fill = "transparent"),
strip.text = element_text(color = "black", family = "Open Sans"),
axis.line = element_line(color = flat_colors['asbestos']),
axis.ticks = element_line(color = flat_colors['asbestos']),
axis.text = element_text(color = flat_colors['asbestos'], family = "Open Sans"),
axis.title = element_text(color = flat_colors['asbestos']),
legend.position = "bottom"
)
candidates <- tibble(candidate = c(
#"Amy Klobuchar",
#"Andrew Yang",
"Bernard Sanders",
"Beto O'Rourke",
#"Bill de Blasio",
#"Cory A. Booker",
"Elizabeth Warren",
#"Eric Swalwell",
#"Jay Robert Inslee",
#"John Hickenlooper",
#"John K. Delaney",
"Joseph R. Biden Jr.",
#"Juli?n Castro",
"Kamala D. Harris",
#"Kirsten E. Gillibrand",
#"Marianne Williamson",
#"Michael F. Bennet",
"Pete Buttigieg"
#"Tim Ryan",
#"Tulsi Gabbard"
)) %>% mutate(c = 1:n())
polls <- read_csv("https://projects.fivethirtyeight.com/polls-page/president_primary_polls.csv", guess_max = 100000) %>%
filter(cycle == "2020", party == "DEM", is.na(state)) %>%
dplyr::select(question_id, poll_id, start_date, sample_size, candidate_name, pct, pollster, fte_grade) %>%
mutate(start_date = lubridate::mdy(start_date)) %>%
rename(candidate = candidate_name) %>%
filter(candidate %in% candidates$candidate) %>%
#filter(lubridate::year(start_date) > 2018, lubridate::month(start_date) > 6) %>%
left_join(candidates, by = c(candidate = "candidate")) %>%
mutate(t = as.numeric(start_date - min(start_date)) + 1)
polls <- polls %>%
left_join(pollsters)
debates <- tribble(
~date,
"6/26/19",
#"6/27/19",
"7/30/19"
#"7/31/19"
) %>%
mutate(date = lubridate::mdy(date),
t = as.numeric(date - min(polls$start_date)) + 1)
#
# Vectorized with shocks
#
shocks <- tribble(
~date,
"6/27/19",
"7/31/19"
) %>%
mutate(date = lubridate::mdy(date),
t = as.numeric(date - min(polls$start_date)) + 1)
C = max(polls$c)
T = max(polls$t)
N = nrow(polls)
obs = round((polls$pct / 100) * polls$sample_size)
sample_size = polls$sample_size
# Each timepoint should be correlated with the timepoint before it,
# by candidates
index1 <- matrix(1:(T*C), ncol = T, byrow = TRUE) %>%
# Remove the shock timepoints
.[, -(shocks$t + 1)] %>%
# Remove the first timepoint
.[, -1] %>%
t() %>%
as.vector()
index2 <- matrix(1:(T * C), ncol = T, byrow = TRUE) %>%
# Remove the last timepoint
.[, -T] %>%
# Remove the shock timepoints
.[, -shocks$t] %>%
t() %>%
as.vector()
# Y_logit[index1] should be correlated with Y_logit[index2]
obs_index <- polls$t + (polls$c - 1) * T
stan_data <- list(
C = C,
T = T,
N = N,
obs_index = obs_index,
J = length(index1),
index1 = index1,
index2 = index2,
sample_size = polls$sample_size,
obs = round((polls$pct / 100) * polls$sample_size)
)
start.time2 <- Sys.time()
fit2 <- sampling(model2, stan_data, iter = 1000)
end.time2 <- Sys.time()
Y_post <- tidybayes::spread_draws(fit2, Y[i]) %>%
mutate(t = ((i - 1) %% T + 1), c = ceiling(i / T))
Y_post_summarized <- Y_post %>%
group_by(c, t) %>%
summarize(posterior_median = quantile(Y, 0.5),
posterior_2.5 = quantile(Y, 0.025),
posterior_25 = quantile(Y, 0.25),
posterior_75 = quantile(Y, 0.75),
posterior_97.5 = quantile(Y, 0.975)) %>%
mutate(date = min(polls$start_date) + t - 1) %>%
left_join(candidates)
ggplot(Y_post_summarized, aes(x = date, y = posterior_median)) +
geom_vline(data = debates, aes(xintercept = date, color = "Democratic Debates"), lty = 2) +
geom_ribbon(aes(ymin = posterior_2.5, ymax = posterior_97.5), alpha = 0.2, fill = "#3498db") +
geom_ribbon(aes(ymin = posterior_25, ymax = posterior_75), alpha = 0.2, fill = "#3498db") +
geom_point(data = polls, aes(x = start_date, y = pct / 100), size = 0.5, color = "#2c3e50") +
geom_line(color = "#2980b9", size = 1) +
facet_wrap(~candidate) +
labs(x = "Date", y = "Poll result", caption = "Data: FiveThirtyEight", color = "") +
scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
scale_x_date() +
scale_color_manual(values = c("#8e44ad")) +
scale_fill_manual(values = c("#8e44ad")) +
cowplot::theme_cowplot() +
my_theme
data {
int<lower=0> T; // Number of timepoints
int<lower=0> C; // Number of candidates
int<lower=0> N; // Number of poll observations
int sample_size[N]; // Sample size of each poll
int obs[N]; // Number of respondents in poll for candidate (approximate)
int obs_index[N];
int J;
int index1[J];
int index2[J];
}
parameters {
real Y_logit[T * C]; // Percent for candidate c at time t
real<lower=0, upper=1> pct[N]; // Percent of participants in poll for candidate
real<lower=0> tau; // Random walk variance
real<lower=0,upper=0.5> sigma; // Overdispersion of observations
}
model {
// Priors
tau ~ normal(0, 0.2);
sigma ~ normal(0, 1);
// Random walk
Y_logit[index1] ~ normal(Y_logit[index2], tau);
// Observed data
obs ~ binomial(sample_size, pct);
Y_logit[obs_index] ~ normal(logit(pct), sigma);
}
generated quantities {
real Y[C * T] = inv_logit(Y_logit);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment