Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
California 2021 recall polling model
end_date pollster n population pid_weighted keep remove
2021-09-13 survey monkey 3985 LV 1 0.55 0.41
2021-09-13 trafalgar 1082 LV 0 0.53 0.45
2021-09-11 emerson 1000 LV 1 0.6 0.4
2021-09-10 data for progress 2464 LV 1 0.57 0.43
2021-09-08 surveyusa 930 LV 0 0.54 0.41
2021-09-07 suffolk 500 LV 0 0.58 0.41
2021-09-06 uc berkeley 6550 LV 0 0.6 0.39
2021-09-04 trafalgar 1079 LV 0 0.53 0.43
2021-09-01 yougov 1955 LV 1 0.56 0.44
2021-08-29 trafalgar 1088 LV 0 0.52 0.44
2021-08-29 public policy institute of california 1080 LV 0 0.58 0.39
2021-08-28 surveyusa 816 LV 0 0.51 0.43
2021-08-27 gravis marketing 729 LV 0 0.49 0.44
2021-08-25 targoz 787 LV 1 0.52 0.42
2021-08-25 change research 782 LV 1 0.57 0.42
2021-08-22 redfield & wilton 1000 LV 1 0.51 0.43
2021-08-12 yougov 1534 LV 1 0.52 0.48
2021-08-04 surveyusa 613 LV 0 0.4 0.51
2021-08-21 emerson 1000 LV 0 0.48 0.46
2021-08-29 core decision analytics 731 LV 0 0.51 0.44
2021-08-24 uc berkeley 2783 LV 0 0.5 0.47
2021-07-20 emerson 1085 RV 0 0.48 0.43
2021-06-16 change research 1085 RV 1 0.54 0.4
2021-06-03 moore information 682 LV 0 0.46 0.49
2021-05-30 tulchin research 1168 LV 0 0.52 0.38
2021-05-18 public policy institute of california 1074 LV 0 0.57 0.4
2021-05-05 uc berkeley 10289 RV 0 0.49 0.36
2021-05-02 surveyusa 642 RV 0 0.47 0.36
library(tidyverse)
library(lubridate)
library(brms)
polls = read_csv('~/Downloads/ca_polls - Sheet1.csv') %>%
mutate(end_date = ymd(end_date),
end_date.n = as.numeric(end_date),
weight = sqrt(n/mean(n)))
polls2 = polls %>%
mutate(undecided = pmax(0.1, (1 - (keep + remove))),
total = keep + remove,
keep = keep / total,
remove = remove / total)
polls = polls %>%
mutate(outcome = (polls %>%
select(keep,remove) %>%
mutate(undecided = pmax(0.01, (1 - (keep + remove))),
total = keep + remove + undecided) %>%
mutate(keep = keep / total,
remove = remove / total,
undecided = undecided / total) %>%
select(-total) %>%
as.matrix()))
# dirichlet models --------------------------------------------------------
model = brm(outcome ~ poly(end_date.n,degree=4) + (1|pollster) + pid_weighted,
data = polls,
family = dirichlet(refcat='undecided'),
prior = c(
set_prior(prior='normal(75, 10)',class='phi'),
set_prior(prior = 'normal(2.0,0.1)', class = 'Intercept', dpar = 'mukeep'),
set_prior(prior = 'normal(1.9,0.1)', class = 'Intercept', dpar = 'muremove'),
set_prior(prior = 'normal(0.23,0.1)', class = 'b', coef = 'pid_weighted',dpar = 'mukeep'),
set_prior(prior = 'normal(0.07,0.1)', class = 'b', coef = 'pid_weighted',dpar = 'muremove'),
set_prior(prior = 'normal(1.26,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ41',dpar = 'mukeep'),
set_prior(prior = 'normal(-0.18,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ42',dpar = 'mukeep'),
set_prior(prior = 'normal(0.52,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ43',dpar = 'mukeep'),
set_prior(prior = 'normal(-0.63,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ44',dpar = 'mukeep'),
set_prior(prior = 'normal(1.55,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ41',dpar = 'muremove'),
set_prior(prior = 'normal(-0.50,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ42',dpar = 'muremove'),
set_prior(prior = 'normal(0.38,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ43',dpar = 'muremove'),
set_prior(prior = 'normal(-0.62,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ44',dpar = 'muremove'),
set_prior(prior = 'normal(0,0.05)', class = 'sd',dpar = 'mukeep'),
set_prior(prior = 'normal(0,0.05)', class = 'sd',dpar = 'muremove')
),
backend = 'cmdstan',
chains = 4, cores = 4,
warmup = 1000, iter = 50000,
control = list(adapt_delta = 0.99, max_treedepth = 11))
model
model2 = brm(outcome ~ end_date.n + s(end_date.n) + (1|pollster) + pid_weighted,
data = polls,
family = dirichlet(refcat='undecided'),
prior = c(
set_prior(prior='normal(75, 10)',class='phi'),
set_prior(prior = 'normal(2.0,0.1)', class = 'Intercept', dpar = 'mukeep'),
set_prior(prior = 'normal(1.9,0.1)', class = 'Intercept', dpar = 'muremove'),
set_prior(prior = 'normal(0.23,0.1)', class = 'b', coef = 'pid_weighted',dpar = 'mukeep'),
set_prior(prior = 'normal(0.07,0.1)', class = 'b', coef = 'pid_weighted',dpar = 'muremove'),
set_prior(prior = 'normal(0,0.01)', class = 'b', coef = 'end_date.n',dpar = 'mukeep'),
set_prior(prior = 'normal(0,0.01)', class = 'b', coef = 'end_date.n',dpar = 'muremove'),
set_prior(prior = 'normal(0,0.05)', class = 'sd',dpar = 'mukeep'),
set_prior(prior = 'normal(0,0.05)', class = 'sd',dpar = 'muremove'),
set_prior(prior = 'normal(0,1.5)', class = 'sds',dpar = 'mukeep'),
set_prior(prior = 'normal(0,1.5)', class = 'sds',dpar = 'muremove')
),
backend = 'cmdstan',
chains = 4, cores = 4,
warmup = 1000, iter = 50000,
control = list(adapt_delta = 0.99, max_treedepth = 11))
model2
# extract and graph
predictions_df = tibble(end_date.n = seq(min(polls$end_date.n),max(polls$end_date.n),1),
pid_weighted = 1) %>%
bind_rows( tibble(end_date.n = seq(min(polls$end_date.n),max(polls$end_date.n),1),
pid_weighted = 0) )
predictions_out = predict(object = model2,
newdata = predictions_df,
re_formula = NA)
predictions_df$keep_mu = predictions_out[,,'keep'][,1]
predictions_df$keep_se = predictions_out[,,'keep'][,2]
predictions_df$remove_mu = predictions_out[,,'remove'][,1]
predictions_df$remove_se = predictions_out[,,'remove'][,2]
# plot
predictions_df = predictions_df %>%
left_join(polls %>%
select(-pid_weighted), by = c("end_date.n")) %>%
mutate(end_date = as_date(end_date.n)) %>%
mutate(pid_weighted = ifelse(pid_weighted == 1, 'Weighted by party/past vote','Not weighted by party/past vote'),
undecided = 0)
ggplot(predictions_df, aes(x=end_date)) +
# polls as points
geom_point(aes(y = keep,col='Keep Newsom'),shape=1, show.legend=F) +
geom_point(aes(y = remove,col='Remove'),shape=1, show.legend=F) +
# trends as lines
geom_line(aes(y = keep_mu,col='Keep Newsom'), show.legend=F) +
geom_line(aes(y = remove_mu,col='Remove'), show.legend=F) +
# uncertainty intervals
geom_ribbon(aes(ymax = keep_mu + keep_se*1.65,
ymin = keep_mu - keep_se*1.65,fill='Keep Newsom'),
col = NA, alpha=0.3) +
geom_ribbon(aes(ymax = remove_mu + remove_se*1.65,
ymin = remove_mu - remove_se*1.65,fill='Remove'),
col = NA, alpha=0.3) +
# label
geom_label(data = predictions_df %>% filter(end_date == max(end_date)),
aes(x = end_date + 5, y = keep_mu, label = paste0(round(keep_mu*100),'%'), col = 'Keep Newsom'),show.legend = F) +
geom_label(data = predictions_df %>% filter(end_date == max(end_date)),
aes(x = end_date + 5, y = remove_mu, label = paste0(round(remove_mu*100),'%'), col = 'Remove'),show.legend = F) +
# settings
facet_wrap(~pid_weighted) +
scale_y_continuous(breaks=seq(0,1,0.05),labels=function(x){round(x*100)}) +
theme_minimal() +
theme(legend.position = 'top',
panel.grid.minor = element_blank()) +
labs(x='Date',
y='%',
col = "",
fill = '',
subtitle = "Polling average for the 2021 California recall election if you adjust polls for potential partisan non-response*\nNot allocating undecideds",
caption = '*The adjustment model takes the systematic difference in party-weighted and non-party-weighted polls and adjusts unweighted data')
# normal model --------------------------------------------------------------
polls2 = polls2 %>% mutate(pid_weighted.fac = factor(pid_weighted))
model3 = brm(keep ~ s(end_date.n, k=5) + (1|pollster) + pid_weighted + undecided + s(end_date.n, by=pid_weighted.fac, k=5),
data = polls2,
family = 'beta',
prior = c(
set_prior(prior='normal(350, 100)',class='phi'),
set_prior(prior = 'normal(0,0.01)', class = 'sd'),
set_prior(prior = 'normal(0.2,0.1)', class = 'Intercept'),
set_prior(prior = 'normal(0.10,0.05)', class = 'b', coef = 'pid_weighted'),
set_prior(prior = 'normal(-0.02,0.1)', class = 'b', coef = 'undecided'),
set_prior(prior = 'normal(0.4,0.7)', class = 'b', coef = 'send_date.n_1'),
set_prior(prior = 'normal(1.5,0.7)', class = 'b', coef = 'send_date.n:pid_weighted.fac0_1'),
set_prior(prior = 'normal(1.0,0.7)', class = 'b', coef = 'send_date.n:pid_weighted.fac1_1'),
set_prior(prior = 'normal(1.0,0.5)', class = 'sds', coef = 's(end_date.n, k = 5)'),
set_prior(prior = 'normal(1.5,0.6)', class = 'sds', coef = 's(end_date.n, by = pid_weighted.fac, k = 5)')
),
backend = 'cmdstan',
chains = 4, cores = 4,
warmup = 1000, iter = 20000,
control = list(adapt_delta = 0.99, max_treedepth = 11))
model3
# get predictions
predictions_df2 =
tibble(end_date.n = seq(min(polls$end_date.n),max(polls$end_date.n),1),
pid_weighted = 1) %>%
bind_rows( tibble(end_date.n = seq(min(polls$end_date.n),max(polls$end_date.n),1),
pid_weighted = 0) ) %>%
mutate(undecided = 0,
pid_weighted.fac = factor(pid_weighted))
predictions_out2 = predict(object = model3,
newdata = predictions_df2,
re_formula = NA)
predictions_df2$keep_mu = predictions_out2[,1]
predictions_df2$keep_se = predictions_out2[,2]
predictions_df2$remove_mu = 1 - predictions_df2$keep_mu
predictions_df2$remove_se = predictions_df2$keep_se
# plot
predictions_df2 = predictions_df2 %>%
left_join(polls %>%
select(-pid_weighted), by = c("end_date.n")) %>%
mutate(end_date = as_date(end_date.n)) %>%
mutate(pid_weighted = ifelse(pid_weighted == 1, 'Weighted by party/past vote','Not weighted by party/past vote'),
undecided = 0) %>%
mutate(tot = keep + remove,
keep = keep/tot,
remove = remove / tot)
ggplot(predictions_df2, aes(x=end_date)) +
# polls as points
geom_point(aes(y = keep,col='Keep Newsom'),shape=1, show.legend=F) +
geom_point(aes(y = remove,col='Remove'),shape=1, show.legend=F) +
# trends as lines
geom_line(aes(y = keep_mu,col='Keep Newsom'), show.legend=F) +
geom_line(aes(y = remove_mu,col='Remove'), show.legend=F) +
# uncertainty intervals
geom_ribbon(aes(ymax = keep_mu + keep_se*1.65,
ymin = keep_mu - keep_se*1.65,fill='Keep Newsom'),
col = NA, alpha=0.3) +
geom_ribbon(aes(ymax = remove_mu + remove_se*1.65,
ymin = remove_mu - remove_se*1.65,fill='Remove'),
col = NA, alpha=0.3) +
# label
geom_label(data = predictions_df2 %>% filter(end_date == max(end_date)),
aes(x = end_date + 5, y = keep_mu, label = paste0(round(keep_mu*100),'%'), col = 'Keep Newsom'),show.legend = F) +
geom_label(data = predictions_df2 %>% filter(end_date == max(end_date)),
aes(x = end_date + 5, y = remove_mu, label = paste0(round(remove_mu*100),'%'), col = 'Remove'),show.legend = F) +
# settings
facet_wrap(~pid_weighted) +
scale_y_continuous(breaks=seq(0,1,0.05),labels=function(x){round(x*100)}) +
theme_minimal() +
theme(legend.position = 'top',
panel.grid.minor = element_blank()) +
labs(x='Date',
y='%',
col = "",
fill = '',
subtitle = "Polling average for the 2021 California recall election if you adjust polls for potential partisan non-response*\nLetting the model allocate undecideds",
caption = '*The adjustment model takes the systematic difference in party-weighted and non-party-weighted polls and adjusts unweighted data')
predict(object = model3,
newdata = tibble(end_date.n = seq(min(polls$end_date.n),max(polls$end_date.n),1),
pid_weighted = 1,
undecided = 0) %>%
mutate(pid_weighted.fac = factor(pid_weighted)),
re_formula = NA,
type = 'response')
beepr::beep(2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment