California 2021 recall polling model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
end_date | pollster | n | population | pid_weighted | keep | remove | |
---|---|---|---|---|---|---|---|
2021-09-13 | survey monkey | 3985 | LV | 1 | 0.55 | 0.41 | |
2021-09-13 | trafalgar | 1082 | LV | 0 | 0.53 | 0.45 | |
2021-09-11 | emerson | 1000 | LV | 1 | 0.6 | 0.4 | |
2021-09-10 | data for progress | 2464 | LV | 1 | 0.57 | 0.43 | |
2021-09-08 | surveyusa | 930 | LV | 0 | 0.54 | 0.41 | |
2021-09-07 | suffolk | 500 | LV | 0 | 0.58 | 0.41 | |
2021-09-06 | uc berkeley | 6550 | LV | 0 | 0.6 | 0.39 | |
2021-09-04 | trafalgar | 1079 | LV | 0 | 0.53 | 0.43 | |
2021-09-01 | yougov | 1955 | LV | 1 | 0.56 | 0.44 | |
2021-08-29 | trafalgar | 1088 | LV | 0 | 0.52 | 0.44 | |
2021-08-29 | public policy institute of california | 1080 | LV | 0 | 0.58 | 0.39 | |
2021-08-28 | surveyusa | 816 | LV | 0 | 0.51 | 0.43 | |
2021-08-27 | gravis marketing | 729 | LV | 0 | 0.49 | 0.44 | |
2021-08-25 | targoz | 787 | LV | 1 | 0.52 | 0.42 | |
2021-08-25 | change research | 782 | LV | 1 | 0.57 | 0.42 | |
2021-08-22 | redfield & wilton | 1000 | LV | 1 | 0.51 | 0.43 | |
2021-08-12 | yougov | 1534 | LV | 1 | 0.52 | 0.48 | |
2021-08-04 | surveyusa | 613 | LV | 0 | 0.4 | 0.51 | |
2021-08-21 | emerson | 1000 | LV | 0 | 0.48 | 0.46 | |
2021-08-29 | core decision analytics | 731 | LV | 0 | 0.51 | 0.44 | |
2021-08-24 | uc berkeley | 2783 | LV | 0 | 0.5 | 0.47 | |
2021-07-20 | emerson | 1085 | RV | 0 | 0.48 | 0.43 | |
2021-06-16 | change research | 1085 | RV | 1 | 0.54 | 0.4 | |
2021-06-03 | moore information | 682 | LV | 0 | 0.46 | 0.49 | |
2021-05-30 | tulchin research | 1168 | LV | 0 | 0.52 | 0.38 | |
2021-05-18 | public policy institute of california | 1074 | LV | 0 | 0.57 | 0.4 | |
2021-05-05 | uc berkeley | 10289 | RV | 0 | 0.49 | 0.36 | |
2021-05-02 | surveyusa | 642 | RV | 0 | 0.47 | 0.36 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(lubridate) | |
library(brms) | |
polls = read_csv('~/Downloads/ca_polls - Sheet1.csv') %>% | |
mutate(end_date = ymd(end_date), | |
end_date.n = as.numeric(end_date), | |
weight = sqrt(n/mean(n))) | |
polls2 = polls %>% | |
mutate(undecided = pmax(0.1, (1 - (keep + remove))), | |
total = keep + remove, | |
keep = keep / total, | |
remove = remove / total) | |
polls = polls %>% | |
mutate(outcome = (polls %>% | |
select(keep,remove) %>% | |
mutate(undecided = pmax(0.01, (1 - (keep + remove))), | |
total = keep + remove + undecided) %>% | |
mutate(keep = keep / total, | |
remove = remove / total, | |
undecided = undecided / total) %>% | |
select(-total) %>% | |
as.matrix())) | |
# dirichlet models -------------------------------------------------------- | |
model = brm(outcome ~ poly(end_date.n,degree=4) + (1|pollster) + pid_weighted, | |
data = polls, | |
family = dirichlet(refcat='undecided'), | |
prior = c( | |
set_prior(prior='normal(75, 10)',class='phi'), | |
set_prior(prior = 'normal(2.0,0.1)', class = 'Intercept', dpar = 'mukeep'), | |
set_prior(prior = 'normal(1.9,0.1)', class = 'Intercept', dpar = 'muremove'), | |
set_prior(prior = 'normal(0.23,0.1)', class = 'b', coef = 'pid_weighted',dpar = 'mukeep'), | |
set_prior(prior = 'normal(0.07,0.1)', class = 'b', coef = 'pid_weighted',dpar = 'muremove'), | |
set_prior(prior = 'normal(1.26,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ41',dpar = 'mukeep'), | |
set_prior(prior = 'normal(-0.18,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ42',dpar = 'mukeep'), | |
set_prior(prior = 'normal(0.52,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ43',dpar = 'mukeep'), | |
set_prior(prior = 'normal(-0.63,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ44',dpar = 'mukeep'), | |
set_prior(prior = 'normal(1.55,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ41',dpar = 'muremove'), | |
set_prior(prior = 'normal(-0.50,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ42',dpar = 'muremove'), | |
set_prior(prior = 'normal(0.38,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ43',dpar = 'muremove'), | |
set_prior(prior = 'normal(-0.62,0.5)', class = 'b', coef = 'polyend_date.ndegreeEQ44',dpar = 'muremove'), | |
set_prior(prior = 'normal(0,0.05)', class = 'sd',dpar = 'mukeep'), | |
set_prior(prior = 'normal(0,0.05)', class = 'sd',dpar = 'muremove') | |
), | |
backend = 'cmdstan', | |
chains = 4, cores = 4, | |
warmup = 1000, iter = 50000, | |
control = list(adapt_delta = 0.99, max_treedepth = 11)) | |
model | |
model2 = brm(outcome ~ end_date.n + s(end_date.n) + (1|pollster) + pid_weighted, | |
data = polls, | |
family = dirichlet(refcat='undecided'), | |
prior = c( | |
set_prior(prior='normal(75, 10)',class='phi'), | |
set_prior(prior = 'normal(2.0,0.1)', class = 'Intercept', dpar = 'mukeep'), | |
set_prior(prior = 'normal(1.9,0.1)', class = 'Intercept', dpar = 'muremove'), | |
set_prior(prior = 'normal(0.23,0.1)', class = 'b', coef = 'pid_weighted',dpar = 'mukeep'), | |
set_prior(prior = 'normal(0.07,0.1)', class = 'b', coef = 'pid_weighted',dpar = 'muremove'), | |
set_prior(prior = 'normal(0,0.01)', class = 'b', coef = 'end_date.n',dpar = 'mukeep'), | |
set_prior(prior = 'normal(0,0.01)', class = 'b', coef = 'end_date.n',dpar = 'muremove'), | |
set_prior(prior = 'normal(0,0.05)', class = 'sd',dpar = 'mukeep'), | |
set_prior(prior = 'normal(0,0.05)', class = 'sd',dpar = 'muremove'), | |
set_prior(prior = 'normal(0,1.5)', class = 'sds',dpar = 'mukeep'), | |
set_prior(prior = 'normal(0,1.5)', class = 'sds',dpar = 'muremove') | |
), | |
backend = 'cmdstan', | |
chains = 4, cores = 4, | |
warmup = 1000, iter = 50000, | |
control = list(adapt_delta = 0.99, max_treedepth = 11)) | |
model2 | |
# extract and graph | |
predictions_df = tibble(end_date.n = seq(min(polls$end_date.n),max(polls$end_date.n),1), | |
pid_weighted = 1) %>% | |
bind_rows( tibble(end_date.n = seq(min(polls$end_date.n),max(polls$end_date.n),1), | |
pid_weighted = 0) ) | |
predictions_out = predict(object = model2, | |
newdata = predictions_df, | |
re_formula = NA) | |
predictions_df$keep_mu = predictions_out[,,'keep'][,1] | |
predictions_df$keep_se = predictions_out[,,'keep'][,2] | |
predictions_df$remove_mu = predictions_out[,,'remove'][,1] | |
predictions_df$remove_se = predictions_out[,,'remove'][,2] | |
# plot | |
predictions_df = predictions_df %>% | |
left_join(polls %>% | |
select(-pid_weighted), by = c("end_date.n")) %>% | |
mutate(end_date = as_date(end_date.n)) %>% | |
mutate(pid_weighted = ifelse(pid_weighted == 1, 'Weighted by party/past vote','Not weighted by party/past vote'), | |
undecided = 0) | |
ggplot(predictions_df, aes(x=end_date)) + | |
# polls as points | |
geom_point(aes(y = keep,col='Keep Newsom'),shape=1, show.legend=F) + | |
geom_point(aes(y = remove,col='Remove'),shape=1, show.legend=F) + | |
# trends as lines | |
geom_line(aes(y = keep_mu,col='Keep Newsom'), show.legend=F) + | |
geom_line(aes(y = remove_mu,col='Remove'), show.legend=F) + | |
# uncertainty intervals | |
geom_ribbon(aes(ymax = keep_mu + keep_se*1.65, | |
ymin = keep_mu - keep_se*1.65,fill='Keep Newsom'), | |
col = NA, alpha=0.3) + | |
geom_ribbon(aes(ymax = remove_mu + remove_se*1.65, | |
ymin = remove_mu - remove_se*1.65,fill='Remove'), | |
col = NA, alpha=0.3) + | |
# label | |
geom_label(data = predictions_df %>% filter(end_date == max(end_date)), | |
aes(x = end_date + 5, y = keep_mu, label = paste0(round(keep_mu*100),'%'), col = 'Keep Newsom'),show.legend = F) + | |
geom_label(data = predictions_df %>% filter(end_date == max(end_date)), | |
aes(x = end_date + 5, y = remove_mu, label = paste0(round(remove_mu*100),'%'), col = 'Remove'),show.legend = F) + | |
# settings | |
facet_wrap(~pid_weighted) + | |
scale_y_continuous(breaks=seq(0,1,0.05),labels=function(x){round(x*100)}) + | |
theme_minimal() + | |
theme(legend.position = 'top', | |
panel.grid.minor = element_blank()) + | |
labs(x='Date', | |
y='%', | |
col = "", | |
fill = '', | |
subtitle = "Polling average for the 2021 California recall election if you adjust polls for potential partisan non-response*\nNot allocating undecideds", | |
caption = '*The adjustment model takes the systematic difference in party-weighted and non-party-weighted polls and adjusts unweighted data') | |
# normal model -------------------------------------------------------------- | |
polls2 = polls2 %>% mutate(pid_weighted.fac = factor(pid_weighted)) | |
model3 = brm(keep ~ s(end_date.n, k=5) + (1|pollster) + pid_weighted + undecided + s(end_date.n, by=pid_weighted.fac, k=5), | |
data = polls2, | |
family = 'beta', | |
prior = c( | |
set_prior(prior='normal(350, 100)',class='phi'), | |
set_prior(prior = 'normal(0,0.01)', class = 'sd'), | |
set_prior(prior = 'normal(0.2,0.1)', class = 'Intercept'), | |
set_prior(prior = 'normal(0.10,0.05)', class = 'b', coef = 'pid_weighted'), | |
set_prior(prior = 'normal(-0.02,0.1)', class = 'b', coef = 'undecided'), | |
set_prior(prior = 'normal(0.4,0.7)', class = 'b', coef = 'send_date.n_1'), | |
set_prior(prior = 'normal(1.5,0.7)', class = 'b', coef = 'send_date.n:pid_weighted.fac0_1'), | |
set_prior(prior = 'normal(1.0,0.7)', class = 'b', coef = 'send_date.n:pid_weighted.fac1_1'), | |
set_prior(prior = 'normal(1.0,0.5)', class = 'sds', coef = 's(end_date.n, k = 5)'), | |
set_prior(prior = 'normal(1.5,0.6)', class = 'sds', coef = 's(end_date.n, by = pid_weighted.fac, k = 5)') | |
), | |
backend = 'cmdstan', | |
chains = 4, cores = 4, | |
warmup = 1000, iter = 20000, | |
control = list(adapt_delta = 0.99, max_treedepth = 11)) | |
model3 | |
# get predictions | |
predictions_df2 = | |
tibble(end_date.n = seq(min(polls$end_date.n),max(polls$end_date.n),1), | |
pid_weighted = 1) %>% | |
bind_rows( tibble(end_date.n = seq(min(polls$end_date.n),max(polls$end_date.n),1), | |
pid_weighted = 0) ) %>% | |
mutate(undecided = 0, | |
pid_weighted.fac = factor(pid_weighted)) | |
predictions_out2 = predict(object = model3, | |
newdata = predictions_df2, | |
re_formula = NA) | |
predictions_df2$keep_mu = predictions_out2[,1] | |
predictions_df2$keep_se = predictions_out2[,2] | |
predictions_df2$remove_mu = 1 - predictions_df2$keep_mu | |
predictions_df2$remove_se = predictions_df2$keep_se | |
# plot | |
predictions_df2 = predictions_df2 %>% | |
left_join(polls %>% | |
select(-pid_weighted), by = c("end_date.n")) %>% | |
mutate(end_date = as_date(end_date.n)) %>% | |
mutate(pid_weighted = ifelse(pid_weighted == 1, 'Weighted by party/past vote','Not weighted by party/past vote'), | |
undecided = 0) %>% | |
mutate(tot = keep + remove, | |
keep = keep/tot, | |
remove = remove / tot) | |
ggplot(predictions_df2, aes(x=end_date)) + | |
# polls as points | |
geom_point(aes(y = keep,col='Keep Newsom'),shape=1, show.legend=F) + | |
geom_point(aes(y = remove,col='Remove'),shape=1, show.legend=F) + | |
# trends as lines | |
geom_line(aes(y = keep_mu,col='Keep Newsom'), show.legend=F) + | |
geom_line(aes(y = remove_mu,col='Remove'), show.legend=F) + | |
# uncertainty intervals | |
geom_ribbon(aes(ymax = keep_mu + keep_se*1.65, | |
ymin = keep_mu - keep_se*1.65,fill='Keep Newsom'), | |
col = NA, alpha=0.3) + | |
geom_ribbon(aes(ymax = remove_mu + remove_se*1.65, | |
ymin = remove_mu - remove_se*1.65,fill='Remove'), | |
col = NA, alpha=0.3) + | |
# label | |
geom_label(data = predictions_df2 %>% filter(end_date == max(end_date)), | |
aes(x = end_date + 5, y = keep_mu, label = paste0(round(keep_mu*100),'%'), col = 'Keep Newsom'),show.legend = F) + | |
geom_label(data = predictions_df2 %>% filter(end_date == max(end_date)), | |
aes(x = end_date + 5, y = remove_mu, label = paste0(round(remove_mu*100),'%'), col = 'Remove'),show.legend = F) + | |
# settings | |
facet_wrap(~pid_weighted) + | |
scale_y_continuous(breaks=seq(0,1,0.05),labels=function(x){round(x*100)}) + | |
theme_minimal() + | |
theme(legend.position = 'top', | |
panel.grid.minor = element_blank()) + | |
labs(x='Date', | |
y='%', | |
col = "", | |
fill = '', | |
subtitle = "Polling average for the 2021 California recall election if you adjust polls for potential partisan non-response*\nLetting the model allocate undecideds", | |
caption = '*The adjustment model takes the systematic difference in party-weighted and non-party-weighted polls and adjusts unweighted data') | |
predict(object = model3, | |
newdata = tibble(end_date.n = seq(min(polls$end_date.n),max(polls$end_date.n),1), | |
pid_weighted = 1, | |
undecided = 0) %>% | |
mutate(pid_weighted.fac = factor(pid_weighted)), | |
re_formula = NA, | |
type = 'response') | |
beepr::beep(2) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment