Skip to content

Instantly share code, notes, and snippets.

@elliottmorris
Last active August 16, 2020 15:16
Show Gist options
  • Save elliottmorris/ae1747e1bbb2fc795e49164df4c85987 to your computer and use it in GitHub Desktop.
Save elliottmorris/ae1747e1bbb2fc795e49164df4c85987 to your computer and use it in GitHub Desktop.
Code to make a chart that compares Biden's polling numbers to Hillary Clinton's 2016 performance
library(tidyverse)
library(janitor)
library(lubridate)
library(zoo)
library(politicaldata)
RUN_DATE <- Sys.Date()
start_date <- ymd("2020-02-01")
election_day <- ymd('2020-11-03')
# all_polls <- read.csv("data/all_polls.csv", stringsAsFactors = FALSE, header = TRUE)
url<- 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQ56fySJKLL18Lipu1_i3ID9JE06voJEz2EXm6JW4Vh11zmndyTwejMavuNntzIWLY0RyhA1UsVEen0/pub?gid=0&single=true&output=csv'
all_polls <- read_csv(url)
# remove any polls if biden or trump blank
all_polls <- all_polls %>% filter(!is.na(biden),!is.na(trump))#, include == "TRUE")
# save output for later
write_csv(all_polls %>% filter(state=='--') %>% clean_names %>%
mutate_at(c('biden','trump','other','undecided','biden_margin'),
function(x){x/100}) %>%
mutate(end_date = mdy(end_date),
biden_two_party = biden / (biden + trump),
trump_two_party = trump / (biden + trump)) %>%
filter(end_date >= ymd("2020-03-01")),
'output/site_data/national_polls.csv')
write_csv(all_polls %>% filter(state!='--') %>% clean_names %>%
mutate_at(c('biden','trump','other','undecided','biden_margin'),
function(x){x/100}) %>%
mutate(end_date = mdy(end_date),
biden_two_party = biden / (biden + trump),
trump_two_party = trump / (biden + trump)) %>%
filter(end_date >= ymd("2020-03-01")),
'output/site_data/state_polls.csv')
# remove polls from before today
all_polls <- all_polls %>%
mutate(entry.date = as_date(entry.date.time..et., format = '%m/%d/%Y %H:%M',tz='US/Eastern')) %>%
filter(entry.date <= (ymd(RUN_DATE)))
# take out state-level online polls
#all_polls <- all_polls %>% filter(state == '--' | grepl('live',tolower(mode)))
# limit the number of observations
all_polls$number.of.observations = ifelse(all_polls$number.of.observations > 3000, 3000, all_polls$number.of.observations)
# select relevant columns google sheet, make mutations
all_polls <- all_polls %>%
dplyr::select(state, pollster, number.of.observations, population, mode,
start.date, end.date, entry.date,
biden, trump, undecided, other) %>%
#filter(mdy(end.date) <= RUN_DATE) %>%
mutate(start.date = as.character(mdy(start.date)),
end.date = as.character(mdy(end.date))) %>%
mutate(population = case_when(population == 'lv' ~ 'Likely Voters',
population == 'rv' ~ 'Registered Voters',
population == 'a' ~ 'Adults'))
# basic mutations
df <- all_polls %>%
tbl_df %>%
rename(n = number.of.observations) %>%
mutate(begin = ymd(start.date),
end = ymd(end.date),
t = end - (1 + as.numeric(end-begin)) %/% 2) %>%
filter(t >= start_date & !is.na(t)
& (population == "Likely Voters" |
population == "Registered Voters" |
population == "Adults") # get rid of disaggregated polls
& n > 1)
# pollster mutations
df <- df %>%
mutate(pollster = str_extract(pollster, pattern = "[A-z0-9 ]+") %>% sub("\\s+$", "", .),
pollster = replace(pollster, pollster == "Fox News", "FOX"), # Fixing inconsistencies in pollster names
pollster = replace(pollster, pollster == "WashPost", "Washington Post"),
pollster = replace(pollster, pollster == "ABC News", "ABC"),
pollster = replace(pollster, pollster == "DHM Research", "DHM"),
pollster = replace(pollster, pollster == "Public Opinion Strategies", "POS"),
undecided = ifelse(is.na(undecided), 0, undecided),
other = ifelse(is.na(other), 0, other))# +
# mode mutations
table(df$mode)
df <- df %>%
mutate(mode = case_when(mode == 'Internet' | mode == 'Online' ~ 'Online poll',
grepl("live phone",tolower(mode)) ~ 'Live phone component',
TRUE ~ 'Other'))
table(df$mode)
table(df$population)
# vote shares etc
df <- df %>%
mutate(two_party_sum = biden + trump,
polltype = population,#as.integer(as.character(recode(population,
# "Likely Voters" = "0",
# "Registered Voters" = "1",
# "Adults" = "2"))),
n_respondents = round(n),
# biden
n_biden = round(n * biden/100),
pct_biden = biden/two_party_sum,
n_trump = round(n * trump/100),
pct_trump = trump/two_party_sum)
## --- numerical indices
state_abb_list <- read.csv("data/potus_results_76_16.csv") %>%
pull(state) %>% unique()
df <- df %>%
mutate(poll_day = t - min(t) + 1,
# Factors are alphabetically sorted: 1 = --, 2 = AL, 3 = AK, 4 = AZ...
index_s = as.numeric(factor(as.character(state),
levels = c('--',state_abb_list))),
index_s = ifelse(index_s == 1, 52, index_s - 1),
index_t = 1 + as.numeric(t) - min(as.numeric(t)),
index_p = as.numeric(as.factor(as.character(pollster))),
index_m = as.numeric(as.factor(as.character(mode))),
index_pop = as.numeric(as.factor(as.character(polltype)))) %>%
# selections
arrange(state, t, polltype, two_party_sum) %>%
distinct(state, t, pollster, .keep_all = TRUE) %>%
select(
# poll information
state, t, begin, end, pollster, polltype, method = mode, n_respondents,
# vote shares
pct_biden, n_biden,
pct_trump, n_trump,
poll_day, index_s, index_p, index_m, index_pop, index_t)
# useful vectors
all_polled_states <- df$state %>% unique %>% sort
# day indices
first_day <- min(df$begin)
ndays <- max(df$t) - min(df$t)
all_t <- min(df$t) + days(0:(ndays))
all_t_until_election <- min(all_t) + days(0:(election_day - min(all_t)))
# pollster indices
all_pollsters <- levels(as.factor(as.character(df$pollster)))
# graph of polls relative to Clinton 2016
delta_df <- df %>%
select(state,end,pct_biden,method,n_respondents) %>%
left_join(politicaldata::pres_results %>%
filter(year==2016) %>%
mutate(pct_clinton = dem/(dem+rep)) %>%
select(state,pct_clinton) %>%
bind_rows(tibble(state='--',pct_clinton = 0.511))) %>%
filter(end <= Sys.Date()) %>%
arrange(end) %>%
mutate(level = ifelse(state=='--','National','State'),
delta = (pct_biden - pct_clinton)*2*100)
delta_ts <- lapply(unique(delta_df$level),
function(i){
lapply(unique(delta_df$method),
function(j){
tmp <- delta_df %>% filter(level == i, method == j)
lapply(seq.Date(ymd("2020-02-01"),Sys.Date(),'day'),
function(k){
tmp$day_from_k <- as.numeric(abs(difftime(k,tmp$end,units = 'days'))) + 1
tibble(end = k,
level = i,
method = j,
average_delta = weighted.mean(tmp[tmp$end >= (k-60) & tmp$end <= k,]$delta,
tmp[tmp$end >= (k-60) & tmp$end <= k,]$n_respondents *
exp(-0.04 * tmp[tmp$end >= (k-60) & tmp$end <= k,]$day_from_k)
))
}) %>% bind_rows
})%>% bind_rows
}) %>% bind_rows %>%
group_by(level,method) %>%
mutate(average_delta = imputeTS::na_kalman(average_delta))
delta_ts_all <- lapply(unique(delta_df$level),
function(i){
tmp <- delta_df %>% filter(level == i)
lapply(seq.Date(ymd("2020-02-01"),Sys.Date(),'day'),
function(k){
tmp$day_from_k <- as.numeric(abs(difftime(k,tmp$end,units = 'days'))) + 1
tibble(end = k,
level = i,
method = 'All polls',
average_delta = weighted.mean(tmp[tmp$end >= (k-60) & tmp$end <= k,]$delta,
tmp[tmp$end >= (k-60) & tmp$end <= k,]$n_respondents *
exp(-0.04 * tmp[tmp$end >= (k-60) & tmp$end <= k,]$day_from_k)
))
}) %>% bind_rows
}) %>% bind_rows %>%
group_by(level,method) %>%
mutate(average_delta = imputeTS::na_kalman(average_delta))
gg <- ggplot(delta_df,aes(x=end,group=method,col=method)) +
geom_hline(yintercept = 0,col='gray30') +
# polls and sooth
geom_point(aes(y=delta),alpha=0.2) +
#geom_smooth(aes(y=delta),span=0.8,show.legend = F,se=F) +
# average lines
#geom_line(data=delta_ts,aes(y=average_delta)) +
#geom_line(data=delta_ts_all,inherit.aes = F,aes(x=end,y=average_delta),col='black',linetype=2,show.legend=F) +
# average smooths
geom_smooth(data=delta_ts,aes(y=average_delta),span=0.1,se=F,size=0.5) +
geom_smooth(data=delta_ts_all,inherit.aes = F,aes(x=end,y=average_delta),col='black',linetype=2,span=0.1,show.legend=F,se=F,size=0.7) +
labs(x='Date',y='',subtitle='Biden margin in polls minus Clinton margin in 2016, by geography and poll mode') +
# the rest
theme_minimal() +
theme(legend.position = 'top',panel.grid.minor = element_blank()) +
scale_y_continuous(breaks=seq(-100,100,2),limits=c(-2,14)) +
scale_x_date(date_breaks = 'month',date_labels = '%b',
limits = c(ymd('2020-03-01'),Sys.Date())) +
scale_color_brewer(palette='Set1',name='Poll mode') +
facet_wrap(~level)
print(gg)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment