Skip to content

Instantly share code, notes, and snippets.

@elliottmorris
Last active March 21, 2020 00:26
Show Gist options
  • Save elliottmorris/0cd4c95cb2424ffbe8c0a5ecb02d8cf6 to your computer and use it in GitHub Desktop.
Save elliottmorris/0cd4c95cb2424ffbe8c0a5ecb02d8cf6 to your computer and use it in GitHub Desktop.
historical polling averages + 2020
library(tidyverse)
library(politicaldata)
library(lubridate)
library(zoo)
library(gghighlight)
today_2020_time_difference <- as.numeric(difftime(ymd('2020-11-03'),Sys.Date(),units = 'days'))
# wrangle -----------------------------------------------------------------
# historical
history <- politicaldata::us_pres_polls_history %>%
select(year,date,electionday,daysuntil,
dem=democrat,rep=republican,
dem_actual = d.actual.share,
rep_actual = r.actual.share) %>%
mutate(date = as.character(date),
electionday = mdy(electionday))
# 2020 (from 538)
polls <- read_csv('https://projects.fivethirtyeight.com/polls-page/president_polls.csv')
polls <- polls %>% filter(cycle=='2020',is.na(state)) %>%
mutate(daysuntil = as.numeric(difftime(mdy(election_date),mdy(end_date),units = 'days')),
end_date = as.character(mdy(end_date)),
election_date = mdy(election_date)) %>%
select(year=cycle,date=end_date,pollster,electionday=election_date,daysuntil,name=answer,pct)
polls <- na.omit(polls)
polls <- polls %>%
filter(name %in% c('Biden','Trump')) %>%
group_by(pollster,date,name) %>%
filter(row_number() == 1) %>%
spread(name,pct) %>%
ungroup()
# bind
polls <- history %>%
bind_rows(polls %>%
select(year,date,electionday,daysuntil,dem=Biden,rep=Trump))%>%
mutate(date=ymd(date))
# get average for each year -----------------------------------------------
# join on data set of dates
poll_avgs <- lapply(unique(polls$year),
function(x){
tmp <- polls %>% filter(year==x)
tibble(date = seq(min(tmp$date),max(tmp$date),by='day'),
year = unique(tmp$year),
electionday = unique(tmp$electionday),
dem_actual = unique(tmp$dem_actual),
rep_actual = unique(tmp$rep_actual))
}) %>%
do.call('rbind',.) %>%
mutate(daysuntil = as.numeric(difftime(ymd(electionday),ymd(date),units = 'days')))
poll_avgs <- poll_avgs %>%
left_join(polls %>% select(date,dem,rep))
# group polls by day, add index for weighting purposes
poll_avgs <- poll_avgs %>%
group_by(date,year,electionday,dem_actual,rep_actual,daysuntil) %>%
summarise(dem_margin_actual = unique(dem_actual - rep_actual),
dem_margin_poll = mean(dem-rep),
weight = length(unique(na.omit(dem-rep)))) %>%
ungroup() %>%
arrange(date)
# group up
poll_avgs <- poll_avgs %>%
group_by(year) %>%
mutate(dem_margin_average =
rollsum(dem_margin_poll * weight,14,na.pad=T,align = 'right',na.rm=T) /
rollsum(weight,14,na.pad=T,align = 'right',na.rm=T),
dem_margin_average = ifelse(is.nan(dem_margin_average),NA,dem_margin_average)) %>%
mutate(dem_margin_average = na.approx(dem_margin_average,maxgap = 1000,na.rm=FALSE))
# graph -------------------------------------------------------------------
# chart
poll_avgs %>%
ggplot(.,aes(x=daysuntil,
y=dem_margin_average,
group=as.factor(year))) +
geom_hline(yintercept = 0) +
#geom_point(shape=1) +
geom_line(aes(y=dem_margin_average,
col=as.factor(year))) +
#geom_smooth(aes(y=dem_margin_average,col=as.factor(year)),
# method='gam',formula=y~s(x,k=20),
# se=F,size=0.5) +
scale_x_reverse(breaks=rev(seq(0,360,30)),
labels=function(x){x/30}) +
scale_y_continuous(breaks=seq(-100,100,10)) +
gghighlight(year > 2004) +
scale_color_brewer(name='Election year',palette='Dark2') +
labs(x='Months until election day',
y='',
subtitle='Average Democratic vote margin in pre-election polls, percentage points',
caption='Source: Wlezien/Jennings 2019; G. Elliott Morris') +
theme_minimal() +
theme(legend.position = 'top',legend.justification = 'left',
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust=0)) +
coord_cartesian(xlim=c(0,300))
# incumbents --------------------------------------------------------------
# graph
poll_avgs <- poll_avgs %>%
mutate(incumbent_margin_average = ifelse(year %in% c(2016,2012,2000,1996,1980),
dem_margin_average,
-1*dem_margin_average),
incumbent_margin_actual = ifelse(year %in% c(2016,2012,2000,1996,1980),
dem_margin_actual,
-1*dem_margin_actual))
ggplot(poll_avgs,aes(x=daysuntil,
y=incumbent_margin_average,
group=as.factor(year))) +
geom_hline(yintercept = 0) +
#geom_point(shape=1) +
geom_line(aes(y=incumbent_margin_average,
col=as.factor(year))) +
#geom_smooth(aes(y=dem_margin_average,col=as.factor(year)),
# method='gam',formula=y~s(x,k=20),
# se=F,size=0.5) +
scale_x_reverse(breaks=rev(seq(0,360,30)),
labels=function(x){x/30}) +
scale_y_continuous(breaks=seq(-100,100,10)) +
gghighlight(year > 2004) +
scale_color_brewer(name='Election year',palette='Dark2') +
labs(x='Months until election day',
y='',
subtitle='Average incumbent vote margin in pre-election polls, percentage points',
caption='Source: Wlezien/Jennings 2019; G. Elliott Morris') +
theme_minimal() +
theme(legend.position = 'top',legend.justification = 'left',
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust=0)) +
coord_cartesian(xlim=c(0,300))
# error -------------------------------------------------------------------
# get daily error
daily_mae <- poll_avgs %>%
filter(daysuntil <= 300) %>%
group_by(daysuntil) %>%
summarise(mae = mean(abs(dem_margin_actual - dem_margin_average), na.rm=T))
# plot
ggplot(daily_mae, aes(x=daysuntil,y=mae)) +
geom_line() +
scale_x_reverse(limits=rev(c(0,300)),
breaks=rev(seq(0,360,30)),
labels=function(x){x/30}) +
scale_y_continuous(limits=c(0,max(daily_mae$mae))) +
labs(x='Months until election day',
y='',
subtitle='Average error between polling averages and actual Democratic vote margins, 1980-2016',
caption='Source: Wlezien/Jennings 2019; G. Elliott Morris') +
theme_minimal() +
theme(legend.position = 'top',legend.justification = 'left',
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust=0)) +
geom_vline(xintercept = today_2020_time_difference,linetype=2)
# glms winning ------------------------------------------------------------
# democratic
dem_models <- poll_avgs %>%
filter(daysuntil <= 300) %>%
mutate(dem_win_wh = ifelse(year %in% c(1992,1996,2008,2012),1,0)) %>%
group_by(daysuntil) %>%
do(mod = glm(dem_win_wh ~ dem_margin_average,
family = binomial(link='logit'),
data=.))
predict(dem_models[dem_models$daysuntil == today_2020_time_difference,]$mod[[1]],
tibble(dem_margin_average = last(poll_avgs[poll_avgs$year == 2020,]$dem_margin_average)),
type = 'response')
# incumbent
inc_models <- poll_avgs %>%
filter(daysuntil <= 300) %>%
mutate(incumbent_win_wh = ifelse(year %in% c(1984,1988,1996,2004,2012),1,0)) %>%
group_by(daysuntil) %>%
do(mod = glm((incumbent_win_wh > 0) ~ incumbent_margin_average,
family = binomial(link='logit'),
data=.))
predict(inc_models[inc_models$daysuntil == today_2020_time_difference,]$mod[[1]],
tibble(incumbent_margin_average = last(poll_avgs[poll_avgs$year == 2020,]$dem_margin_average)*-1),
type = 'response')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment