elliottmorris/polling_averages.R

## polling_averages.R
library(tidyverse)
library(politicaldata)
library(lubridate)
library(zoo)
library(gghighlight)

today_2020_time_difference <- as.numeric(difftime(ymd('2020-11-03'),Sys.Date(),units = 'days'))

# wrangle -----------------------------------------------------------------
# historical
history <- politicaldata::us_pres_polls_history %>%
  select(year,date,electionday,daysuntil,
         dem=democrat,rep=republican,
         dem_actual = d.actual.share,
         rep_actual = r.actual.share) %>%
  mutate(date = as.character(date),
         electionday = mdy(electionday))

# 2020 (from 538)
polls <- read_csv('https://projects.fivethirtyeight.com/polls-page/president_polls.csv')

polls <- polls %>% filter(cycle=='2020',is.na(state)) %>%
  mutate(daysuntil = as.numeric(difftime(mdy(election_date),mdy(end_date),units = 'days')),
         end_date = as.character(mdy(end_date)),
         election_date = mdy(election_date)) %>%
  select(year=cycle,date=end_date,pollster,electionday=election_date,daysuntil,name=answer,pct)

polls <- na.omit(polls)

polls <- polls %>%
  filter(name %in% c('Biden','Trump')) %>%
  group_by(pollster,date,name) %>%
  filter(row_number() == 1) %>%
  spread(name,pct) %>%
  ungroup()

# bind
polls <- history %>%
  bind_rows(polls %>%
              select(year,date,electionday,daysuntil,dem=Biden,rep=Trump))%>%
  mutate(date=ymd(date))


# get average for each year -----------------------------------------------
# join on data set of dates
poll_avgs <- lapply(unique(polls$year),
                    function(x){
                      tmp <- polls %>% filter(year==x)

                      tibble(date = seq(min(tmp$date),max(tmp$date),by='day'),
                             year = unique(tmp$year),
                             electionday = unique(tmp$electionday),
                             dem_actual = unique(tmp$dem_actual),
                             rep_actual = unique(tmp$rep_actual))

                    }) %>%
  do.call('rbind',.) %>%
  mutate(daysuntil = as.numeric(difftime(ymd(electionday),ymd(date),units = 'days')))

poll_avgs <- poll_avgs %>%
  left_join(polls %>% select(date,dem,rep))

# group polls by day, add index for weighting purposes
poll_avgs <- poll_avgs %>%
  group_by(date,year,electionday,dem_actual,rep_actual,daysuntil) %>%
  summarise(dem_margin_actual = unique(dem_actual - rep_actual),
            dem_margin_poll = mean(dem-rep),
            weight = length(unique(na.omit(dem-rep)))) %>%
  ungroup() %>%
  arrange(date)

# group up
poll_avgs <- poll_avgs %>%
  group_by(year) %>%
  mutate(dem_margin_average =
           rollsum(dem_margin_poll * weight,14,na.pad=T,align = 'right',na.rm=T)  /
           rollsum(weight,14,na.pad=T,align = 'right',na.rm=T),
         dem_margin_average = ifelse(is.nan(dem_margin_average),NA,dem_margin_average)) %>%
  mutate(dem_margin_average = na.approx(dem_margin_average,maxgap = 1000,na.rm=FALSE))


# graph -------------------------------------------------------------------
# chart
poll_avgs %>%
  ggplot(.,aes(x=daysuntil,
               y=dem_margin_average,
               group=as.factor(year))) +
  geom_hline(yintercept = 0) +
  #geom_point(shape=1) +
  geom_line(aes(y=dem_margin_average,
                col=as.factor(year))) +
  #geom_smooth(aes(y=dem_margin_average,col=as.factor(year)),
  #            method='gam',formula=y~s(x,k=20),
  #            se=F,size=0.5) +
  scale_x_reverse(breaks=rev(seq(0,360,30)),
                  labels=function(x){x/30}) +
  scale_y_continuous(breaks=seq(-100,100,10)) +
  gghighlight(year > 2004) +
  scale_color_brewer(name='Election year',palette='Dark2') +
  labs(x='Months until election day',
       y='',
       subtitle='Average Democratic vote margin in pre-election polls, percentage points',
       caption='Source: Wlezien/Jennings 2019; G. Elliott Morris') +
  theme_minimal() +
  theme(legend.position = 'top',legend.justification = 'left',
        panel.grid.minor = element_blank(),
        plot.caption = element_text(hjust=0)) +
  coord_cartesian(xlim=c(0,300))


# incumbents --------------------------------------------------------------
# graph
poll_avgs <- poll_avgs %>%
  mutate(incumbent_margin_average = ifelse(year %in% c(2016,2012,2000,1996,1980),
                                           dem_margin_average,
                                           -1*dem_margin_average),
         incumbent_margin_actual = ifelse(year %in% c(2016,2012,2000,1996,1980),
                                          dem_margin_actual,
                                          -1*dem_margin_actual))

ggplot(poll_avgs,aes(x=daysuntil,
               y=incumbent_margin_average,
               group=as.factor(year))) +
  geom_hline(yintercept = 0) +
  #geom_point(shape=1) +
  geom_line(aes(y=incumbent_margin_average,
                col=as.factor(year))) +
  #geom_smooth(aes(y=dem_margin_average,col=as.factor(year)),
  #            method='gam',formula=y~s(x,k=20),
  #            se=F,size=0.5) +
  scale_x_reverse(breaks=rev(seq(0,360,30)),
                  labels=function(x){x/30}) +
  scale_y_continuous(breaks=seq(-100,100,10)) +
  gghighlight(year > 2004) +
  scale_color_brewer(name='Election year',palette='Dark2') +
  labs(x='Months until election day',
       y='',
       subtitle='Average incumbent vote margin in pre-election polls, percentage points',
       caption='Source: Wlezien/Jennings 2019; G. Elliott Morris') +
  theme_minimal() +
  theme(legend.position = 'top',legend.justification = 'left',
        panel.grid.minor = element_blank(),
        plot.caption = element_text(hjust=0)) +
  coord_cartesian(xlim=c(0,300))


# error -------------------------------------------------------------------
# get daily error
daily_mae <- poll_avgs %>%
  filter(daysuntil <= 300) %>%
  group_by(daysuntil)  %>%
  summarise(mae = mean(abs(dem_margin_actual - dem_margin_average), na.rm=T))

# plot
ggplot(daily_mae, aes(x=daysuntil,y=mae)) +
  geom_line() +
  scale_x_reverse(limits=rev(c(0,300)),
                  breaks=rev(seq(0,360,30)),
                  labels=function(x){x/30}) +
  scale_y_continuous(limits=c(0,max(daily_mae$mae))) +
  labs(x='Months until election day',
       y='',
       subtitle='Average error between polling averages and actual Democratic vote margins, 1980-2016',
       caption='Source: Wlezien/Jennings 2019; G. Elliott Morris') +
  theme_minimal() +
  theme(legend.position = 'top',legend.justification = 'left',
        panel.grid.minor = element_blank(),
        plot.caption = element_text(hjust=0)) +
  geom_vline(xintercept = today_2020_time_difference,linetype=2)


# glms winning ------------------------------------------------------------
# democratic
dem_models <- poll_avgs %>%
  filter(daysuntil <= 300) %>%
  mutate(dem_win_wh = ifelse(year %in% c(1992,1996,2008,2012),1,0)) %>%
  group_by(daysuntil)  %>%
  do(mod = glm(dem_win_wh ~ dem_margin_average,
               family = binomial(link='logit'),
               data=.))


predict(dem_models[dem_models$daysuntil == today_2020_time_difference,]$mod[[1]],
        tibble(dem_margin_average = last(poll_avgs[poll_avgs$year == 2020,]$dem_margin_average)),
        type = 'response')

# incumbent
inc_models <- poll_avgs %>%
  filter(daysuntil <= 300) %>%
  mutate(incumbent_win_wh = ifelse(year %in% c(1984,1988,1996,2004,2012),1,0)) %>%
  group_by(daysuntil)  %>%
  do(mod = glm((incumbent_win_wh > 0) ~ incumbent_margin_average,
               family = binomial(link='logit'),
               data=.))


predict(inc_models[inc_models$daysuntil == today_2020_time_difference,]$mod[[1]],
        tibble(incumbent_margin_average = last(poll_avgs[poll_avgs$year == 2020,]$dem_margin_average)*-1),
        type = 'response')
	library(tidyverse)
	library(politicaldata)
	library(lubridate)
	library(zoo)
	library(gghighlight)

	today_2020_time_difference <- as.numeric(difftime(ymd('2020-11-03'),Sys.Date(),units = 'days'))

	# wrangle -----------------------------------------------------------------
	# historical
	history <- politicaldata::us_pres_polls_history %>%
	select(year,date,electionday,daysuntil,
	dem=democrat,rep=republican,
	dem_actual = d.actual.share,
	rep_actual = r.actual.share) %>%
	mutate(date = as.character(date),
	electionday = mdy(electionday))

	# 2020 (from 538)
	polls <- read_csv('https://projects.fivethirtyeight.com/polls-page/president_polls.csv')

	polls <- polls %>% filter(cycle=='2020',is.na(state)) %>%
	mutate(daysuntil = as.numeric(difftime(mdy(election_date),mdy(end_date),units = 'days')),
	end_date = as.character(mdy(end_date)),
	election_date = mdy(election_date)) %>%
	select(year=cycle,date=end_date,pollster,electionday=election_date,daysuntil,name=answer,pct)

	polls <- na.omit(polls)

	polls <- polls %>%
	filter(name %in% c('Biden','Trump')) %>%
	group_by(pollster,date,name) %>%
	filter(row_number() == 1) %>%
	spread(name,pct) %>%
	ungroup()

	# bind
	polls <- history %>%
	bind_rows(polls %>%
	select(year,date,electionday,daysuntil,dem=Biden,rep=Trump))%>%
	mutate(date=ymd(date))


	# get average for each year -----------------------------------------------
	# join on data set of dates
	poll_avgs <- lapply(unique(polls$year),
	function(x){
	tmp <- polls %>% filter(year==x)

	tibble(date = seq(min(tmp$date),max(tmp$date),by='day'),
	year = unique(tmp$year),
	electionday = unique(tmp$electionday),
	dem_actual = unique(tmp$dem_actual),
	rep_actual = unique(tmp$rep_actual))

	}) %>%
	do.call('rbind',.) %>%
	mutate(daysuntil = as.numeric(difftime(ymd(electionday),ymd(date),units = 'days')))

	poll_avgs <- poll_avgs %>%
	left_join(polls %>% select(date,dem,rep))

	# group polls by day, add index for weighting purposes
	poll_avgs <- poll_avgs %>%
	group_by(date,year,electionday,dem_actual,rep_actual,daysuntil) %>%
	summarise(dem_margin_actual = unique(dem_actual - rep_actual),
	dem_margin_poll = mean(dem-rep),
	weight = length(unique(na.omit(dem-rep)))) %>%
	ungroup() %>%
	arrange(date)

	# group up
	poll_avgs <- poll_avgs %>%
	group_by(year) %>%
	mutate(dem_margin_average =
	rollsum(dem_margin_poll * weight,14,na.pad=T,align = 'right',na.rm=T) /
	rollsum(weight,14,na.pad=T,align = 'right',na.rm=T),
	dem_margin_average = ifelse(is.nan(dem_margin_average),NA,dem_margin_average)) %>%
	mutate(dem_margin_average = na.approx(dem_margin_average,maxgap = 1000,na.rm=FALSE))


	# graph -------------------------------------------------------------------
	# chart
	poll_avgs %>%
	ggplot(.,aes(x=daysuntil,
	y=dem_margin_average,
	group=as.factor(year))) +
	geom_hline(yintercept = 0) +
	#geom_point(shape=1) +
	geom_line(aes(y=dem_margin_average,
	col=as.factor(year))) +
	#geom_smooth(aes(y=dem_margin_average,col=as.factor(year)),
	# method='gam',formula=y~s(x,k=20),
	# se=F,size=0.5) +
	scale_x_reverse(breaks=rev(seq(0,360,30)),
	labels=function(x){x/30}) +
	scale_y_continuous(breaks=seq(-100,100,10)) +
	gghighlight(year > 2004) +
	scale_color_brewer(name='Election year',palette='Dark2') +
	labs(x='Months until election day',
	y='',
	subtitle='Average Democratic vote margin in pre-election polls, percentage points',
	caption='Source: Wlezien/Jennings 2019; G. Elliott Morris') +
	theme_minimal() +
	theme(legend.position = 'top',legend.justification = 'left',
	panel.grid.minor = element_blank(),
	plot.caption = element_text(hjust=0)) +
	coord_cartesian(xlim=c(0,300))


	# incumbents --------------------------------------------------------------
	# graph
	poll_avgs <- poll_avgs %>%
	mutate(incumbent_margin_average = ifelse(year %in% c(2016,2012,2000,1996,1980),
	dem_margin_average,
	-1*dem_margin_average),
	incumbent_margin_actual = ifelse(year %in% c(2016,2012,2000,1996,1980),
	dem_margin_actual,
	-1*dem_margin_actual))

	ggplot(poll_avgs,aes(x=daysuntil,
	y=incumbent_margin_average,
	group=as.factor(year))) +
	geom_hline(yintercept = 0) +
	#geom_point(shape=1) +
	geom_line(aes(y=incumbent_margin_average,
	col=as.factor(year))) +
	#geom_smooth(aes(y=dem_margin_average,col=as.factor(year)),
	# method='gam',formula=y~s(x,k=20),
	# se=F,size=0.5) +
	scale_x_reverse(breaks=rev(seq(0,360,30)),
	labels=function(x){x/30}) +
	scale_y_continuous(breaks=seq(-100,100,10)) +
	gghighlight(year > 2004) +
	scale_color_brewer(name='Election year',palette='Dark2') +
	labs(x='Months until election day',
	y='',
	subtitle='Average incumbent vote margin in pre-election polls, percentage points',
	caption='Source: Wlezien/Jennings 2019; G. Elliott Morris') +
	theme_minimal() +
	theme(legend.position = 'top',legend.justification = 'left',
	panel.grid.minor = element_blank(),
	plot.caption = element_text(hjust=0)) +
	coord_cartesian(xlim=c(0,300))


	# error -------------------------------------------------------------------
	# get daily error
	daily_mae <- poll_avgs %>%
	filter(daysuntil <= 300) %>%
	group_by(daysuntil) %>%
	summarise(mae = mean(abs(dem_margin_actual - dem_margin_average), na.rm=T))

	# plot
	ggplot(daily_mae, aes(x=daysuntil,y=mae)) +
	geom_line() +
	scale_x_reverse(limits=rev(c(0,300)),
	breaks=rev(seq(0,360,30)),
	labels=function(x){x/30}) +
	scale_y_continuous(limits=c(0,max(daily_mae$mae))) +
	labs(x='Months until election day',
	y='',
	subtitle='Average error between polling averages and actual Democratic vote margins, 1980-2016',
	caption='Source: Wlezien/Jennings 2019; G. Elliott Morris') +
	theme_minimal() +
	theme(legend.position = 'top',legend.justification = 'left',
	panel.grid.minor = element_blank(),
	plot.caption = element_text(hjust=0)) +
	geom_vline(xintercept = today_2020_time_difference,linetype=2)



	# glms winning ------------------------------------------------------------
	# democratic
	dem_models <- poll_avgs %>%
	filter(daysuntil <= 300) %>%
	mutate(dem_win_wh = ifelse(year %in% c(1992,1996,2008,2012),1,0)) %>%
	group_by(daysuntil) %>%
	do(mod = glm(dem_win_wh ~ dem_margin_average,
	family = binomial(link='logit'),
	data=.))


	predict(dem_models[dem_models$daysuntil == today_2020_time_difference,]$mod[[1]],
	tibble(dem_margin_average = last(poll_avgs[poll_avgs$year == 2020,]$dem_margin_average)),
	type = 'response')

	# incumbent
	inc_models <- poll_avgs %>%
	filter(daysuntil <= 300) %>%
	mutate(incumbent_win_wh = ifelse(year %in% c(1984,1988,1996,2004,2012),1,0)) %>%
	group_by(daysuntil) %>%
	do(mod = glm((incumbent_win_wh > 0) ~ incumbent_margin_average,
	family = binomial(link='logit'),
	data=.))


	predict(inc_models[inc_models$daysuntil == today_2020_time_difference,]$mod[[1]],
	tibble(incumbent_margin_average = last(poll_avgs[poll_avgs$year == 2020,]$dem_margin_average)*-1),
	type = 'response')