elliottmorris/buttigeg_rise.R

## buttigeg_rise.R
# This file runs an analysis of 2020 Democratic primary polls to
# assess relationships between candidate's polling averages using
# variance-covariance matrices.
#
# Read more about it here:
# https://thecrosstab.substack.com/p/buttigiegs-rise-has-cost-biden-and

rm(list=ls())
library(tidyverse)
library(lubridate)
library(zoo)
options(scipen = 1000)


# read in data from 538
polls <- read_csv('https://projects.fivethirtyeight.com/polls-page/president_primary_polls.csv')

# filter to 2020 polls, select correct columns, and correct candidates, in certain geographies
candidates <- c("Biden", "Warren", "Sanders", "Harris", "Buttigieg")

polls <- polls %>%
  filter(cycle==2020,
         state %in% c("Iowa","New Hampshire",NA),
         answer %in% candidates,
         is.na(notes)) %>%
  select(date=end_date,
         geo = state,
         pollster,
         sponsors,
         sample_size,
         population,
         tracking,
         candidate=answer,
         pct) %>%
  mutate(geo = ifelse(is.na(geo),'National',geo),
         date=mdy(date))

# all polls over the last 4 months
polls <- polls %>%
  filter(date >= (ymd(Sys.Date())-(30*5))) %>%
  arrange(date)

# filter out all but most recent tracking polls each week
polls <- polls %>%
  group_by(week = floor(difftime(Sys.Date(),date,units = 'weeks')),
           geo,
           pollster,sponsors,
           candidate) %>%
  mutate(poll_that_week = row_number()) %>%
  filter(!tracking | poll_that_week == 1) %>%
  ungroup()

# cap sample size and generate weight
polls <- polls %>%
  mutate(sample_size = ifelse(sample_size>5000,5000,sample_size),
         wt = sqrt(sample_size/mean(sample_size)))

# select only relevant columns
polls <- polls %>%
  select(date, geo, pollster, candidate, sample_size, wt, pct)

# function to compute rolling weighted average
my_wtd_avg <- function(z){
  print(z)

  avg <- rollapplyr(data = z,
                    width = 60,
                    by.column = F,
                    fill = NA,
                    FUN = function(z){

                      pcts <- as.numeric(z[,'pct'])
                      wts <- as.numeric(z[,'wt'])

                      # exponential decay on weights
                      wts = wts * exp(-0.04 * rev(index(wts)))

                      # return weighted average
                      return(weighted.mean(pcts,wts,na.rm=T))

                    })

  return(z %>% as_tibble() %>% cbind(avg))
  #return(wtdmeans)
}

# get all the data into the right format
avgs <- tibble(date = seq.Date((ymd(Sys.Date())-30*5),
                               ymd(Sys.Date()),by = 'days')) %>% # get 30 extra rows of NAs for the rollapplyr function, which for some reason doesn't know how to handle na.rm = T
  left_join(polls %>%
              select(date, geo, candidate)) %>%
  expand(date,geo,candidate) %>%
  na.omit() %>%
  left_join(polls) %>%
  group_by(candidate, geo)

# actually run the averages!
avgs <- avgs %>%
  group_map(~ my_wtd_avg(.), keep = T) %>%
  do.call('bind_rows',.)

# plot
ggplot(avgs %>% na.omit(),
       aes(x=date,y=pct/100,col=candidate,weight=wt)) +
  geom_point(aes(size=sample_size),alpha=0.2) +
  geom_line(aes(y=avg/100)) +
  scale_size(name = 'n',range = c(0.5,3)) +
  facet_wrap(~geo) +
  theme_minimal() +
  scale_color_brewer("Candidate",palette="Dark2") +
  theme(legend.position = 'top',legend.justification = 'left') +
  labs(title='2020 Democratic primary polling averages',
       subtitle = 'Weighted by recency and sample size',
       y='Percent of Democratic voters',
       x="Date")

# get data for covariance matrix
avgs.cov <- avgs %>%
  filter(!is.na(avg)) %>%
  select(date, geo, pollster, candidate, avg) %>%
  group_by(date, geo, pollster, candidate) %>%
  mutate(col = row_number()) %>%
  spread(candidate, avg)

# function to get covariance
get_cov <- function(x){
  y <- x[,5:ncol(x)]

  z <- cov(y,use='complete.obs')

  print(y)

  dat <- z %>%
    as_tibble() %>%
    mutate(geo = unique(x$geo),
           candidate = rownames(z))
  return(dat)
}


# get covariances
covs <- avgs.cov %>%
  filter(date >= (ymd(Sys.Date())-30)) %>%
  mutate("Other_NA" = 100 - (Biden + Buttigieg + Harris + Sanders + Warren)) %>%
  group_by(geo) %>%
  # standardize avg
  #mutate_at(c("Biden","Buttigieg","Harris","Sanders","Warren","Other_NA"),function(x){x/first(x)}) %>%
  group_map(~ get_cov(.), keep = T) %>%
  do.call('bind_rows',.)

covs %>%
  select(geo,candidate,cov_with_buttigieg = Buttigieg) %>%
  filter(candidate != "Buttigieg") %>% View()


# melt dataframe for plotting
avgs_for_cov_plots <- avgs.cov %>%
  filter(date >= (ymd(Sys.Date())-30)) %>%
  mutate("Other_NA" = 100 - (Biden + Buttigieg + Harris + Sanders + Warren)) %>%
  group_by(geo) %>%
  # standardize avg
  #mutate_at(c("Biden","Buttigieg","Harris","Sanders","Warren","Other_NA"),function(x){x/first(x)}) %>%
  gather(candidate,pct,c("Biden","Harris","Sanders","Warren","Other_NA"))


# plot buttigieg v other candidates over time
ggplot(avgs_for_cov_plots, aes(x=Buttigieg,y=pct,col=geo)) +
  geom_point() +
  stat_smooth(geom='line',method='lm') +
  facet_wrap(~candidate) +
  theme_minimal() +
  scale_color_brewer("Geography",palette="Dark2") +
  theme(legend.position = 'top',legend.justification = 'left') +
  labs(title='Buttigieg has risen while Biden, Harris have fallen',
       subtitle = 'Each point is a candidate\'s average on a different day',
       y='Other candidate\'s polling average',
       x="Buttigieg\'s polling average")
	# This file runs an analysis of 2020 Democratic primary polls to
	# assess relationships between candidate's polling averages using
	# variance-covariance matrices.
	#
	# Read more about it here:
	# https://thecrosstab.substack.com/p/buttigiegs-rise-has-cost-biden-and

	rm(list=ls())
	library(tidyverse)
	library(lubridate)
	library(zoo)
	options(scipen = 1000)


	# read in data from 538
	polls <- read_csv('https://projects.fivethirtyeight.com/polls-page/president_primary_polls.csv')

	# filter to 2020 polls, select correct columns, and correct candidates, in certain geographies
	candidates <- c("Biden", "Warren", "Sanders", "Harris", "Buttigieg")

	polls <- polls %>%
	filter(cycle==2020,
	state %in% c("Iowa","New Hampshire",NA),
	answer %in% candidates,
	is.na(notes)) %>%
	select(date=end_date,
	geo = state,
	pollster,
	sponsors,
	sample_size,
	population,
	tracking,
	candidate=answer,
	pct) %>%
	mutate(geo = ifelse(is.na(geo),'National',geo),
	date=mdy(date))

	# all polls over the last 4 months
	polls <- polls %>%
	filter(date >= (ymd(Sys.Date())-(30*5))) %>%
	arrange(date)

	# filter out all but most recent tracking polls each week
	polls <- polls %>%
	group_by(week = floor(difftime(Sys.Date(),date,units = 'weeks')),
	geo,
	pollster,sponsors,
	candidate) %>%
	mutate(poll_that_week = row_number()) %>%
	filter(!tracking \| poll_that_week == 1) %>%
	ungroup()

	# cap sample size and generate weight
	polls <- polls %>%
	mutate(sample_size = ifelse(sample_size>5000,5000,sample_size),
	wt = sqrt(sample_size/mean(sample_size)))

	# select only relevant columns
	polls <- polls %>%
	select(date, geo, pollster, candidate, sample_size, wt, pct)

	# function to compute rolling weighted average
	my_wtd_avg <- function(z){
	print(z)

	avg <- rollapplyr(data = z,
	width = 60,
	by.column = F,
	fill = NA,
	FUN = function(z){

	pcts <- as.numeric(z[,'pct'])
	wts <- as.numeric(z[,'wt'])

	# exponential decay on weights
	wts = wts * exp(-0.04 * rev(index(wts)))

	# return weighted average
	return(weighted.mean(pcts,wts,na.rm=T))

	})

	return(z %>% as_tibble() %>% cbind(avg))
	#return(wtdmeans)
	}

	# get all the data into the right format
	avgs <- tibble(date = seq.Date((ymd(Sys.Date())-30*5),
	ymd(Sys.Date()),by = 'days')) %>% # get 30 extra rows of NAs for the rollapplyr function, which for some reason doesn't know how to handle na.rm = T
	left_join(polls %>%
	select(date, geo, candidate)) %>%
	expand(date,geo,candidate) %>%
	na.omit() %>%
	left_join(polls) %>%
	group_by(candidate, geo)

	# actually run the averages!
	avgs <- avgs %>%
	group_map(~ my_wtd_avg(.), keep = T) %>%
	do.call('bind_rows',.)

	# plot
	ggplot(avgs %>% na.omit(),
	aes(x=date,y=pct/100,col=candidate,weight=wt)) +
	geom_point(aes(size=sample_size),alpha=0.2) +
	geom_line(aes(y=avg/100)) +
	scale_size(name = 'n',range = c(0.5,3)) +
	facet_wrap(~geo) +
	theme_minimal() +
	scale_color_brewer("Candidate",palette="Dark2") +
	theme(legend.position = 'top',legend.justification = 'left') +
	labs(title='2020 Democratic primary polling averages',
	subtitle = 'Weighted by recency and sample size',
	y='Percent of Democratic voters',
	x="Date")

	# get data for covariance matrix
	avgs.cov <- avgs %>%
	filter(!is.na(avg)) %>%
	select(date, geo, pollster, candidate, avg) %>%
	group_by(date, geo, pollster, candidate) %>%
	mutate(col = row_number()) %>%
	spread(candidate, avg)

	# function to get covariance
	get_cov <- function(x){
	y <- x[,5:ncol(x)]

	z <- cov(y,use='complete.obs')

	print(y)

	dat <- z %>%
	as_tibble() %>%
	mutate(geo = unique(x$geo),
	candidate = rownames(z))
	return(dat)
	}


	# get covariances
	covs <- avgs.cov %>%
	filter(date >= (ymd(Sys.Date())-30)) %>%
	mutate("Other_NA" = 100 - (Biden + Buttigieg + Harris + Sanders + Warren)) %>%
	group_by(geo) %>%
	# standardize avg
	#mutate_at(c("Biden","Buttigieg","Harris","Sanders","Warren","Other_NA"),function(x){x/first(x)}) %>%
	group_map(~ get_cov(.), keep = T) %>%
	do.call('bind_rows',.)

	covs %>%
	select(geo,candidate,cov_with_buttigieg = Buttigieg) %>%
	filter(candidate != "Buttigieg") %>% View()


	# melt dataframe for plotting
	avgs_for_cov_plots <- avgs.cov %>%
	filter(date >= (ymd(Sys.Date())-30)) %>%
	mutate("Other_NA" = 100 - (Biden + Buttigieg + Harris + Sanders + Warren)) %>%
	group_by(geo) %>%
	# standardize avg
	#mutate_at(c("Biden","Buttigieg","Harris","Sanders","Warren","Other_NA"),function(x){x/first(x)}) %>%
	gather(candidate,pct,c("Biden","Harris","Sanders","Warren","Other_NA"))


	# plot buttigieg v other candidates over time
	ggplot(avgs_for_cov_plots, aes(x=Buttigieg,y=pct,col=geo)) +
	geom_point() +
	stat_smooth(geom='line',method='lm') +
	facet_wrap(~candidate) +
	theme_minimal() +
	scale_color_brewer("Geography",palette="Dark2") +
	theme(legend.position = 'top',legend.justification = 'left') +
	labs(title='Buttigieg has risen while Biden, Harris have fallen',
	subtitle = 'Each point is a candidate\'s average on a different day',
	y='Other candidate\'s polling average',
	x="Buttigieg\'s polling average")