Created
November 5, 2020 14:59
-
-
Save reuning/9ad061d9b07480a9b4b95a7610c219b4 to your computer and use it in GitHub Desktop.
Compare polling and election margins in R. Data scraped from NYTimes results.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest) | |
library(data.table) | |
library(ggplot2) | |
library(ggrepel) | |
page <- read_html("https://www.nytimes.com/interactive/2020/11/03/us/elections/results-president.html") | |
links <- page %>% html_node(".e-state-list") %>% html_nodes("a") %>% html_attr("href") | |
get_data <- function(link){ | |
page <- read_html(link) | |
trump <- page %>% html_node("tr.e-donald-j-trump > td.e-percent" ) %>% html_text(trim=T) | |
biden <- page %>% html_node("tr.e-joseph-r-biden > td.e-percent" ) %>% html_text(trim=T) | |
pct_all <- page %>% html_node("span.e-est-pct") %>% html_text(trim=T) | |
state <- page %>% html_node("h1[itemprop='headline']") %>% html_text(trim=T) | |
m <- regexpr("\\d+", pct_all) | |
pct <- as.numeric(regmatches(pct_all, m)) | |
trump <- as.numeric(gsub("%", "", trump)) | |
biden <- as.numeric(gsub("%", "", biden)) | |
state <- gsub(" Election Results", "" ,state ) | |
return(data.frame("state"=state, "trump"=trump, | |
"biden"=biden, "pct_all"=pct_all, | |
"pct"=pct)) | |
} | |
tables <- lapply(links, get_data) | |
table <- do.call(rbind, tables) | |
poll_data <- fread("https://projects.fivethirtyeight.com/polls-page/president_polls.csv") | |
poll_data <- poll_data[state!=""] | |
poll_data[, end_data:=as.Date(end_date, format = "%m/%d/%y")] | |
poll_data[, start_date:=as.Date(start_date, format = "%m/%d/%y")] | |
# poll_data <- poll_data[start_date >= as.Date("2020-10-20")] | |
poll_data$id <- paste(poll_data$question_id, poll_data$poll_id, sep="-") | |
poll_data <- dcast(poll_data, start_date+ end_date + state+id + pollster + fte_grade + | |
sample_size + population +methodology~candidate_name, value.var="pct") | |
poll_data <- merge(poll_data, table, by="state", all.x=T) | |
ggplot(poll_data[pct>97 & start_date >= as.Date("2020-10-20")], | |
aes(y=biden-trump, | |
x=`Joseph R. Biden Jr.` - `Donald Trump`, | |
color=state)) + | |
# geom_boxplot(width=4,outlier.size = .5, | |
# varwidth = F, orientation = "y", position = position_identity()) + | |
geom_point(size=1.5, alpha=.7) + | |
geom_abline(slope=1, intercept=0, color='gray') + | |
theme_minimal() + labs(y="Vote Margin\n(Biden-Trump)", | |
x="Polling Margin\n(Biden-Trump)", | |
caption="Only including polls since Oct 20th and states with >97% reporting") + | |
geom_text(aes(label=state), x=40) + | |
scale_x_continuous(limits = c(-40,50)) + | |
scale_y_continuous(limits = c(-40,50)) + | |
guides(color=F) + geom_smooth(method='lm', | |
aes(y=biden-trump, | |
x=`Joseph R. Biden Jr.` - `Donald Trump`), | |
inherit.aes = F, | |
se=F,color='black') | |
setwd("~/Desktop/") | |
ggsave("polling_results.png", height=5, width=7) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment