Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
library(rvest)
library(magrittr)
library(stringr)
library(lubridate)
library(reshape2)
library(ggplot2)
library(dplyr)
library(ggthemes)
library(choroplethr)
library(choroplethrMaps)
library(acs)
library(RColorBrewer)
### Turn one county node of a state results page into a data frame.
###
### county: an html node for a single county (one "article.results-group").
###
### Returns one row per candidate with the columns candidate, votes.pct,
### votes (numeric, thousands separators removed), party (text of the matching
### ".results-meta h5" heading) and county.name (lower-cased title with the
### first "county" removed), or NULL when the node holds no results table.
county_results <- function(county) {
  title <- str_trim(html_text(html_node(county, ".results-header div.title")))
  datasets <- html_nodes(county, ".results-dataset")
  tables <- html_table(html_nodes(datasets, ".results-table"))

  # No table at all: the state hasn't voted yet.
  if (length(tables) == 0) {
    return(NULL)
  }

  # At least one party has results; the i-th heading labels the i-th table.
  parties <- str_trim(html_text(html_nodes(datasets, ".results-meta h5")))

  tidy_table <- function(i) {
    tbl <- tables[[i]]
    colnames(tbl) <- c("candidate", "votes.pct", "votes")
    tbl$candidate <- str_trim(str_replace(tbl$candidate, "Winner", ""))
    tbl$votes <- as.numeric(str_replace_all(tbl$votes, ",", ""))
    tbl$party <- parties[i]
    tbl$county.name <- str_trim(str_replace(tolower(title), "county", ""))
    tbl
  }

  do.call(rbind, lapply(seq_along(tables), tidy_table))
}
### Read the saved html page of one state and collect the results of all of
### its counties by calling county_results() on every county node.
###
### state_filename: name of a file inside the "states/" directory. The part of
###                 the name before the first "." (lower-cased, trimmed) is
###                 used as the state name.
###
### Returns the row-bound county data frames with an added state.name column,
### or NULL when no county produced any result (e.g. the state has not voted).
### A county whose parsing fails is skipped with a warning instead of having
### the error object bound into the result.
state_results <- function(state_filename) {
  html <- read_html(paste0("states/", state_filename))
  county_nodes <- html %>% html_nodes("article.results-group")

  # seq_along() yields an empty sequence for a page without county nodes,
  # whereas 1:length(x) would iterate over 1 and 0.
  per_county <- lapply(seq_along(county_nodes), function(i) {
    tryCatch(
      county_results(county_nodes[[i]]),
      error = function(e) {
        warning(
          "Skipping county node ", i, " of ", state_filename, ": ",
          conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
  })

  # Bind once; NULL entries (no results / failed counties) are dropped by rbind.
  results <- do.call(rbind, per_county)

  # Assigning a column to NULL would silently create a list, so stop here.
  if (is.null(results)) {
    return(NULL)
  }

  results$state.name <- str_trim(tolower(str_split(state_filename, "\\.")[[1]][1]))
  results
}
### Load the county table from the choroplethr package in order to get the
### region variable used for the map.
### A few county names (addressed by row name) are overwritten so that the
### merge with the scraped results works properly.
data(county.regions)
all.regions <- select(county.regions, county.name, state.name, region)
all.regions[c("1528", "2953", "2906", "2844", "2924"), "county.name"] <- c(
  "st. louis city",
  "fairfax city",
  "richmond city",
  "franklin city",
  "roanoke city"
)
### Read demographics data
demographics <- get_county_demographics()
# Optional local cache of the demographics (kept disabled):
#write.csv(demographics, "county_demographics.csv", row.names=FALSE)
#setwd("C:/Users/karim/Documents/R/usa_2016")
#demographics = read.csv("county_demographics.csv")

### Scrape every state file found in "states/" and stack the per-state tables.
### Note: Kansas and Minnesota were removed from the states directory because
### their results are reported by district.
#all_results = read.csv("all_results.csv", stringsAsFactors=FALSE)
all_results <- list.files("states/") %>%
  lapply(state_results) %>%
  do.call(rbind, .)
write.csv(all_results, file = "all_results.csv", row.names = FALSE)
### Rename a few scraped counties. The steps run in order, each one seeing the
### result of the previous one:
###   1. "saint francis" becomes "st. francis"
###   2. "dewitt" in Illinois becomes "de witt"
###   3. the first " parish" is removed
###   4. in Virginia the first " city" is removed, except for names containing
###      charles, james, fairfax, richmond, roanoke or franklin
all_results <- mutate(
  all_results,
  county.name = ifelse(county.name == "saint francis", "st. francis", county.name),
  county.name = ifelse(
    state.name == "illinois" & county.name == "dewitt",
    "de witt",
    county.name
  ),
  county.name = str_replace(county.name, " parish", ""),
  county.name = ifelse(
    state.name == "virginia" &
      !str_detect(county.name, "charles|james|fairfax|richmond|roanoke|franklin"),
    str_replace(county.name, " city", ""),
    county.name
  )
)
### Republicans
# Keep only the rows of Trump and Cruz.
republican <- filter(all_results, candidate %in% c("D. Trump", "T. Cruz"))
# Wide format: one row per state/county, one votes column per candidate.
republican_wide <- reshape2::dcast(
  republican,
  state.name + county.name ~ candidate,
  value.var = "votes"
)
# Columns 3 and 4 are the two candidates; divide each by their combined votes
# so the numbers become shares of the two-candidate total.
republican_wide[, 3:4] <- republican_wide[, 3:4] /
  (republican_wide[, 3] + republican_wide[, 4])
# Long format again: one row per state/county/candidate.
republican_long <- reshape2::melt(
  republican_wide,
  id.vars = c("state.name", "county.name"),
  variable.name = "candidate",
  value.name = "votes.percentage"
)
### Democrats
# Keep only the rows of Clinton and Sanders.
democrat <- filter(all_results, candidate %in% c("H. Clinton", "B. Sanders"))
# Wide format: one row per state/county, one votes column per candidate.
democrat_wide <- reshape2::dcast(
  democrat,
  state.name + county.name ~ candidate,
  value.var = "votes"
)
# Columns 3 and 4 are the two candidates; divide each by their combined votes
# so the numbers become shares of the two-candidate total.
democrat_wide[, 3:4] <- democrat_wide[, 3:4] /
  (democrat_wide[, 3] + democrat_wide[, 4])
# Long format again: one row per state/county/candidate.
democrat_long <- reshape2::melt(
  democrat_wide,
  id.vars = c("state.name", "county.name"),
  variable.name = "candidate",
  value.name = "votes.percentage"
)
# Stack the Republican and Democrat shares into a single long table.
all_results_long <- rbind(republican_long, democrat_long)
# Missing shares are treated as 0.
all_results_long$votes.percentage[is.na(all_results_long$votes.percentage)] <- 0
# Final data: vote shares plus demographics per county. Regions are attached
# first, then the demographics; both joins use the columns the tables share.
all_results_long <- left_join(
  left_join(all_results_long, all.regions),
  demographics
)
#write.csv(all_results_long, "all_results_long.csv", row.names=FALSE)
#candidate.name = "B. Sanders"
### Draw a county map that classifies every county by a candidate's
### popularity and by a demographic variable, each split at its median over
### the candidate's counties ("high" means at or above the median).
###
### candidate.name: value of the candidate column to plot, e.g. "B. Sanders".
### variable:       name of a numeric column of `results`, e.g.
###                 "per_capita_income".
### results:        data frame with the columns candidate, votes.percentage,
###                 region and `variable`. Defaults to the script-level
###                 all_results_long, so existing two-argument calls behave as
###                 before.
###
### Returns the value of the choropleth's render() call.
### Stops with an error when `variable` is not a column of `results` or when
### no county of the candidate can be classified.
plot_map <- function(candidate.name, variable, results = all_results_long) {
  stopifnot(
    is.character(variable),
    length(variable) == 1,
    variable %in% colnames(results)
  )

  is_candidate <- !is.na(results$candidate) & results$candidate == candidate.name
  rows <- results[is_candidate, , drop = FALSE]

  # Counties without a region, a vote share or a demographic value can neither
  # be classified nor drawn. Keeping them would turn the medians below into NA
  # and make every county unclassifiable.
  usable <- !is.na(rows$region) &
    !is.na(rows$votes.percentage) &
    !is.na(rows[[variable]])
  rows <- rows[usable, , drop = FALSE]
  if (nrow(rows) == 0) {
    stop(
      "No mappable counties for candidate '", candidate.name,
      "' and variable '", variable, "'",
      call. = FALSE
    )
  }

  # Compare against the median directly; dividing by the median first breaks
  # down (0 / 0) when the median is 0.
  high_votes <- rows$votes.percentage >= median(rows$votes.percentage)
  high_var <- rows[[variable]] >= median(rows[[variable]])

  # The letter prefixes fix the alphabetical order of the four classes.
  labels <- c(
    paste("a. low popularity - high", variable),
    paste("b. low popularity - low", variable),
    paste("c. high popularity - low", variable),
    paste("d. high popularity - high", variable)
  )
  label_index <- ifelse(
    high_votes,
    ifelse(high_var, 4L, 3L),
    ifelse(high_var, 1L, 2L)
  )

  map_data <- data.frame(
    region = rows$region,
    value = labels[label_index],
    stringsAsFactors = FALSE
  )

  # Name the colours by class so a class keeps its colour even when another
  # class does not occur for this candidate.
  my_palette <- setNames(brewer.pal(4, "PRGn"), labels)

  choro <- CountyChoropleth$new(map_data)
  choro$title <- candidate.name
  choro$ggplot_scale <- scale_fill_manual(values = my_palette)
  choro$render()
}
# Draw the maps: one call per candidate / demographic column pair.
# First the per_capita_income column for Sanders, Clinton and Trump ...
plot_map("B. Sanders", "per_capita_income")
plot_map("H. Clinton", "per_capita_income")
plot_map("D. Trump", "per_capita_income")
# ... then the percent_white and percent_hispanic columns for Trump and Cruz.
plot_map("D. Trump", "percent_white")
plot_map("D. Trump", "percent_hispanic")
plot_map("T. Cruz", "percent_hispanic")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment