Last active
April 15, 2016 23:29
-
-
Save klahrich/9cddac8e84ca0a2780c37b821f3336b6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest) | |
library(magrittr) | |
library(stringr) | |
library(lubridate) | |
library(reshape2) | |
library(ggplot2) | |
library(dplyr) | |
library(ggthemes) | |
library(choroplethr) | |
library(choroplethrMaps) | |
library(acs) | |
library(RColorBrewer) | |
### Build the results dataframe for one county node.
###
### county: an html node holding one county's results.
### Returns a dataframe with one row per candidate and the columns
### candidate, votes.pct, votes, party and county.name (all parties stacked),
### or NULL when the node contains no results table.
county_results <- function(county) {
  county_name <- str_trim(html_text(html_node(county, ".results-header div.title")))
  datasets <- html_nodes(county, ".results-dataset")
  tables <- html_table(html_nodes(datasets, ".results-table"))

  # no results table at all: the state hasnt voted yet
  if (length(tables) == 0) {
    return(NULL)
  }

  # one heading per results table, used as the party label
  parties <- datasets %>%
    html_nodes(".results-meta h5") %>%
    html_text() %>%
    str_trim()

  # lower-cased county name with the first "county" removed
  county_label <- county_name %>%
    tolower() %>%
    str_replace("county", "") %>%
    str_trim()

  # tidy every table; `tables[] <-` keeps the list's names and attributes
  tables[] <- lapply(seq_along(tables), function(idx) {
    tbl <- tables[[idx]]
    colnames(tbl) <- c("candidate", "votes.pct", "votes")
    # drop the "Winner" marker that is embedded in the candidate cell
    tbl$candidate <- str_trim(str_replace(tbl$candidate, "Winner", ""))
    # "12,345" -> 12345
    tbl$votes <- as.numeric(str_replace_all(tbl$votes, ",", ""))
    tbl$party <- parties[idx]
    tbl$county.name <- county_label
    tbl
  })

  do.call(rbind, tables)
}
### Read the html code for 1 state and stack the results of all its counties,
### calling the county_results function for each county node.
###
### state_filename: file name inside `states_dir`. The text before the first
###                 "." (trimmed, lower-cased) becomes the state.name column.
### states_dir:     directory holding the saved state pages (default "states").
### Returns a dataframe of county results with an added state.name column,
### or NULL when the page yields no county results at all.
state_results <- function(state_filename, states_dir = "states") {
  html <- read_html(file.path(states_dir, state_filename))
  county_nodes <- html %>% html_nodes("article.results-group")

  # seq_along() is safe for an empty nodeset (1:length() would yield 1, 0)
  per_county <- lapply(seq_along(county_nodes), function(i) {
    tryCatch(
      county_results(county_nodes[[i]]),
      error = function(e) {
        # A county that fails to parse is reported and skipped, so that no
        # error object ends up being row-bound into the results.
        warning(
          "Skipping county #", i, " of ", state_filename, ": ",
          conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
  })

  # NULL entries (counties without results) are dropped by rbind
  results <- do.call(rbind, per_county)
  if (is.null(results)) {
    # nothing to label; NULL is ignored by a later do.call(rbind, ...)
    return(NULL)
  }

  results$state.name <- str_trim(tolower(str_split(state_filename, "\\.")[[1]][1]))
  results
}
### Read county data from the choroplethr package in order to get the region
### variable for the map.
data(county.regions)
all.regions <- select(county.regions, county.name, state.name, region)

### Fix a few county names to be able to merge properly.
### The rows are addressed by their row names in county.regions.
all.regions[c("1528", "2953", "2906", "2844", "2924"), "county.name"] <- c(
  "st. louis city",
  "fairfax city",
  "richmond city",
  "franklin city",
  "roanoke city"
)
### Read demographics data
demographics <- get_county_demographics()
# Optional local cache of the demographics (kept disabled):
#write.csv(demographics, "county_demographics.csv", row.names=FALSE)
#setwd("C:/Users/karim/Documents/R/usa_2016")
#demographics = read.csv("county_demographics.csv")

### note: Kansas and Minnesota were removed from the states directory because
### their results are by district
# Optional reload of previously parsed results (kept disabled):
#all_results = read.csv("all_results.csv", stringsAsFactors=FALSE)

# Parse every file in states/ and stack the per-state results
all_results <- list.files("states/") %>%
  lapply(state_results) %>%
  do.call(rbind, .)

# Keep a copy of the parsed results on disk
write.csv(all_results, "all_results.csv", row.names = FALSE)
### Fix a few county names. This is a bit ugly...
### (presumably so that they match the names in all.regions for the join
### performed later -- TODO confirm)
### The steps run in order; each one sees the result of the previous one.
all_results <- all_results %>%
  mutate(
    county.name = ifelse(county.name == "saint francis", "st. francis", county.name),
    county.name = ifelse(
      state.name == "illinois" & county.name == "dewitt",
      "de witt",
      county.name
    ),
    # remove the " parish" suffix
    county.name = str_replace(county.name, " parish", ""),
    # Virginia: remove " city", except for names containing one of the
    # listed strings, which keep it
    county.name = ifelse(
      state.name == "virginia" &
        !str_detect(county.name, "charles|james|fairfax|richmond|roanoke|franklin"),
      str_replace(county.name, " city", ""),
      county.name
    )
  )
### Republicans
# only the two candidates being compared: Trump and Cruz
republican <- filter(all_results, candidate %in% c("D. Trump", "T. Cruz"))

# wide format: one row per county, one votes column per candidate
republican_wide <- reshape2::dcast(
  republican,
  state.name + county.name ~ candidate,
  value.var = "votes"
)

# columns 3 and 4 hold the two vote counts; replace each by its share of their
# combined total (a fraction, despite the "percentage" name used below)
republican_wide[, 3:4] <- republican_wide[, 3:4] /
  (republican_wide[[3]] + republican_wide[[4]])

# long format again: one row per county and candidate
republican_long <- reshape2::melt(
  republican_wide,
  id.vars = c("state.name", "county.name"),
  variable.name = "candidate",
  value.name = "votes.percentage"
)
### Democrats
# only the two candidates being compared: Clinton and Sanders
democrat <- filter(all_results, candidate %in% c("H. Clinton", "B. Sanders"))

# wide format: one row per county, one votes column per candidate
democrat_wide <- reshape2::dcast(
  democrat,
  state.name + county.name ~ candidate,
  value.var = "votes"
)

# columns 3 and 4 hold the two vote counts; replace each by its share of their
# combined total (a fraction, despite the "percentage" name used below)
democrat_wide[, 3:4] <- democrat_wide[, 3:4] /
  (democrat_wide[[3]] + democrat_wide[[4]])

# long format again: one row per county and candidate
democrat_long <- reshape2::melt(
  democrat_wide,
  id.vars = c("state.name", "county.name"),
  variable.name = "candidate",
  value.name = "votes.percentage"
)
# stack the republican and democrat shares into one table
all_results_long <- rbind(republican_long, democrat_long)

# a share that could not be computed (NA) is replaced by 0
all_results_long$votes.percentage[is.na(all_results_long$votes.percentage)] <- 0

# final data: vote shares plus region and demographics per county
# (both joins use the columns the two tables have in common)
all_results_long <- left_join(
  left_join(all_results_long, all.regions),
  demographics
)
#write.csv(all_results_long, "all_results_long.csv", row.names=FALSE)
#candidate.name = "B. Sanders"
### Draw a county map that contrasts a candidate's popularity with a
### demographic variable.
###
### Every county is put in one of four classes, depending on whether the
### candidate's vote share and the chosen variable are at/above ("high") or
### below ("low") their medians, both taken over the candidate's own rows.
###
### candidate.name: value of the `candidate` column to plot, e.g. "B. Sanders".
### variable:       name of a numeric column of `results`,
###                 e.g. "per_capita_income".
### results:        table with the columns candidate, votes.percentage, region
###                 and `variable`; defaults to the global all_results_long.
### Returns the value of choro$render(). Stops with an error when the
### variable or the candidate is not found in `results`.
plot_map <- function(candidate.name, variable, results = all_results_long) {
  if (!(variable %in% colnames(results))) {
    stop("Unknown variable: ", variable, call. = FALSE)
  }

  # which() drops rows whose candidate is NA instead of keeping all-NA rows
  candidate <- results[which(results$candidate == candidate.name), , drop = FALSE]
  if (nrow(candidate) == 0) {
    stop("No results found for candidate: ", candidate.name, call. = FALSE)
  }

  # na.rm = TRUE: one county without a value would otherwise turn the median,
  # and with it the class of every county, into NA
  candidate$votes.index <- candidate$votes.percentage /
    median(candidate$votes.percentage, na.rm = TRUE)
  candidate$var.index <- candidate[[variable]] /
    median(candidate[[variable]], na.rm = TRUE)

  high_votes <- candidate$votes.index >= 1
  high_var <- candidate$var.index >= 1

  # counties that cannot be classified or have no region code cannot be drawn
  usable <- !is.na(high_votes) & !is.na(high_var) & !is.na(candidate$region)
  if (!all(usable)) {
    warning(
      sum(!usable), " county row(s) dropped for ", candidate.name,
      ": missing region, vote share or ", variable,
      call. = FALSE
    )
    candidate <- candidate[usable, , drop = FALSE]
    high_votes <- high_votes[usable]
    high_var <- high_var[usable]
  }

  # the a-d prefixes fix the ordering of the classes
  class_labels <- paste(
    c(
      "a. low popularity - high",
      "b. low popularity - low",
      "c. high popularity - low",
      "d. high popularity - high"
    ),
    variable
  )
  class_index <- ifelse(
    high_votes,
    ifelse(high_var, 4L, 3L),
    ifelse(high_var, 1L, 2L)
  )
  candidate$value <- class_labels[class_index]

  # naming the colours keeps each class on the same colour even when one of
  # the four classes is absent from the data
  my_palette <- setNames(brewer.pal(4, "PRGn"), class_labels)

  choro <- CountyChoropleth$new(candidate)
  choro$title <- candidate.name
  choro$ggplot_scale <- scale_fill_manual(values = my_palette)
  choro$render()
}
# Maps produced: popularity versus income for three candidates, then
# popularity versus the white / hispanic population share.
# Kept as separate top-level calls so that each map is printed.
plot_map(candidate.name = "B. Sanders", variable = "per_capita_income")
plot_map(candidate.name = "H. Clinton", variable = "per_capita_income")
plot_map(candidate.name = "D. Trump", variable = "per_capita_income")
plot_map(candidate.name = "D. Trump", variable = "percent_white")
plot_map(candidate.name = "D. Trump", variable = "percent_hispanic")
plot_map(candidate.name = "T. Cruz", variable = "percent_hispanic")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment