## klahrich/county_results.R

## county_results.R
library(rvest)
library(magrittr)
library(stringr)
library(lubridate)
library(reshape2)
library(ggplot2)
library(dplyr)
library(ggthemes)
library(choroplethr)
library(choroplethrMaps)
library(acs)
library(RColorBrewer)


### Turn one county node into a data frame of its results.
###
### county: a node holding a ".results-header div.title" element (the county
###         name) and ".results-dataset" elements (one per results section).
### Returns one row per candidate with the columns candidate, votes.pct,
### votes, party and county.name, or NULL when the node has no results table.
county_results <- function(county) {
  raw_name <- str_trim(html_text(html_node(county, ".results-header div.title")))
  datasets <- html_nodes(county, ".results-dataset")
  tables <- html_table(html_nodes(datasets, ".results-table"))

  # no results table at all: the state hasn't voted yet
  if (length(tables) == 0) {
    return(NULL)
  }

  # one heading per results section; used as the party label of table idx
  party_labels <- str_trim(html_text(html_nodes(datasets, ".results-meta h5")))

  # lower-cased county name with the first "county" removed
  county_label <- str_trim(str_replace(tolower(raw_name), "county", ""))

  cleaned <- lapply(seq_along(tables), function(idx) {
    tbl <- tables[[idx]]
    colnames(tbl) <- c("candidate", "votes.pct", "votes")
    # drop the "Winner" marker that is glued to the candidate name
    tbl$candidate <- str_trim(str_replace(tbl$candidate, "Winner", ""))
    # vote counts are written with thousands separators
    tbl$votes <- as.numeric(str_replace_all(tbl$votes, ",", ""))
    tbl$party <- party_labels[idx]
    tbl$county.name <- county_label
    tbl
  })

  do.call(rbind, cleaned)
}

### Read the saved html page of one state and collect the results of all its
### counties by calling county_results() on each county node.
###
### state_filename: name of a file inside the "states/" directory. The part
###                 before the first "." is lower-cased and used as state.name.
### Returns the stacked county_results() data frames with an extra state.name
### column, or NULL when no county of the state produced any result.
state_results <- function(state_filename) {
  html <- read_html(paste0("states/", state_filename))
  county_nodes <- html_nodes(html, "article.results-group")

  # seq_along() rather than 1:length(): a page without any county node must
  # yield zero iterations, not the indices 1 and 0.
  per_county <- lapply(seq_along(county_nodes), function(i) {
    tryCatch(
      county_results(county_nodes[[i]]),
      error = function(e) {
        # Best effort: report the county that failed and skip it, so that the
        # error object never ends up as a row of the results.
        warning("Skipping county node ", i, " of ", state_filename, ": ",
                conditionMessage(e), call. = FALSE)
        NULL
      }
    )
  })

  # counties without results (or that failed) come back as NULL
  per_county <- Filter(Negate(is.null), per_county)
  if (length(per_county) == 0) {
    return(NULL)
  }

  # bind once instead of growing the data frame inside the loop
  results <- do.call(rbind, per_county)
  results$state.name <- str_trim(tolower(str_split(state_filename, "\\.")[[1]][1]))

  results
}


### Load the county table shipped with the choroplethr packages in order to
### get the region variable needed for the map.
### A few rows (addressed by row name) are renamed so that they merge properly
### with the county names used in the results.
data(county.regions)
all.regions <- select(county.regions, county.name, state.name, region)
all.regions[c("1528", "2953", "2906", "2844", "2924"), "county.name"] <-
  c("st. louis city", "fairfax city", "richmond city", "franklin city", "roanoke city")

### Load the county demographics
demographics <- get_county_demographics()
#write.csv(demographics, "county_demographics.csv", row.names=FALSE)
#setwd("C:/Users/karim/Documents/R/usa_2016")
#demographics = read.csv("county_demographics.csv")

### Note: Kansas and Minnesota were removed from the states directory because
### their results are reported by district.
#all_results = read.csv("all_results.csv", stringsAsFactors=FALSE)
# parse every file of the states directory and stack the per-state results
all_results <- list.files("states/") %>%
  lapply(state_results) %>%
  do.call(what = rbind)
write.csv(all_results, "all_results.csv", row.names = FALSE)

### Rename a few counties so that they match the names in all.regions.
### The steps run in order, each one seeing the result of the previous one.
all_results <- mutate(
  all_results,
  county.name = ifelse(county.name == "saint francis", "st. francis", county.name),
  county.name = ifelse(state.name == "illinois" & county.name == "dewitt",
                       "de witt", county.name),
  county.name = str_replace(county.name, " parish", ""),
  # Virginia: strip the " city" suffix, except for the names listed below,
  # which are left untouched
  county.name = ifelse(
    state.name != "virginia" |
      str_detect(county.name, "charles") |
      str_detect(county.name, "james") |
      str_detect(county.name, "fairfax") |
      str_detect(county.name, "richmond") |
      str_detect(county.name, "roanoke") |
      str_detect(county.name, "franklin"),
    county.name,
    str_replace(county.name, " city", "")
  )
)

### Republicans
# only Trump and Cruz are compared
republican <- filter(all_results, candidate %in% c("D. Trump", "T. Cruz"))

# wide format: one row per county, one votes column per candidate
republican_wide <- reshape2::dcast(
  republican,
  state.name + county.name ~ candidate,
  value.var = "votes"
)

# replace the two vote counts (columns 3 and 4) by their share of the
# combined total of both candidates
republican_wide[, 3:4] <- republican_wide[, 3:4] /
  (republican_wide[[3]] + republican_wide[[4]])

# long format again: one row per county and candidate
republican_long <- reshape2::melt(
  republican_wide,
  id.vars = c("state.name", "county.name"),
  variable.name = "candidate",
  value.name = "votes.percentage"
)

### Democrats
# only Clinton and Sanders are compared
democrat <- filter(all_results, candidate %in% c("H. Clinton", "B. Sanders"))

# wide format: one row per county, one votes column per candidate
democrat_wide <- reshape2::dcast(
  democrat,
  state.name + county.name ~ candidate,
  value.var = "votes"
)

# replace the two vote counts (columns 3 and 4) by their share of the
# combined total of both candidates
democrat_wide[, 3:4] <- democrat_wide[, 3:4] /
  (democrat_wide[[3]] + democrat_wide[[4]])

# long format again: one row per county and candidate
democrat_long <- reshape2::melt(
  democrat_wide,
  id.vars = c("state.name", "county.name"),
  variable.name = "candidate",
  value.name = "votes.percentage"
)

# stack the republican and democrat shares; a share that could not be
# computed (NA) is set to 0
all_results_long <- rbind(republican_long, democrat_long)
all_results_long$votes.percentage[is.na(all_results_long$votes.percentage)] <- 0

# final data: vote shares per county, with the region and then the
# demographics attached (both joins use the columns the tables share)
all_results_long <- left_join(left_join(all_results_long, all.regions), demographics)

#write.csv(all_results_long, "all_results_long.csv", row.names=FALSE)

### Draw a county map crossing a candidate's popularity with one variable.
###
### candidate.name: a value of the candidate column of all_results_long,
###                 e.g. "B. Sanders"
### variable:       name of a numeric column of all_results_long,
###                 e.g. "per_capita_income"
###
### A county has "high popularity" when the candidate's vote share is at or
### above the median share over the candidate's counties, and is "high" on
### `variable` when its value is at or above the median of that variable.
### The four combinations (labelled a. to d.) are the values shown on the map.
### Reads the global all_results_long; returns the result of the
### CountyChoropleth render() call.
plot_map <- function(candidate.name, variable) {
  if (!variable %in% names(all_results_long)) {
    stop("Unknown variable: ", variable, call. = FALSE)
  }

  candidate <- all_results_long[all_results_long$candidate == candidate.name, ]

  # The left joins leave NA for counties without a match, so the medians have
  # to skip missing values; otherwise a single NA turns every index into NA.
  candidate$votes.index <- candidate$votes.percentage /
    median(candidate$votes.percentage, na.rm = TRUE)
  candidate$votes.indicator <- ifelse(candidate$votes.index >= 1, 1, -1)

  candidate$var.index <- candidate[[variable]] /
    median(candidate[[variable]], na.rm = TRUE)
  candidate$var.indicator <- ifelse(candidate$var.index >= 1, 2, 1)

  # sign = popularity (low/high), magnitude = variable (1 low / 2 high)
  candidate$value_tmp <- as.factor(candidate$votes.indicator * candidate$var.indicator)

  # Vectorised lookup instead of an if/else chain: a missing category simply
  # stays NA instead of raising "missing value where TRUE/FALSE needed".
  category_labels <- c(
    "-2" = paste("a. low popularity - high", variable),
    "-1" = paste("b. low popularity - low", variable),
    "1" = paste("c. high popularity - low", variable),
    "2" = paste("d. high popularity - high", variable)
  )
  candidate$value <- unname(category_labels[as.character(candidate$value_tmp)])

  # counties that did not get a region in the join cannot be placed on the map
  candidate <- candidate[!is.na(candidate$region), ]

  my_palette <- brewer.pal(4, "PRGn")

  choro <- CountyChoropleth$new(candidate)

  choro$title <- candidate.name
  choro$ggplot_scale <- scale_fill_manual(values = my_palette)

  choro$render()
}

# Draw the maps. Each call is a separate top-level statement, so the value
# returned by plot_map() is printed by R when the script is run.

# Democrats: popularity vs. per capita income
plot_map("B. Sanders", "per_capita_income")
plot_map("H. Clinton", "per_capita_income")

# Republicans: popularity vs. income and vs. the white / hispanic percentages
plot_map("D. Trump", "per_capita_income")
plot_map("D. Trump", "percent_white")
plot_map("D. Trump", "percent_hispanic")
plot_map("T. Cruz", "percent_hispanic")
	library(rvest)
	library(magrittr)
	library(stringr)
	library(lubridate)
	library(reshape2)
	library(ggplot2)
	library(dplyr)
	library(ggthemes)
	library(choroplethr)
	library(choroplethrMaps)
	library(acs)
	library(RColorBrewer)


	### Turn one county node into a data frame of its results.
	###
	### county: a node holding a ".results-header div.title" element (the county
	###         name) and ".results-dataset" elements (one per results section).
	### Returns one row per candidate with the columns candidate, votes.pct,
	### votes, party and county.name, or NULL when the node has no results table.
	county_results <- function(county) {
	  raw_name <- str_trim(html_text(html_node(county, ".results-header div.title")))
	  datasets <- html_nodes(county, ".results-dataset")
	  tables <- html_table(html_nodes(datasets, ".results-table"))

	  # no results table at all: the state hasn't voted yet
	  if (length(tables) == 0) {
	    return(NULL)
	  }

	  # one heading per results section; used as the party label of table idx
	  party_labels <- str_trim(html_text(html_nodes(datasets, ".results-meta h5")))

	  # lower-cased county name with the first "county" removed
	  county_label <- str_trim(str_replace(tolower(raw_name), "county", ""))

	  cleaned <- lapply(seq_along(tables), function(idx) {
	    tbl <- tables[[idx]]
	    colnames(tbl) <- c("candidate", "votes.pct", "votes")
	    # drop the "Winner" marker that is glued to the candidate name
	    tbl$candidate <- str_trim(str_replace(tbl$candidate, "Winner", ""))
	    # vote counts are written with thousands separators
	    tbl$votes <- as.numeric(str_replace_all(tbl$votes, ",", ""))
	    tbl$party <- party_labels[idx]
	    tbl$county.name <- county_label
	    tbl
	  })

	  do.call(rbind, cleaned)
	}

	### Read the saved html page of one state and collect the results of all its
	### counties by calling county_results() on each county node.
	###
	### state_filename: name of a file inside the "states/" directory. The part
	###                 before the first "." is lower-cased and used as state.name.
	### Returns the stacked county_results() data frames with an extra state.name
	### column, or NULL when no county of the state produced any result.
	state_results <- function(state_filename) {
	  html <- read_html(paste0("states/", state_filename))
	  county_nodes <- html_nodes(html, "article.results-group")

	  # seq_along() rather than 1:length(): a page without any county node must
	  # yield zero iterations, not the indices 1 and 0.
	  per_county <- lapply(seq_along(county_nodes), function(i) {
	    tryCatch(
	      county_results(county_nodes[[i]]),
	      error = function(e) {
	        # Best effort: report the county that failed and skip it, so that the
	        # error object never ends up as a row of the results.
	        warning("Skipping county node ", i, " of ", state_filename, ": ",
	                conditionMessage(e), call. = FALSE)
	        NULL
	      }
	    )
	  })

	  # counties without results (or that failed) come back as NULL
	  per_county <- Filter(Negate(is.null), per_county)
	  if (length(per_county) == 0) {
	    return(NULL)
	  }

	  # bind once instead of growing the data frame inside the loop
	  results <- do.call(rbind, per_county)
	  results$state.name <- str_trim(tolower(str_split(state_filename, "\\.")[[1]][1]))

	  results
	}



	### Load the county table shipped with the choroplethr packages in order to
	### get the region variable needed for the map.
	### A few rows (addressed by row name) are renamed so that they merge properly
	### with the county names used in the results.
	data(county.regions)
	all.regions <- select(county.regions, county.name, state.name, region)
	all.regions[c("1528", "2953", "2906", "2844", "2924"), "county.name"] <-
	  c("st. louis city", "fairfax city", "richmond city", "franklin city", "roanoke city")

	### Load the county demographics
	demographics <- get_county_demographics()
	#write.csv(demographics, "county_demographics.csv", row.names=FALSE)
	#setwd("C:/Users/karim/Documents/R/usa_2016")
	#demographics = read.csv("county_demographics.csv")

	### Note: Kansas and Minnesota were removed from the states directory because
	### their results are reported by district.
	#all_results = read.csv("all_results.csv", stringsAsFactors=FALSE)
	# parse every file of the states directory and stack the per-state results
	all_results <- list.files("states/") %>%
	  lapply(state_results) %>%
	  do.call(what = rbind)
	write.csv(all_results, "all_results.csv", row.names = FALSE)

	### Rename a few counties so that they match the names in all.regions.
	### The steps run in order, each one seeing the result of the previous one.
	all_results <- mutate(
	  all_results,
	  county.name = ifelse(county.name == "saint francis", "st. francis", county.name),
	  county.name = ifelse(state.name == "illinois" & county.name == "dewitt",
	                       "de witt", county.name),
	  county.name = str_replace(county.name, " parish", ""),
	  # Virginia: strip the " city" suffix, except for the names listed below,
	  # which are left untouched
	  county.name = ifelse(
	    state.name != "virginia" |
	      str_detect(county.name, "charles") |
	      str_detect(county.name, "james") |
	      str_detect(county.name, "fairfax") |
	      str_detect(county.name, "richmond") |
	      str_detect(county.name, "roanoke") |
	      str_detect(county.name, "franklin"),
	    county.name,
	    str_replace(county.name, " city", "")
	  )
	)

	### Republicans
	# only Trump and Cruz are compared
	# (the escaped "\|" that stood here is not valid R; %in% expresses the same
	# two-candidate selection)
	republican <- filter(all_results, candidate %in% c("D. Trump", "T. Cruz"))

	# wide format: one row per county, one votes column per candidate
	republican_wide <- reshape2::dcast(
	  republican,
	  state.name + county.name ~ candidate,
	  value.var = "votes"
	)

	# replace the two vote counts (columns 3 and 4) by their share of the
	# combined total of both candidates
	republican_wide[, 3:4] <- republican_wide[, 3:4] /
	  (republican_wide[[3]] + republican_wide[[4]])

	# long format again: one row per county and candidate
	republican_long <- reshape2::melt(
	  republican_wide,
	  id.vars = c("state.name", "county.name"),
	  variable.name = "candidate",
	  value.name = "votes.percentage"
	)

	### Democrats
	# only Clinton and Sanders are compared
	democrat <- filter(all_results, candidate %in% c("H. Clinton", "B. Sanders"))

	# wide format: one row per county, one votes column per candidate
	democrat_wide <- reshape2::dcast(
	  democrat,
	  state.name + county.name ~ candidate,
	  value.var = "votes"
	)

	# replace the two vote counts (columns 3 and 4) by their share of the
	# combined total of both candidates
	democrat_wide[, 3:4] <- democrat_wide[, 3:4] /
	  (democrat_wide[[3]] + democrat_wide[[4]])

	# long format again: one row per county and candidate
	democrat_long <- reshape2::melt(
	  democrat_wide,
	  id.vars = c("state.name", "county.name"),
	  variable.name = "candidate",
	  value.name = "votes.percentage"
	)

	# stack the republican and democrat shares; a share that could not be
	# computed (NA) is set to 0
	all_results_long <- rbind(republican_long, democrat_long)
	all_results_long[is.na(all_results_long$votes.percentage), "votes.percentage"] <- 0

	# final data: vote shares per county, with the region and then the
	# demographics attached (both joins use the columns the tables share)
	all_results_long <- all_results_long %>%
	  left_join(all.regions) %>%
	  left_join(demographics)

	#write.csv(all_results_long, "all_results_long.csv", row.names=FALSE)

	### Draw a county map crossing a candidate's popularity with one variable.
	###
	### candidate.name: a value of the candidate column of all_results_long,
	###                 e.g. "B. Sanders"
	### variable:       name of a numeric column of all_results_long,
	###                 e.g. "per_capita_income"
	###
	### A county has "high popularity" when the candidate's vote share is at or
	### above the median share over the candidate's counties, and is "high" on
	### `variable` when its value is at or above the median of that variable.
	### The four combinations (labelled a. to d.) are the values shown on the map.
	### Reads the global all_results_long; returns the result of the
	### CountyChoropleth render() call.
	plot_map <- function(candidate.name, variable) {
	  if (!variable %in% names(all_results_long)) {
	    stop("Unknown variable: ", variable, call. = FALSE)
	  }

	  candidate <- all_results_long[all_results_long$candidate == candidate.name, ]

	  # The left joins leave NA for counties without a match, so the medians have
	  # to skip missing values; otherwise a single NA turns every index into NA.
	  candidate$votes.index <- candidate$votes.percentage /
	    median(candidate$votes.percentage, na.rm = TRUE)
	  candidate$votes.indicator <- ifelse(candidate$votes.index >= 1, 1, -1)

	  candidate$var.index <- candidate[[variable]] /
	    median(candidate[[variable]], na.rm = TRUE)
	  candidate$var.indicator <- ifelse(candidate$var.index >= 1, 2, 1)

	  # sign = popularity (low/high), magnitude = variable (1 low / 2 high)
	  candidate$value_tmp <- as.factor(candidate$votes.indicator * candidate$var.indicator)

	  # Vectorised lookup instead of an if/else chain: a missing category simply
	  # stays NA instead of raising "missing value where TRUE/FALSE needed".
	  category_labels <- c(
	    "-2" = paste("a. low popularity - high", variable),
	    "-1" = paste("b. low popularity - low", variable),
	    "1" = paste("c. high popularity - low", variable),
	    "2" = paste("d. high popularity - high", variable)
	  )
	  candidate$value <- unname(category_labels[as.character(candidate$value_tmp)])

	  # counties that did not get a region in the join cannot be placed on the map
	  candidate <- candidate[!is.na(candidate$region), ]

	  my_palette <- brewer.pal(4, "PRGn")

	  choro <- CountyChoropleth$new(candidate)

	  choro$title <- candidate.name
	  choro$ggplot_scale <- scale_fill_manual(values = my_palette)

	  choro$render()
	}

	# Draw the maps. Each call is a separate top-level statement, so the value
	# returned by plot_map() is printed by R when the script is run.

	# Democrats: popularity vs. per capita income
	plot_map("B. Sanders", "per_capita_income")
	plot_map("H. Clinton", "per_capita_income")

	# Republicans: popularity vs. income and vs. the white / hispanic percentages
	plot_map("D. Trump", "per_capita_income")
	plot_map("D. Trump", "percent_white")
	plot_map("D. Trump", "percent_hispanic")
	plot_map("T. Cruz", "percent_hispanic")