Code to geocode courthouse list
# 1. Scrape the tables. You can do that with the Table Capture Chrome extension, or you can do it in R, as below.
# Download the entire HTML document once so I don't need to keep hammering the Wikipedia server
library(htmltab)
library(rvest)
library(purrr)
library(dplyr)
library(rgeocodio)
url <- "https://en.wikipedia.org/wiki/List_of_United_States_federal_courthouses"
download.file(url, "federalcourthouses.html")
# 2. Get all the table headlines by using SelectorGadget to find the right CSS selector, then scrape them with the rvest package. I cleaned up a few issues with the results -- Georgia had multiple headlines, Guam had two (one reading "United States territories" and a second reading "Guam"), and there were headlines for non-table sections such as references and external links.
header_css <- ".mw-headline"
my_headers <- read_html("federalcourthouses.html") %>%
  html_nodes(header_css) %>%
  html_text()
my_headers <- my_headers[1:58]   # drop trailing non-table headlines (references, external links, etc.)
my_headers[12:15] <- "Georgia"   # Georgia's multiple tables share one state headline
# Drop the two leftover duplicate headlines described above:
my_headers <- my_headers[-56]
my_headers <- my_headers[-11]
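# Quick sanity check (not part of the original workflow): confirm the cleaned-up
# header vector looks right before scraping.
length(my_headers)   # one entry per courthouse table
head(my_headers)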
# 3. I now know how many tables I want -- one per entry in my header vector. I'll create a function that uses the htmltab package's htmltab() function to scrape a table and add a State column based on that table's headline. I'm also accounting for a few situations where the city or street address is missing.
get_table <- function(tablenum, headervec = my_headers, htmlfile = "federalcourthouses.html"){
  tabledata <- htmltab(htmlfile, tablenum)
  table_cols <- names(tabledata)
  `%ni%` <- Negate(`%in%`)
  # Some tables have a Location column instead of a City column
  if ("City" %ni% table_cols){
    tabledata$City <- tabledata$Location
  }
  tabledata <- tabledata %>%
    mutate(
      State = headervec[tablenum],
      Place = paste0(`Street address`, ", ", City, ", ", State),
      # If the street address is missing ("?"), use the courthouse name instead
      Place = ifelse(`Street address` == "?", paste0(Courthouse, ", ", City, ", ", State), Place)
    )
  return(tabledata)
}
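# To spot-check the function before iterating over everything, you can run it
# on a single table and inspect the result (a quick check, not in the original workflow):
head(get_table(1))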
# 4. Now I'll iterate over the tables with purrr's map_df(), running my function on each table number and binding the results into a single data frame:
my_tables <- purrr::map_df(seq_along(my_headers), get_table)
# 5. Geocode all the Place addresses with rgeocodio's gio_batch_geocode() function:
my_geocoded_data <- rgeocodio::gio_batch_geocode(my_tables$Place)
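# The result keeps the original address in a `query` column and the geocoder
# matches in a `response_results` list column (the location.lat/location.lng
# fields used below). Peeking at one element can confirm the structure:
str(my_geocoded_data$response_results[[1]], max.level = 1)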
# I know there's an easier way to deal with list columns within a data frame (see the tidyr sketch after the extraction code below), but this worked for me:
# Write a function to extract the first latitude or longitude for a given row
get_lat_or_lng <- function(mynum, latlon = c("lat", "lng"), mydf = my_geocoded_data) {
  latlon <- match.arg(latlon)
  results <- mydf$response_results[[mynum]]
  # Take the top-ranked match for the address
  if (latlon == "lat") {
    myval <- results$location.lat[1]
  } else {
    myval <- results$location.lng[1]
  }
  # Addresses that failed to geocode have no results; return NA for those
  if (is.null(myval)) {
    myval <- NA_real_
  }
  return(myval)
}
# Get vectors for address, lon, and lat:
my_geocoded_addresses <- my_geocoded_data$query
my_lons <- purrr::map_dbl(seq_len(nrow(my_geocoded_data)), get_lat_or_lng, latlon = "lng")
my_lats <- purrr::map_dbl(seq_len(nrow(my_geocoded_data)), get_lat_or_lng, latlon = "lat")
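# A minimal sketch of the tidier list-column approach alluded to above, using
# tidyr::unnest() (assumes tidyr >= 1.0). It relies on each response_results
# element being a data frame with location.lat/location.lng columns, as used in
# get_lat_or_lng(), and keeps the top-ranked match per address -- an untested
# alternative, not the workflow actually run here:
library(tidyr)
my_geocoded_df_alt <- my_geocoded_data %>%
  select(Place = query, response_results) %>%
  unnest(response_results, keep_empty = TRUE) %>%  # failed geocodes become NA rows
  group_by(Place) %>%
  slice(1) %>%   # first row = top-ranked match, mirroring the [1] indexing above
  ungroup() %>%
  select(Place, lon = location.lng, lat = location.lat)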
# Create a new data frame with address, lon, and lat
my_geocoded_df <- data.frame(Place = my_geocoded_addresses, lon = my_lons, lat = my_lats, stringsAsFactors = FALSE)
# Join the geocoded coordinates back to the original my_tables data frame:
my_results <- left_join(my_tables, my_geocoded_df, by = "Place")
# Export results
rio::export(my_results, file = "courthouses.xlsx")