Last active
November 19, 2019 21:00
-
-
Save grantmcdermott/d46be9a9288fbe1f40d5a63aead12f8a to your computer and use it in GitHub Desktop.
Brazilian crime data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Context: https://twitter.com/hsantanna/status/1196184374481506304 | |
## Thanks to Will May: https://twitter.com/williamcmay/status/1196268860418281472?s=20 | |
library(tidyverse) | |
library(rvest) | |
library(memoise) | |
## Address of the SSP statistics search page that hosts the query form
ssp_url <- "http://www.ssp.sp.gov.br/estatistica/pesquisa.aspx"

## Open a browsing session on that page and keep the second form found on it,
## which is the region/municipality form
sess <- html_session(ssp_url)
form <- html_form(sess)[[2]]

## Peek at the first few options offered by the two dropdown (POST) fields
form$fields$`ctl00$conteudo$ddlRegioes`$options %>% head()
form$fields$`ctl00$conteudo$ddlMunicipios`$options %>% head()
## Scraping function that can be used to extract crime data for different
## municipalities.
##
## Args:
##   m: Value to select in the 'ctl00$conteudo$ddlMunicipios' dropdown, i.e.
##      one element of `munis`. It is also coerced to an integer and used as a
##      position in `munis` to look up the municipality's name.
##
## Returns:
##   A data frame stacking the three year tables of the result page. Columns
##   Jan:Total are converted to double, and two columns are added: `Year` (the
##   label text shown with each table) and `Municipality` (the name in `munis`).
##
## Relies on the globals `sess`, `form` and `munis`. `munis` is only looked up
## when the function is called, so it may be defined after this function.
scrape_func <-
  function(m) {
    ## Submit the form with municipality `m` selected to get a new page
    form_m <- form %>% set_values('ctl00$conteudo$ddlMunicipios' = m)
    sess_m <- submit_form(sess, form_m)

    ## Loop over the three year tables (2017-2019) for each municipality
    tabs_m <-
      map_df(
        0:2,
        function(i) {
          ## Year label that accompanies the i-th table
          Year <-
            sess_m %>%
            html_node(paste0("#conteudo_repAnos_lbAno_", i)) %>%
            html_text()
          ## NOTE(review): html_table() may already have type-converted
          ## all-numeric columns, in which case values written with a period
          ## '000 delimiter would be read as decimals before the clean-up
          ## below sees them -- confirm against the rvest version in use.
          tab_i <-
            sess_m %>%
            html_node(paste0("#conteudo_repAnos_gridDados_", i)) %>%
            html_table() %>%
            bind_rows() %>%
            ## Fix annoying NA ellipsis
            mutate_at(vars(Jan:Total), ~na_if(., "...")) %>%
            ## Ignore period '000 delimiter
            mutate_at(vars(Jan:Total), function(x) as.double(gsub("\\.", "", x)))
          tab_i$Year <- Year
          tab_i
        }
      )

    ## NOTE(review): this treats the option value `m` as a position in `munis`;
    ## presumably values are consecutive once "todos" is dropped -- confirm.
    tabs_m$Municipality <- names(munis[as.integer(m)])

    ## Be nice to host server. The pause has to come before the value is
    ## returned: placed after return() it was unreachable and never ran.
    Sys.sleep(2)
    tabs_m
  }
## Memoised wrapper around the scraper: results that were already fetched are
## remembered, so a run that gets interrupted midway through does not have to
## scrape them again.
scrape_func_mem <- memoise(scrape_func)
## Options of the municipality dropdown, used as the input to the scraping
## function. The first entry, todos ("all"), is dropped, which per the original
## note leaves 646 - 1 municipalities.
munis <- form$fields$`ctl00$conteudo$ddlMunicipios`$options[-1]
## Usage examples, all going through the memoised scraper.
## First time: will need to scrape both tables
ssp <- map_df(munis[1:2], scrape_func_mem)
## Second time: will load memoised results instantly
ssp <- map_df(munis[1:2], scrape_func_mem)
## Will only scrape third and fourth tables that aren't already memoised
ssp <- map_df(munis[1:4], scrape_func_mem)
## To run the whole shebang and get it to print nicely, you can run
# ssp =
#   map_df(munis, scrape_func_mem) %>%
#   as_tibble() %>%
#   select(Municipality, Year, everything())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment