Skip to content

Instantly share code, notes, and snippets.

@grantmcdermott
Last active November 19, 2019 21:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save grantmcdermott/d46be9a9288fbe1f40d5a63aead12f8a to your computer and use it in GitHub Desktop.
Save grantmcdermott/d46be9a9288fbe1f40d5a63aead12f8a to your computer and use it in GitHub Desktop.
Brazilian crime data
## Context: https://twitter.com/hsantanna/status/1196184374481506304
## Thanks to Will May: https://twitter.com/williamcmay/status/1196268860418281472?s=20
library(tidyverse)
library(rvest)
library(memoise)
## Address of the SSP-SP statistics query page that hosts the search form
ssp_url = 'http://www.ssp.sp.gov.br/estatistica/pesquisa.aspx'
## Open a browsing session on that page and keep the second form found on it
## (the region/municipality form)
sess = html_session(ssp_url)
form = sess %>% html_form() %>% .[[2]]
## Peek at the first few options offered by the two form (POST) drop-downs
form$fields$`ctl00$conteudo$ddlRegioes`$options %>% head()
form$fields$`ctl00$conteudo$ddlMunicipios`$options %>% head()
## Scraping function that can be used to extract crime data for different
## municipalities.
##
## Argument:
##   m  Value to set on the `ctl00$conteudo$ddlMunicipios` form field, i.e. one
##      element of `munis`.
## Returns:
##   A data frame stacking the three yearly tables found on the result page,
##   with a `Year` column (label read from the page) and a `Municipality`
##   column added.
##
## Relies on the globals `sess`, `form` and `munis` defined elsewhere in this
## script. Pauses for two seconds after each real scrape to be nice to the host
## server.
scrape_func =
  function(m) {
    ## Submit the form to get a new page
    form_m = form %>% set_values('ctl00$conteudo$ddlMunicipios' = m)
    sess_m = submit_form(sess, form_m)
    ## Loop over the three year tables (2017-2019 at the time of writing) for
    ## each municipality
    tabs_m =
      map_df(
        0:2,
        function(i) {
          Year =
            sess_m %>%
            html_node(paste0("#conteudo_repAnos_lbAno_", i)) %>%
            html_text()
          tab_i =
            sess_m %>%
            html_node(paste0("#conteudo_repAnos_gridDados_", i)) %>%
            html_table() %>%
            bind_rows() %>%
            ## Fix annoying NA ellipsis
            mutate_at(vars(Jan:Total), ~na_if(., "...")) %>%
            ## Drop the period used as a thousands delimiter before converting
            mutate_at(vars(Jan:Total), function(x) as.double(gsub("\\.", "", x)))
          tab_i$Year = Year
          tab_i
        }
      )
    ## Look up the municipality name by position in `munis`.
    ## NOTE(review): assumes the form values are the integers 1, 2, ... in the
    ## same order as `munis` -- confirm against the form options.
    tabs_m$Municipality = names(munis[as.integer(m)])
    ## Be nice to host server. This has to happen *before* the result is
    ## returned; a statement placed after return() is never executed.
    Sys.sleep(2)
    tabs_m
  }
## Wrap the scraper with memoise so that results for inputs we have already
## requested are cached and reused. Handy if a long run gets interrupted
## midway through.
scrape_func_mem = memoise(f = scrape_func)
## Vector of all municipality options offered by the form; this is the input
## to our scraping function. The leading todos ("all") entry is dropped, which
## leaves 646 - 1 municipalities.
munis = form$fields[['ctl00$conteudo$ddlMunicipios']]$options[-1]
## Usage examples:
## First time: will need to scrape both tables
ssp = munis[1:2] %>% map_df(scrape_func_mem)
## Second time: will load the memoised results instantly
ssp = munis[1:2] %>% map_df(scrape_func_mem)
## Will only scrape the third and fourth tables, which aren't memoised yet
ssp = munis[1:4] %>% map_df(scrape_func_mem)
## To run the whole shebang and get it to print nicely, you can run
# ssp =
#   munis %>%
#   map_df(scrape_func_mem) %>%
#   as_tibble() %>%
#   select(Municipality, Year, everything())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment