Skip to content

Instantly share code, notes, and snippets.

@jjesusfilho
Last active March 23, 2019 02:00
Show Gist options
  • Save jjesusfilho/8c4235a51c7b9d66d877c54fd8a0454f to your computer and use it in GitHub Desktop.
Save jjesusfilho/8c4235a51c7b9d66d877c54fd8a0454f to your computer and use it in GitHub Desktop.
Brazilian culture ministry data - Dados do ministério da cultura - SalicNet
#' Fetch proponent registration data from SalicNet
#'
#' Submits one form request per document number to the SalicNet
#' "conDadosCadastraisProponente" page and scrapes the identification,
#' address and phone tables out of the returned HTML.
#'
#' @param cnpj Character vector of CNPJ/CPF numbers. Every non-digit
#'   character (dots, slashes, dashes, spaces) is stripped before querying.
#' @return A tibble with the columns `cnpj`, `nome`, `responsavel`,
#'   `endereco`, `uf`, `cep`, `email`, `residencial`, `comercial`, `celular`
#'   and `fax`, with one row per document that was fetched and parsed.
#'   Documents whose request or parsing fails are skipped, and a warning
#'   naming the document and the underlying error is raised for each.
salicnet <- function(cnpj) {
  url <- "http://sistemas.cultura.gov.br/salicnet/conDadosCadastraisProponente/conDadosCadastraisProponente.php"

  # Keep only the digits of each document number. A separate name is used so
  # the argument is not shadowed by the value scraped from the page later on.
  documentos <- stringr::str_remove_all(cnpj, "\\D+")

  # One pre-built form parameter string per document.
  nmgp_parms <- paste0(
    "cgccpf?#?", documentos,
    "?@?NM_btn_insert?#?S?@?NM_btn_update?#?S?@?NM_btn_delete?#?S?@?NM_btn_navega?#?S?@?"
  )

  h <- curl::new_handle()
  # The original passed `.list = (charset = "iso-8859-1")`, which is a
  # parenthesised assignment yielding an unnamed string, so the header went
  # out with an empty name. Send a properly named header instead.
  curl::handle_setheaders(h, "Accept-Charset" = "iso-8859-1")

  # Text of every row after the first of the table with the given id, with
  # whitespace (including line breaks) collapsed to single spaces.
  linhas_do_bloco <- function(pagina, id_tabela) {
    nos <- rvest::html_nodes(
      pagina,
      xpath = sprintf('//table[@id="%s"]/tr[position() > 1]', id_tabela)
    )
    stringr::str_squish(rvest::html_text(nos))
  }

  # One slot per requested document; slots of failed documents stay NULL and
  # are dropped by bind_rows() at the end.
  lista <- vector("list", length(cnpj))

  for (i in seq_along(lista)) {
    # Pause two seconds before every request (presumably to avoid hammering
    # the server).
    Sys.sleep(2)

    resultado <- tryCatch({
      curl::handle_setform(
        h,
        nmgp_parms = nmgp_parms[i],
        nmgp_url_saida = "/salicnet/ctrDadosCadastraisProponente/ctrDadosCadastraisProponente.php",
        script_case_init = "1"
      )
      req <- curl::curl_fetch_memory(url, handle = h)

      # The response body is decoded as ISO-8859-1 and re-encoded to UTF-8
      # before being parsed.
      html <- iconv(rawToChar(req$content), from = "ISO-8859-1", to = "UTF-8")
      conteudo <- xml2::read_html(html)

      identificacao <- linhas_do_bloco(conteudo, "hidden_bloco_0_1")
      logradouro <- linhas_do_bloco(conteudo, "hidden_bloco_1_1")
      telefones <- linhas_do_bloco(conteudo, "hidden_bloco_2_1")

      # `[[` errors when an expected row is missing, which routes pages
      # without the expected layout to the error handler below.
      tibble::tibble(
        cnpj = stringr::str_extract(identificacao[[1]], "(?<=CPF\\s).*?(?=\\s)"),
        nome = stringr::str_extract(identificacao[[1]], "(?<=Nome\\s).*"),
        responsavel = stringr::str_extract(identificacao[[2]], "(?<=Respons\u00e1vel.).*"),
        endereco = stringr::str_extract(logradouro[[1]], "(?<=Logradouro ).*"),
        uf = stringr::str_extract(logradouro[[2]], "(?<=Proponente ).*?(?= CEP)"),
        cep = stringr::str_extract(logradouro[[2]], "(?<=CEP ).*"),
        email = stringr::str_extract(logradouro[[3]], "(?<=Email ).*"),
        residencial = stringr::str_extract(telefones[[1]], "(?<=Residencial ).*?(?=Comercial)"),
        comercial = stringr::str_extract(telefones[[1]], "(?<=Comercial ).*?(?= Celular)"),
        celular = stringr::str_extract(telefones[[1]], "(?<=Celular ).*?(?= Fax)"),
        fax = stringr::str_extract(telefones[[1]], "(?<=Fax ).*")
      )
    }, error = function(e) {
      # Still best-effort (one bad document must not abort the batch), but
      # report the failure instead of discarding it silently.
      warning(
        "salicnet: skipping document ", documentos[i], ": ",
        conditionMessage(e),
        call. = FALSE
      )
      NULL
    })

    # Assigning NULL with `[[<-` would delete the slot and shift the
    # remaining indices, so only store successful results.
    if (!is.null(resultado)) {
      lista[[i]] <- resultado
    }
  }

  dplyr::bind_rows(lista)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment