Last active
March 23, 2019 02:00
-
-
Save jjesusfilho/8c4235a51c7b9d66d877c54fd8a0454f to your computer and use it in GitHub Desktop.
Brazilian Ministry of Culture data - Dados do ministério da cultura - SalicNet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Fetch proponent registration data from SalicNet
#'
#' For each CNPJ/CPF supplied, submits the SalicNet "conDadosCadastraisProponente"
#' form and scrapes the identification, address and telephone tables out of the
#' returned HTML page.
#'
#' @param cnpj Character vector of CNPJ/CPF numbers. Every non-digit character
#'   is stripped before the request parameters are built.
#' @return A tibble with one row per successfully scraped proponent and the
#'   columns `cnpj`, `nome`, `responsavel`, `endereco`, `uf`, `cep`, `email`,
#'   `residencial`, `comercial`, `celular` and `fax`. Inputs whose request or
#'   parsing fails are left out of the result and reported through a warning.
salicnet <- function(cnpj) {
  url <- "http://sistemas.cultura.gov.br/salicnet/conDadosCadastraisProponente/conDadosCadastraisProponente.php"

  # Kept under a separate name so the `cnpj` argument is never overwritten by
  # the value scraped from the page.
  digitos <- stringr::str_remove_all(cnpj, "\\D+")
  nmgp_parms <- paste0("cgccpf?#?", digitos, "?@?NM_btn_insert?#?S?@?NM_btn_update?#?S?@?NM_btn_delete?#?S?@?NM_btn_navega?#?S?@?")

  h <- curl::new_handle()
  # The header needs an explicit name: the previous form,
  # `.list = (charset = "iso-8859-1")`, evaluated the parenthesised assignment
  # and passed only the bare string, so no valid header name was attached.
  # NOTE(review): "Accept-Charset" is assumed to be the intended header.
  curl::handle_setheaders(h, "Accept-Charset" = "iso-8859-1")

  # Text of every row after the first one in the table with the given id,
  # whitespace-squished, one list element per row. Squishing already collapses
  # any CR/LF, so no further splitting of the row text is needed.
  linhas_da_tabela <- function(doc, id) {
    xpath <- paste0('//table[@id="', id, '"]/tr[position() > 1]')
    nos <- rvest::html_nodes(doc, xpath = xpath)
    as.list(stringr::str_squish(rvest::html_text(nos)))
  }

  # Submits the form for one parameter string and returns a one-row tibble.
  # Signals an error (e.g. subscript out of bounds) when an expected table row
  # is absent from the response.
  raspar_um <- function(parms) {
    curl::handle_setform(
      h,
      nmgp_parms = parms,
      nmgp_url_saida = "/salicnet/ctrDadosCadastraisProponente/ctrDadosCadastraisProponente.php",
      script_case_init = "1"
    )
    req <- curl::curl_fetch_memory(url, handle = h)

    # The response body is re-encoded from ISO-8859-1 to UTF-8 before parsing.
    html <- iconv(rawToChar(req$content), from = "ISO-8859-1", to = "UTF-8")
    conteudo <- xml2::read_html(html)

    identificacao <- linhas_da_tabela(conteudo, "hidden_bloco_0_1")
    logradouro <- linhas_da_tabela(conteudo, "hidden_bloco_1_1")
    telefones <- linhas_da_tabela(conteudo, "hidden_bloco_2_1")

    tibble::tibble(
      cnpj = stringr::str_extract(identificacao[[1]], "(?<=CPF\\s).*?(?=\\s)"),
      nome = stringr::str_extract(identificacao[[1]], "(?<=Nome\\s).*"),
      responsavel = stringr::str_extract(identificacao[[2]], "(?<=Respons\u00e1vel.).*"),
      endereco = stringr::str_extract(logradouro[[1]], "(?<=Logradouro ).*"),
      uf = stringr::str_extract(logradouro[[2]], "(?<=Proponente ).*?(?= CEP)"),
      cep = stringr::str_extract(logradouro[[2]], "(?<=CEP ).*"),
      email = stringr::str_extract(logradouro[[3]], "(?<=Email ).*"),
      residencial = stringr::str_extract(telefones[[1]], "(?<=Residencial ).*?(?=Comercial)"),
      comercial = stringr::str_extract(telefones[[1]], "(?<=Comercial ).*?(?= Celular)"),
      celular = stringr::str_extract(telefones[[1]], "(?<=Celular ).*?(?= Fax)"),
      fax = stringr::str_extract(telefones[[1]], "(?<=Fax ).*")
    )
  }

  # One slot per requested CNPJ; slots of failed requests stay NULL and are
  # dropped by bind_rows().
  lista <- vector("list", length(cnpj))
  for (i in seq_along(lista)) {
    # Fixed pause before every request.
    Sys.sleep(2)
    resultado <- tryCatch(
      raspar_um(nmgp_parms[i]),
      error = function(e) {
        warning(
          "salicnet: could not retrieve data for '", digitos[i], "': ",
          conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
    # Assigning NULL with `[[<-` would delete the slot, so only store successes.
    if (!is.null(resultado)) {
      lista[[i]] <- resultado
    }
  }

  dplyr::bind_rows(lista)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment