Skip to content

Instantly share code, notes, and snippets.

@sergiospagnuolo
Last active February 17, 2020 13:31
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sergiospagnuolo/d28593be4772731ff2b8b45ba651aba5 to your computer and use it in GitHub Desktop.
Save sergiospagnuolo/d28593be4772731ff2b8b45ba651aba5 to your computer and use it in GitHub Desktop.
Códigos para extrair dados da API do Portal da Transparência e para raspar descrições do site (que não constam na API)
library(rvest)
# Document ID suffixes, kept as zero-padded 5-character strings so the leading
# zeros survive. Each one is appended to the payment URL prefix by the scraping
# loop that follows.
# NOTE(review): some IDs appear more than once (e.g. '01417', '01611') --
# confirm whether the repeats are intentional.
ids_documentos <- list('01413' , '01463' , '02652' , '02769' , '01397' , '01449' , '01400' , '01452' , '02757' , '01417' , '01417' , '01450' , '01406' , '01437' , '01407' , '01436' , '01410' , '01440' , '01377' , '01464' , '01394' , '01454' , '01382' , '00010' , '01383' , '00011' , '01414' , '01446' , '01372' , '01476' , '00009' , '01385' , '00012' , '01379' , '01466' , '01392' , '01611' , '00015' , '01398' , '01456' , '01408' , '01457' , '01374' , '01477' , '01418' , '01415' , '01447' , '01401' , '01453' , '01404' , '01443' , '01395' , '01455' , '01384' , '01469' , '01389' , '00014' , '01391' , '01473' , '01386' , '01470' , '01390' , '01472' , '01396' , '01481' , '00017' , '01373' , '01475' , '01371' , '01474' , '01403' , '01444' , '01375' , '01478' , '01416' , '01448' , '02886' , '00006' , '01459' , '01387' , '00013' , '01393' , '00016' , '01399' , '01451' , '01409' , '01441' , '02758' , '00007' , '01460' , '01411' , '01439' , '01402' , '01445' , '01380' , '01467' , '01370' , '01438' , '01610' , '01378' , '01465' , '01412' , '01462' , '01388' , '01471' , '01381' , '01468' , '00008' , '01461' , '01376' , '01479' , '00005' , '01458' , '01405' , '01442' , '00072' , '01632' , '01988' , '01817' , '01636' , '02000' , '01639' , '02003' , '02001' , '01637' , '01645' , '02009' , '01646' , '01648' , '01654' , '01969' , '01979' , '01623' , '01992' , '01614' , '01615' , '01993' , '02158' , '01633' , '02775' , '01989' , '01650' , '01964' , '01616' , '01994' , '01971' , '01656' , '01997' , '01611' , '01625' , '01981' , '01844' , '01982' , '01626' , '01651' , '01966' , '01634' , '01990' , '02004' , '01640' , '01643' , '02007' , '01980' , '01624' , '01613' , '01974' , '01996' , '01618' , '01978' , '01622' , '01975' , '01619' , '01621' , '01977' , '01999' , '01609' , '01649' , '02011' , '01659' , '02010' , '01642' , '02006' , '01652' , '01967' , '01991' , '01635' , '01984' , '01628' , '01617' , '01995' , '01608' , '01998' , '02002' , '01638' , '01647' , '01629' , '01985' , 
'01612' , '02005' , '01641' , '01657' , '01972' , '01944' , '01655' , '01970' , '01631' , '01987' , '01976' , '01620' , '01658' , '01973' , '01986' , '01630' , '01653' , '01968' , '01983' , '01627' , '02008' , '01644')
# Scrape the description text of every payment document listed in
# `ids_documentos` from the Portal da Transparência website and collect the
# results in a single-column data frame called `descricoes`.
total <- list()
# Results are stored under the ID itself (`total[[a]]`), so a repeated ID would
# only overwrite its own entry; iterating over the unique IDs yields the same
# list without requesting the same page twice.
for (a in unique(ids_documentos)) {
  # Build the page URL for this document ID
  url <- paste0(
    "http://www.portaltransparencia.gov.br/despesas/pagamento/160313000012017OB8",
    a,
    "?ordenarPor=fase&direcao=desc"
  )
  # Download and parse the page
  pagina <- read_html(url)
  # Extract the text of the <span> nodes matched by the XPath below
  descricoes_pagina <- pagina %>%
    html_nodes(xpath = "//div[2]/section[1]/div[3]/div/span") %>%
    html_text()
  # Store this page's text under the document ID
  total[[a]] <- descricoes_pagina
}
# Flatten everything into a single-column data frame and rename the column.
# The original renamed a column on `documentos`, an object that is never
# created, so the script stopped here with "object 'documentos' not found".
descricoes <- as.data.frame(unlist(total))
colnames(descricoes)[1] <- "descricao"
library(jsonlite)
library(tidyverse)
# Endpoints holding the data for the travel allowances ("diárias") under
# investigation; the three URLs differ in their `codigo` and `_` query values.
# NOTE(review): the second `codigo` starts with 167313 while the others start
# with 160313 -- confirm this is intended and not a typo.
urls_diarias <- c(
  "http://www.portaltransparencia.gov.br/despesas/documento/documentos-relacionados/resultado?paginacaoSimples=false&tamanhoPagina=5000&offset=0&direcaoOrdenacao=desc&colunaOrdenacao=fase&colunasSelecionadas=data%2Cfase%2CdocumentoResumido%2Cespecie&fase=Empenho&codigo=160313000012017NE000132&_=1556307080580",
  "http://www.portaltransparencia.gov.br/despesas/documento/documentos-relacionados/resultado?paginacaoSimples=false&tamanhoPagina=5000&offset=0&direcaoOrdenacao=desc&colunaOrdenacao=fase&colunasSelecionadas=data%2Cfase%2CdocumentoResumido%2Cespecie&fase=Empenho&codigo=167313000012017NE000004&_=1556307132286",
  "http://www.portaltransparencia.gov.br/despesas/documento/documentos-relacionados/resultado?paginacaoSimples=false&tamanhoPagina=5000&offset=0&direcaoOrdenacao=desc&colunaOrdenacao=fase&colunasSelecionadas=data%2Cfase%2CdocumentoResumido%2Cespecie&fase=Empenho&codigo=160313000012017NE000088&_=1556307179549"
)
# Fetch each JSON response and keep only its "data" element
tabelas_diarias <- lapply(
  urls_diarias,
  function(endereco) jsonlite::fromJSON(endereco)[["data"]]
)
# Stack the three tables into one
diarias <- bind_rows(tabelas_diarias)
# Add a column stating what these rows are about
diarias["tipo"] <- "diarias"
write.csv(diarias, file = "diarias.csv")
# Domestic travel allowances ("diárias no país"): fetch the JSON response,
# keep its "data" element, tag the rows and save them as CSV.
url_diarias_npais <- "http://www.portaltransparencia.gov.br/despesas/documento/documentos-relacionados/resultado?paginacaoSimples=false&tamanhoPagina=5000&offset=0&direcaoOrdenacao=desc&colunaOrdenacao=fase&colunasSelecionadas=data%2Cfase%2CdocumentoResumido%2Cespecie&fase=Empenho&codigo=160313000012017NE000133&_=1556562119599"
diarias_npais <- jsonlite::fromJSON(url_diarias_npais)[["data"]]
# Same label as the other extraction
diarias_npais["tipo"] <- "diarias"
write.csv(diarias_npais, file = "diarias_npais.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment