# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @jtrecenti
# Last active August 29, 2015 13:56
# Show Gist options
#   • Save jtrecenti/9136801 to your computer and use it in GitHub Desktop.
# Save jtrecenti/9136801 to your computer and use it in GitHub Desktop.
require(stringr)
#' Download one day's PDF (cdCaderno=14) from the TJSP electronic gazette.
#'
#' Runs `wget` against the `downloadCaderno.do` endpoint for the given date and
#' stores the response as `pdf_do/YYYY_MM_DD.pdf`, relative to the working
#' directory. The `pdf_do` folder is not created here. Nothing is requested
#' when the destination file already exists.
#'
#' @param yyyy Year (integer-valued number, formatted with `%d`).
#' @param mm Month number (zero-padded to two digits).
#' @param dd Day of the month (zero-padded to two digits).
#' @return The value returned by `system()` when a download was attempted,
#'   `NULL` (invisibly) when the file was already present.
pega_do <- function(yyyy, mm, dd) {
  destino <- sprintf('pdf_do/%d_%02d_%02d.pdf', yyyy, mm, dd)
  if (!file.exists(destino)) {
    # The URL carries the date as dd/mm/yyyy; the file name uses yyyy_mm_dd.
    wget <- sprintf(
      'wget --no-check-certificate \'https://www.dje.tjsp.jus.br/cdje/downloadCaderno.do?dtDiario=%02d/%02d/%d&cdCaderno=14\' -O %s',
      dd, mm, yyyy, destino
    )
    # TRUE/FALSE instead of T/F: the short forms are ordinary variables that
    # can be reassigned.
    system(wget, ignore.stdout = TRUE, ignore.stderr = TRUE, wait = TRUE)
  }
}
#' Convert a downloaded PDF to text with the external `pdf2txt` tool.
#'
#' The output path is the input path with every occurrence of "pdf" replaced
#' by "txt", so `pdf_do/2009_10_09.pdf` becomes `txt_do/2009_10_09.txt`.
#' The conversion runs only when the PDF is larger than 20000 bytes and the
#' text file does not exist yet.
#'
#' @param arq Path to a single PDF file.
#' @param verbose If `TRUE`, print `arq` before converting.
#' @return `arq` when a conversion was launched, `NULL` (invisibly) when the
#'   file was skipped (missing, too small, or already converted).
pdf2txt_do <- function(arq, verbose = FALSE) {
  # The substitution deliberately hits the directory name as well as the
  # extension (pdf_do -> txt_do).
  destino <- gsub('pdf', 'txt', arq)
  tamanho <- file.info(arq)$size
  # isTRUE() maps the NA size of a non-existent file to "skip" instead of
  # letting `if` fail on a missing value; && is the scalar operator for `if`.
  if (isTRUE(tamanho > 20000) && !file.exists(destino)) {
    if (verbose) print(arq)
    # shQuote() keeps paths with spaces or shell metacharacters intact.
    pdf2txt <- sprintf('pdf2txt -W 1000 %s > %s', shQuote(arq), shQuote(destino))
    system(pdf2txt, wait = TRUE)
    return(arq)
  }
  invisible(NULL)
}
#' Extract the distinct case numbers found in a converted gazette text file.
#'
#' Reads the whole file, removes spaces, newlines and a set of recurring
#' strings, strips the numbers that follow form feeds, and then collects
#' every substring matching one of three case-number patterns.
#'
#' @param arq Path to a text file (as produced by `pdf2txt_do`).
#' @return A sorted character vector of unique case numbers (possibly empty).
pega_proc_do <- function(arq) {
  txt <- readChar(arq, nchars = file.info(arq)$size)
  # Every space and newline is dropped; all patterns below are therefore
  # written without spaces.
  txt <- gsub(' |\n', '', txt)
  # Recurring strings removed before matching (they look like page
  # header/footer text -- NOTE(review): confirm against a real file).
  txt <- gsub('PublicaçãoOficialdoTribunaldeJustiçadoEstadodeSãoPaulo-LeiFederalnº11.419/06,art.4º', '', txt)
  txt <- gsub('Disponibilização:[a-zçA-Z]+-feira,[0-9]{1,2}de[A-Za-zç]+de20[0-9]{2}', '', txt)
  txt <- gsub('DiáriodaJustiçaEletrônico-CadernoEditaiseLeilões', '', txt, fixed = TRUE)
  txt <- gsub('SãoPaulo,Ano[VI]+-Edição[0-9]{1,4}', '', txt)

  # Count form feeds that have at least 10 characters on each side, then
  # remove the first "\f<number>" for each number 2, 3, ..., paginas + 1.
  paginas <- stringr::str_count(txt, '.{10}\f.{10}')
  # seq_len() gives an empty loop when paginas is 0; 2:(paginas + 1) would
  # count down through 2 and 1 instead.
  for (i in seq_len(paginas) + 1) {
    txt <- stringr::str_replace(txt, paste0('\f', i), '')
  }

  # NNNNNNN-DD.20YY.8.26.OOOO -- both dots around "8" and "26" are literal.
  cnj <- '[0-9]{7}-[0-9]{2}\\.20[0-9]{2}\\.8\\.26\\.[0-9]{4}'
  # The four-digit field is taken to be a year starting with 19 or 20; a
  # character class such as [2019]{2} would also accept 00, 11, 99, ...
  antigo_prodesp <- '[0-9]{3}\\.[0-9]{2}\\.(19|20)[0-9]{2}\\.[0-9]{6}(-|/)[0-9]{1}((/|-)[0-9]{6}-[0-9]{3})?'
  # The trailing \\)? is an optional literal ")" pairing with the optional
  # literal "(" inside the suffix group; left unescaped it is an unbalanced
  # parenthesis, which the ICU regex engine rejects.
  antigo_saj <- '[0-9]{3}\\.[901]{1}[0-9]{1}\\.[0-9]{6}(-|/)[0-9]{1}((/|\\(|-)[0-9]{1,5}(/[0-9]{2})?)?\\)?'

  padrao <- paste(cnj, antigo_prodesp, antigo_saj, sep = '|')
  processos <- stringr::str_extract_all(txt, padrao)[[1]]
  sort(unique(processos))
}
### FOLDERS
# dir.create() works on every platform and stays quiet when the folder
# already exists.
dir.create('pdf_do', showWarnings = FALSE)
dir.create('txt_do', showWarnings = FALSE)
### TEST
# Single-day smoke test: download, convert, extract (each step is timed).
system.time(pega_do(2009, 10, 9))
system.time(pdf2txt_do('pdf_do/2009_10_09.pdf'))
system.time(p <- pega_proc_do('txt_do/2009_10_09.txt'))
### NOT RUN
# Bulk run over 2007-2014. Guarded with if (FALSE) so that sourcing this file
# does not start the full download; change the condition to TRUE to run it.
if (FALSE) {
  # Iterate over real calendar days only (no 31 February and the like).
  datas <- seq(as.Date('2007-01-01'), as.Date('2014-12-31'), by = 'day')
  for (i in seq_along(datas)) {
    dia <- as.POSIXlt(datas[i])
    pega_do(dia$year + 1900L, dia$mon + 1L, dia$mday)
  }
  # full.names = TRUE: pdf2txt_do needs a path that resolves from the working
  # directory, not a bare file name.
  arqs_do <- as.character(unlist(lapply(list.files('pdf_do', full.names = TRUE), pdf2txt_do)))
  processos <- lapply(list.files('txt_do', full.names = TRUE), pega_proc_do)
}
### END NOT RUN
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment