Last active
June 23, 2017 23:06
-
-
Save jjesusfilho/cb4a5ec91d6c7dbc23ca095cd054b8c5 to your computer and use it in GitHub Desktop.
scraper do Tribunal de Justiça do Rio Grande do Sul
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(httr)
library(xml2)
library(stringr)
library(boilerpipeR)
#' Extract one text field from a parsed TJRS results page.
#'
#' Finds every node matching `xpath`, takes its trimmed text and removes the
#' leading portion matched by the pattern `.*(:\s)` (i.e. a "Label: " prefix).
#'
#' @param doc A parsed document as returned by `httr::content(x, "parsed")`.
#' @param xpath XPath expression selecting the nodes to read.
#' @return A character vector with one element per matched node.
.tjrs_field <- function(doc, xpath) {
  doc %>%
    xml_find_all(xpath) %>%
    xml_text(trim = TRUE) %>%
    str_replace(".*(:\\s)", "")
}

#' Turn one parsed results page into a data frame of decision metadata.
#'
#' @param doc A parsed results page.
#' @param start The `start` offset used to request the page; stored in the
#'   `pagina` column.
#' @return A data frame with one row per decision listed on the page.
.tjrs_parse_page <- function(doc, start) {
  table_cell <- function(row, col) {
    sprintf("//*[@id='table_resultado']//tr[%d]/td[%d]", row, col)
  }

  link_html <- doc %>%
    xml_find_all("//*[@class='larguraPrimColuna']//a[2]/@href") %>%
    xml_text() %>%
    str_c("http://www.tjrs.jus.br/busca/", .) %>%
    # The hrefs contain raw whitespace; encode it so the URL is usable.
    str_replace_all("\\s+", "%20")

  data.frame(
    processo = xml_text(
      xml_find_all(doc, "//*[@class='featured font-size-12']")
    ),
    orgao.julgador = .tjrs_field(doc, "//*[@class='larguraUltColuna']"),
    relator = .tjrs_field(doc, table_cell(5, 1)),
    classe.processual = .tjrs_field(doc, table_cell(2, 1)),
    classe.material = .tjrs_field(doc, table_cell(4, 2)),
    secao = .tjrs_field(doc, table_cell(3, 2)),
    comarca.origem = .tjrs_field(doc, table_cell(2, 2)),
    ementa = .tjrs_field(doc, "//*[@class='ementa']"),
    pagina = start,
    linkHtml = link_html,
    stringsAsFactors = FALSE
  )
}

#' Scrape decision metadata from the TJRS case-law search.
#'
#' Sends a free-text query to `http://www.tjrs.jus.br/busca/search`, reads the
#' total number of hits from the first response and then requests the result
#' pages in steps of 10, collecting the fields of every listed decision.
#'
#' @param BuscaLivre Free-text search expression.
#' @param quote If `TRUE`, the expression is wrapped in double quotes (via
#'   `deparse()`) before being sent.
#' @return A data frame with the columns `processo`, `orgao.julgador`,
#'   `relator`, `classe.processual`, `classe.material`, `secao`,
#'   `comarca.origem`, `ementa`, `pagina` (the `start` offset of the page the
#'   row came from) and `linkHtml`. An empty `data.frame()` is returned when
#'   nothing could be scraped. Pages that fail to download or parse are
#'   skipped with a warning.
tjrsSG_meta <- function(BuscaLivre = "", quote = TRUE) {
  if (isTRUE(quote)) {
    BuscaLivre <- deparse(BuscaLivre)
  }

  base_url <- "http://www.tjrs.jus.br/busca/"
  # The search expression is sent both as `q` and as `as_qj`; they are set by
  # name here rather than by position in the list.
  query <- list(
    q = BuscaLivre, proxystylesheet = "tjrs_index", getfields = "*",
    entsp = "a__politica-site", wc = "200", wc_mc = "1", oe = "UTF-8",
    ie = "UTF-8", ud = "1", sort = "date:D:S:d1", as_qj = BuscaLivre,
    as_epq = "", as_oq = "", as_eq = "", as_q = "", ulang = "en",
    ip = "", access = "p", entqr = "3", entqrm = "0", client = "tjrs_index",
    filter = "0", start = "0", aba = "juris", site = "juris"
  )

  response <- GET(paste0(base_url, "search?"), query = query)
  first_page <- content(response, "parsed")

  # Total number of hits. Non-digit characters are dropped before parsing so a
  # thousands separator cannot be misread as a decimal mark.
  total <- first_page %>%
    xml_find_all("//*[@class='clearfix left']/div/*[@class='bold'][3]") %>%
    xml_text() %>%
    str_replace_all("\\D", "") %>%
    as.numeric()
  total <- total[1]
  if (is.na(total) || total < 1) {
    return(data.frame())
  }

  # Template URL for the result pages: the first pagination link. When the
  # page has no pagination control, fall back to the URL of the request just
  # made, which already carries `start=0`.
  # NOTE(review): assumes a result set without pagination links fits on the
  # page addressed by the original query - confirm against the site.
  page_href <- first_page %>%
    xml_find_all("//*[@class='pagination-control']/a[1]/@href") %>%
    xml_text()
  page_url <- if (length(page_href) == 0 || is.na(page_href[1])) {
    response$url
  } else {
    paste0(base_url, page_href[1])
  }

  # Offsets 0, 10, ... below `total`; stopping at `total - 1` avoids asking
  # for an empty trailing page when `total` is a multiple of 10.
  starts <- seq(0, total - 1, by = 10)

  pages <- lapply(starts, function(start) {
    tryCatch({
      # sprintf() keeps large offsets in plain digits; as.character() would
      # render 100000 as "1e+05" and corrupt the URL.
      target <- str_replace(
        page_url, "(?<=start\\=)\\d+", sprintf("%.0f", start)
      )
      .tjrs_parse_page(content(GET(target), "parsed"), start)
    }, error = function(e) {
      warning(
        "Skipping results page start=", sprintf("%.0f", start), ": ",
        conditionMessage(e),
        call. = FALSE
      )
      NULL
    })
  })

  pages <- Filter(Negate(is.null), pages)
  if (length(pages) == 0) {
    return(data.frame())
  }
  do.call(rbind, pages)
}
# Download the full text of every decision: fetch each link, read the response
# body as text and run boilerpipeR's DefaultExtractor() on it. The result is a
# list with one element per link.
# NOTE(review): `dfrs` is not defined in this file; presumably it holds the
# data frame returned by tjrsSG_meta() - confirm before running.
# lapply() is used because no loaded package provides map().
inteiroRS <- lapply(dfrs$linkHtml, function(link) {
  GET(link) %>%
    content("text") %>%
    DefaultExtractor()
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment