Skip to content

Instantly share code, notes, and snippets.

@jjesusfilho
Last active June 23, 2017 23:06
Show Gist options
  • Save jjesusfilho/cb4a5ec91d6c7dbc23ca095cd054b8c5 to your computer and use it in GitHub Desktop.
Save jjesusfilho/cb4a5ec91d6c7dbc23ca095cd054b8c5 to your computer and use it in GitHub Desktop.
scraper do Tribunal de Justiça do Rio Grande do Sul
library(httr)
library(xml2)
library(stringr)
library(boilerpipeR)
#' Scrape decision metadata from the TJRS case-law search.
#'
#' Sends `BuscaLivre` to http://www.tjrs.jus.br/busca/search, reads the total
#' number of hits from the first result page and then requests the result
#' pages in steps of 10 (`start` offset), collecting one row per decision.
#' A page that cannot be downloaded or parsed is skipped with a warning
#' instead of aborting the whole run.
#'
#' @param BuscaLivre Free-text search expression (character scalar).
#' @param quote If `TRUE` (default) the expression is passed through
#'   `deparse()`, which wraps it in double quotes before it is sent.
#' @return A data.frame with the columns `processo`, `orgao.julgador`,
#'   `relator`, `classe.processual`, `classe.material`, `secao`,
#'   `comarca.origem`, `ementa`, `pagina` (the `start` offset of the page the
#'   row came from) and `linkHtml`. An empty `data.frame()` is returned when
#'   no rows could be collected.
tjrsSG_meta <- function(BuscaLivre = "", quote = TRUE) {
  base_url <- "http://www.tjrs.jus.br/busca/"

  # Text of every node matching `xpath`, trimmed, with the first match of
  # ".*: " removed (i.e. a leading "Label: " prefix is dropped).
  campo <- function(page, xpath) {
    xml_find_all(page, xpath) %>%
      xml_text(trim = TRUE) %>%
      str_replace(".*(:\\s)", "")
  }

  # Turn one parsed result page into a data.frame; NULL when the page lists
  # no decisions (e.g. an offset past the last hit).
  parse_page <- function(page, offset) {
    processo <- xml_find_all(page, "//*[@class='featured font-size-12']") %>%
      xml_text()
    if (length(processo) == 0L) {
      return(NULL)
    }
    linkHtml <- xml_find_all(
      page, "//*[@class='larguraPrimColuna']//a[2]/@href"
    ) %>%
      xml_text() %>%
      str_c(base_url, .) %>%
      str_replace_all("\\s+", "%20")
    tabela <- "//*[@id='table_resultado']"
    data.frame(
      processo = processo,
      orgao.julgador = campo(page, "//*[@class='larguraUltColuna']"),
      relator = campo(page, paste0(tabela, "//tr[5]/td[1]")),
      classe.processual = campo(page, paste0(tabela, "//tr[2]/td[1]")),
      classe.material = campo(page, paste0(tabela, "//tr[4]/td[2]")),
      secao = campo(page, paste0(tabela, "//tr[3]/td[2]")),
      comarca.origem = campo(page, paste0(tabela, "//tr[2]/td[2]")),
      ementa = campo(page, "//*[@class='ementa']"),
      pagina = offset,
      linkHtml = linkHtml,
      stringsAsFactors = FALSE
    )
  }

  if (isTRUE(as.logical(quote))) {
    BuscaLivre <- deparse(BuscaLivre)
  }

  # The search expression goes into both `q` and `as_qj`; naming the fields
  # avoids depending on their position in the list.
  query <- list(
    q = BuscaLivre, proxystylesheet = "tjrs_index", getfields = "*",
    entsp = "a__politica-site", wc = "200", wc_mc = "1", oe = "UTF-8",
    ie = "UTF-8", ud = "1", sort = "date:D:S:d1", as_qj = BuscaLivre,
    as_epq = "", as_oq = "", as_eq = "", as_q = "", ulang = "en",
    ip = "", access = "p", entqr = "3", entqrm = "0", client = "tjrs_index",
    filter = "0", start = "0", aba = "juris", site = "juris"
  )

  first_page <- GET("http://www.tjrs.jus.br/busca/search?", query = query) %>%
    content("parsed")

  total <- first_page %>%
    xml_find_all("//*[@class='clearfix left']/div/*[@class='bold'][3]") %>%
    xml_text() %>%
    as.numeric()
  if (length(total) != 1L || is.na(total)) {
    warning(
      "tjrsSG_meta: could not read the number of results; ",
      "returning an empty data.frame",
      call. = FALSE
    )
    return(data.frame())
  }

  # Href of the first pagination link; it is used as a template whose
  # `start=` value is rewritten for every page.
  hrefs <- first_page %>%
    xml_find_all("//*[@class='pagination-control']/a[1]/@href") %>%
    xml_text()
  page_href <- hrefs[1]

  offsets <- seq(0, total, by = 10)
  if (is.na(page_href)) {
    # No pagination link: presumably all hits fit on the page already
    # fetched, so parse that one instead of requesting ".../busca/NA".
    if (total > 10) {
      warning(
        "tjrsSG_meta: pagination link not found; ",
        "only the first page will be returned",
        call. = FALSE
      )
    }
    offsets <- 0
  }

  pages <- lapply(offsets, function(offset) {
    tryCatch(
      {
        page <- if (is.na(page_href)) {
          first_page
        } else {
          # format() keeps large offsets out of scientific notation
          # (as.character(1e5) would give "1e+05").
          page_url <- str_replace(
            paste0(base_url, page_href),
            "(?<=start\\=)\\d+",
            format(offset, scientific = FALSE, trim = TRUE)
          )
          GET(page_url) %>%
            content("parsed")
        }
        parse_page(page, offset)
      },
      error = function(e) {
        warning(
          "tjrsSG_meta: skipping page at start=", offset, ": ",
          conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
  })

  pages <- Filter(Negate(is.null), pages)
  if (length(pages) == 0L) {
    return(data.frame())
  }
  # Bind once at the end instead of growing a data.frame inside the loop.
  do.call(rbind, pages)
}
# Download every decision linked in `dfrs$linkHtml` and extract its main text
# with boilerpipeR's DefaultExtractor(); the result is a list with one element
# per link.
# NOTE(review): `dfrs` is not defined in this file - presumably it is the
# data.frame returned by tjrsSG_meta(); confirm before running.
# lapply() is used instead of map(): purrr is never attached by this script,
# so map() would fail with "could not find function".
inteiroRS <- lapply(dfrs$linkHtml, function(link) {
  GET(link) %>%
    content("text") %>%
    DefaultExtractor()
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment