Skip to content

Instantly share code, notes, and snippets.

@DATAUNIRIO
Last active April 6, 2021 15:12
Show Gist options
  • Save DATAUNIRIO/90fc5142378190a94fada6c98de33c13 to your computer and use it in GitHub Desktop.
Save DATAUNIRIO/90fc5142378190a94fada6c98de33c13 to your computer and use it in GitHub Desktop.
# Collect the course-page links from the two listing pages of the BSI site.
library(robotstxt)
paths_allowed("https://bsi.uniriotec.br/obrigatorias/")
# robots.txt permits scraping this path

library(rvest)

# Mandatory-courses listing: take the href of every <a> element and keep
# positions 65 to 99 of that list.
# NOTE(review): presumably those positions are the course links; the range
# depends on the page layout at the time of writing -- confirm before reuse.
page <- read_html("https://bsi.uniriotec.br/obrigatorias/")
obrigatorias <- html_attr(html_nodes(page, "a"), "href")[65:99]

# Elective-courses listing: same extraction, keeping positions 65 to 89.
page <- read_html("https://bsi.uniriotec.br/optativas/")
optativas <- html_attr(html_nodes(page, "a"), "href")[65:89]

# Both link sets in a single vector, mandatory links first.
todas <- c(obrigatorias, optativas)
library(rvest)

# For a single page: download it and pull the text of its <h1>, <h4> and <p>
# elements.
link <- "https://bsi.uniriotec.br/bancos-de-dados-i-tin0120/"
pagina <- read_html(link)
s1 <- html_text(html_nodes(pagina, "h1"))
s4 <- html_text(html_nodes(pagina, "h4"))
p <- html_text(html_nodes(pagina, "p"))

# Headings first, then paragraphs; keep the first 12 entries only (positions
# past the end of the vector come back as NA).
dados <- c(s1, s4, p)[1:12]

library(wordcloud)

# Draw on a black background, then plot two word clouds of the same text:
# one capped at 100 words, one that includes words occurring only once.
par(bg = "black")
wordcloud(
  dados,
  max.words = 100,
  colors = c("white", "#eaef88", "#e1e85a", "#e1e85a")
)
wordcloud(
  dados,
  min.freq = 1,
  colors = c("white", "#eaef88", "#e1e85a", "#e1e85a")
)
# For all pages: download every course page listed in `todas` (mandatory plus
# elective links), collect the text of its <h1>, <h4> and <p> elements, and
# save everything to "texto_bsi.txt", one text fragment per line.
#
# Each page's text is stored in its own list slot and flattened once at the
# end. Assigning a multi-element result into a single position (`x[i] <- ...`)
# would keep only the first match, and would fail when a page has no match.
N <- length(todas)
textos <- vector("list", N)

for (i in seq_len(N)) {
  # A page that cannot be fetched or parsed is reported and skipped, so one
  # broken link does not discard the pages already downloaded.
  aaa <- tryCatch(
    read_html(todas[i]),
    error = function(e) {
      warning(
        "Failed to read ", todas[i], ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  if (!is.null(aaa)) {
    textos[[i]] <- c(
      aaa %>% html_nodes("h1") %>% html_text(),
      aaa %>% html_nodes("h4") %>% html_text(),
      aaa %>% html_nodes("p") %>% html_text()
    )
  }

  # Pause between requests to avoid hammering the server.
  Sys.sleep(2)
  cat("\r", i, "de ", N)
}

# as.character() keeps the result a character vector even if nothing was read.
dados_todas <- as.character(unlist(textos, use.names = FALSE))

# Write the raw bytes through a binary connection so the text is stored
# without line-ending or encoding translation. charToRaw() is used because
# writeBin() on a character string would append a trailing NUL byte.
zz <- file("texto_bsi.txt", "wb")
tryCatch(
  writeBin(charToRaw(paste(dados_todas, collapse = "\n")), zz),
  finally = close(zz)
)
# Regex helper reference (RVerbalExpressions):
# https://rverbalexpressions.netlify.app/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment