### installing the packages (if needed)
# install.packages("tidyverse")
# install.packages("googledrive")
# install.packages("rvest")
# install.packages("writexl") # needed for the Excel export at the end
### loading the packages
library(tidyverse)
library(googledrive)
library(rvest)
### folder for the "ABA - MENOS LIDAS" sheet tab
dir.create(paste0("abas_menos_lidas_", Sys.Date())) # creates a folder named with today's date
setwd(paste0("abas_menos_lidas_", Sys.Date())) # sets the new folder as the working directory
getwd() # confirms you are in the new working directory
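### a re-run-safe alternative to the two lines above, as a sketch (the "pasta"
### variable name is mine, not from the gist): create the folder only when it
### does not exist yet, so a second run does not raise a warning
# pasta <- paste0("abas_menos_lidas_", Sys.Date())
# if (!dir.exists(pasta)) dir.create(pasta)
# setwd(pasta)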
### downloading the "ABA - MENOS LIDAS" sheet tab
# enter your e-mail below
drive_auth(email = "gabriela.caesar@g.globo") # logs in with your e-mail
drive_download(file = as_id("1f0A61KyZJ5-17PG2iMdsPqQrdFoxmSV7NxCaasv9NJM"), type = "csv") # downloads the tab as a CSV
### reading the file
dados <- read_csv(list.files())
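### a sketch: list.files() above returns every file in the folder, so read_csv()
### could receive more than one path on a re-run; filtering by the .csv extension
### (base R's pattern argument) narrows it to the Drive export just downloaded
# dados <- read_csv(list.files(pattern = "\\.csv$")[1])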
### builds the full URLs for scraping
all_urls <- dados %>%
  select(url) %>%
  mutate(url = paste0("https://", url))
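### a sketch, not in the original gist: dropping duplicate URLs avoids repeated
### requests and duplicated rows in the left_join() calls further down
all_urls <- all_urls %>%
  distinct(url) # dplyr::distinct() keeps one row per url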
### scrapes date + time from the first URL (a single-URL test)
scraped_data_hora <- all_urls$url[1] %>%
  rvest::read_html() %>%
  rvest::html_nodes("p.content-publication-data__updated") %>%
  rvest::html_text() %>%
  as.data.frame() %>%
  rename(content = ".") %>%
  mutate(content = str_trim(content),
         url = all_urls$url[1]) %>%
  separate(content, c("data", "hora"), sep = "\\s")
### function that scrapes date + time for each URL
get_data_hora <- function(i){
  all_urls$url[i] %>%
    rvest::read_html() %>%
    rvest::html_nodes("p.content-publication-data__updated") %>%
    rvest::html_text() %>%
    as.data.frame() %>%
    rename(content = ".") %>%
    mutate(content = str_trim(content),
           url = all_urls$url[i]) %>%
    separate(content, c("data", "hora"), sep = "\\s")
}
scraped_data_hora <- map_dfr(seq_along(all_urls$url), get_data_hora)
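### a fault-tolerance sketch with purrr::possibly() (the "_safe" name is mine,
### not from the gist): a URL that fails to load returns an empty tibble
### instead of aborting the whole map_dfr() run
get_data_hora_safe <- possibly(get_data_hora, otherwise = tibble())
# scraped_data_hora <- map_dfr(seq_along(all_urls$url), get_data_hora_safe)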
### function that scrapes the author for each URL
get_autor <- function(i) {
  all_urls$url[i] %>%
    rvest::read_html() %>%
    rvest::html_nodes("p.content-publication-data__from") %>%
    rvest::html_text() %>%
    as.data.frame() %>%
    rename(autor = ".") %>%
    mutate(autor = str_trim(autor),
           url = all_urls$url[i])
}
scraped_autor <- map_dfr(seq_along(all_urls$url), get_autor)
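### a polite-scraping sketch (the function name and the half-second pause are
### mine, not from the gist): waiting between requests eases the load on the site
get_autor_com_pausa <- function(i) {
  Sys.sleep(0.5) # pause before each request
  get_autor(i)
}
# scraped_autor <- map_dfr(seq_along(all_urls$url), get_autor_com_pausa)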
### joins the scraped data onto the original sheet and exports to Excel
final_scraped <- dados %>%
  mutate(url = paste0("https://", url)) %>%
  left_join(scraped_data_hora, by = "url") %>%
  left_join(scraped_autor, by = "url") %>%
  mutate(data = lubridate::dmy(data),
         dia_semana = weekdays(data))
writexl::write_xlsx(final_scraped, paste0("final_scraped_", Sys.Date(), ".xlsx"))
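### a sanity-check sketch, not in the original gist: left_join() multiplies rows
### when a URL matches more than one scraped node, so any url listed by this
### count appears more than once in the final spreadsheet
final_scraped %>%
  count(url) %>%
  filter(n > 1)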