Created
February 9, 2022 21:42
-
-
Save gabrielacaesar/45dcb69ab28ae19281022b636d264615 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Install the packages (only needed once per machine)
# install.packages("tidyverse")
# install.packages("googledrive")
# install.packages("rvest")

### Load the packages
library(tidyverse)   # readr, dplyr, purrr, stringr, tidyr used below
library(googledrive) # Drive authentication + spreadsheet download
library(rvest)       # HTML scraping
### Folder for ABA - MENOS LIDAS (least-read tab)
# Create a folder named after today's date and make it the working
# directory: the downloaded CSV and the final .xlsx both land there.
pasta_hoje <- paste0("abas_menos_lidas_", Sys.Date())
dir.create(pasta_hoje, showWarnings = FALSE) # silent no-op if it already exists
setwd(pasta_hoje)
getwd() # sanity check: confirm we are inside the dated folder

### Download ABA - MENOS LIDAS
# Put your own e-mail below.
drive_auth(email = "gabriela.caesar@g.globo") # logs in with this account
drive_download(file = as_id("1f0A61KyZJ5-17PG2iMdsPqQrdFoxmSV7NxCaasv9NJM"), type = "csv") # downloads the tab

### Read the file
# Restrict to .csv so a rerun does not also pick up the .xlsx written
# at the end of this script (readr binds multiple CSVs row-wise).
dados <- read_csv(list.files(pattern = "\\.csv$"))
### Build the full URLs to be scraped
# Keep only the `url` column and prepend the scheme.
all_urls <- dados %>%
  transmute(url = paste0("https://", url))
### Date + time of the first URL only (one-off check of the CSS selector)
# NOTE(review): this value is overwritten by the full map_dfr() run below.
primeira_url <- all_urls$url[1]
scraped_data_hora <- primeira_url %>%
  rvest::read_html() %>%
  rvest::html_nodes("p.content-publication-data__updated") %>%
  rvest::html_text() %>%
  as.data.frame() %>%
  rename(content = ".") %>%
  mutate(content = str_trim(content),
         url = primeira_url) %>%
  separate(content, c("data", "hora"), sep = "\\s")
### Scrape publication date + time
# For the i-th URL, return a data frame with columns `data`, `hora`,
# `url` — one row per matching node, zero rows when the selector finds
# nothing (map_dfr() simply skips empty frames).
# `urls` defaults to the global vector so existing calls keep working,
# but can be passed explicitly for testing or reuse.
get_data_hora <- function(i, urls = all_urls$url) {
  url_atual <- urls[i]
  url_atual %>%
    rvest::read_html() %>%
    rvest::html_nodes("p.content-publication-data__updated") %>%
    rvest::html_text() %>%
    as.data.frame() %>%
    rename(content = ".") %>%
    mutate(content = str_trim(content),
           url = url_atual) %>%
    # first whitespace splits date from time, e.g. "09/02/2022 21h42"
    separate(content, c("data", "hora"), sep = "\\s")
}
# Scrape every URL and stack the per-URL frames row-wise.
# seq_along() (not 1:length()) is safe when the URL vector is empty.
scraped_data_hora <- map_dfr(seq_along(all_urls$url), get_data_hora)
### Scrape the author
# For the i-th URL, return a data frame with columns `autor`, `url` —
# one row per matching node, zero rows when nothing matches.
# `urls` defaults to the global vector so existing calls keep working.
get_autor <- function(i, urls = all_urls$url) {
  url_atual <- urls[i]
  url_atual %>%
    rvest::read_html() %>%
    rvest::html_nodes("p.content-publication-data__from") %>%
    rvest::html_text() %>%
    as.data.frame() %>%
    rename(autor = ".") %>%
    mutate(autor = str_trim(autor),
           url = url_atual)
}
# Scrape the author of every URL and stack the results.
# seq_along() (not 1:length()) is safe when the URL vector is empty.
scraped_autor <- map_dfr(seq_along(all_urls$url), get_autor)
### Join the scraped metadata back onto the spreadsheet rows and export
# Rebuild the full URL on `dados` so the join keys match `scraped_*`.
dados_com_url <- dados %>%
  mutate(url = paste0("https://", url))
final_scraped <- dados_com_url %>%
  left_join(scraped_data_hora, by = "url") %>%
  left_join(scraped_autor, by = "url") %>%
  mutate(data = lubridate::dmy(data)) %>%
  # NOTE(review): weekdays() output language depends on the session locale
  mutate(dia_semana = weekdays(data))
writexl::write_xlsx(final_scraped, paste0("final_scraped_", Sys.Date(), ".xlsx"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment