Last active
July 18, 2021 04:28
-
-
Save AugustoCL/af32c55c15d0113f0451f3d79895c5ee to your computer and use it in GitHub Desktop.
Trabalho final de Limpeza de Dados realizado para conclusão do curso de Faxina de Dados.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Imports ----------------------------------------------------------------- | |
library(tidyverse) | |
library(httr) | |
library(rvest) | |
library(janitor) | |
# Função webscraping ------------------------------------------------------ | |
scrapy_consumo_eletr_ano <- function(ano){ | |
# requisicao POST | |
aaaa <- ano | |
url <- "http://dadosenergeticos.energia.sp.gov.br/PortalCEv2/Municipios/Eletricidade/m_eletricidade.asp?ano={aaaa}" | |
req <- httr::POST(url = glue::glue(url), | |
body = list(Ano2 = aaaa), | |
encode = "form") | |
# converte o binario do html para string | |
raw_html <- rawToChar(req$content) %>% | |
str_replace_all(., "Ã\u008d", "Í") %>% | |
str_replace_all(., "ÇÃ", "ÇÃ") %>% | |
str_replace_all(., "Ã", "Ã") %>% | |
str_replace_all(., "É", "É") %>% | |
str_replace_all(., "çã", "çã") | |
# usando regex, extrai o bloco de código equivalente a tabela do html | |
text_inside_html_table <- | |
gsub('^.*<table\\s*|\\s* <\\/table>*$', '', raw_html) %>% | |
paste0("<table ", .) | |
# converte o html da tabela em dataframe | |
dirty_tibble <- text_inside_html_table %>% | |
read_html() %>% | |
html_table() %>% | |
pluck() %>% | |
as.data.frame() %>% | |
as_tibble() | |
} | |
# Função de Limpeza da Base ----------------------------------------------- | |
clean_table <- function(tbl){ | |
tbl %>% | |
slice(-1) %>% | |
janitor::row_to_names(row_number = 1) %>% | |
janitor::clean_names() %>% | |
rename(total_mwh = total_m_wh) %>% | |
separate(col = "municipios", | |
sep = "População", | |
into = c("municipio", "populacao")) %>% | |
separate(col = "municipio", | |
sep = " - ", | |
into = c("ranking", "municipio")) %>% | |
mutate( | |
across(ranking:populacao, | |
~str_squish(.)), | |
across(c(ranking, populacao:total_mwh), | |
~as.numeric(str_replace_all(., ",", ""))) | |
) | |
} | |
# Aplicando as funções com o purrr --------------------------------------------------- | |
all_tbl <- | |
list("anos" = c(2006:2019)) %>% | |
as_tibble() %>% | |
mutate( | |
raw_tbl = purrr::map(anos, | |
scrapy_consumo_eletr_ano), | |
clean_but_not_tidy_tbl = purrr::map(raw_tbl, | |
clean_table), | |
clean_and_tidy_tbl = purrr::map(clean_but_not_tidy_tbl, | |
~unnest(.) %>% | |
pivot_longer(populacao:total_mwh, | |
names_to = "tipo_consumo", | |
values_to = "MWh") | |
) | |
) | |
# Visualizando os passos e o resultado final -------------------------------------- | |
# todo o passo-a-passo em tabelas list-columns | |
all_tbl | |
# dados brutos | |
raw_tbl <- | |
all_tbl %>% | |
select(anos, raw_tbl) %>% | |
unnest(raw_tbl) | |
raw_tbl | |
# dados limpos, mas não tidy | |
clean_but_not_tidy_tbl <- | |
all_tbl %>% | |
select(anos, clean_but_not_tidy_tbl) %>% | |
unnest(clean_but_not_tidy_tbl) | |
clean_but_not_tidy_tbl | |
# dados limpos e tidy | |
clean_and_tidy_tbl <- | |
all_tbl %>% | |
select(anos, clean_and_tidy_tbl) %>% | |
unnest(clean_and_tidy_tbl) | |
clean_and_tidy_tbl | |
write_rds("clean_and_tidy_tbl", clean_and_tidy_tbl) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment