Skip to content

Instantly share code, notes, and snippets.

@AugustoCL
Last active July 18, 2021 04:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AugustoCL/af32c55c15d0113f0451f3d79895c5ee to your computer and use it in GitHub Desktop.
Save AugustoCL/af32c55c15d0113f0451f3d79895c5ee to your computer and use it in GitHub Desktop.
Trabalho final de Limpeza de Dados realizado para conclusão do curso de Faxina de Dados.
# Imports -----------------------------------------------------------------
library(tidyverse)
library(httr)
library(rvest)
library(janitor)
# Função webscraping ------------------------------------------------------
scrapy_consumo_eletr_ano <- function(ano){
# requisicao POST
aaaa <- ano
url <- "http://dadosenergeticos.energia.sp.gov.br/PortalCEv2/Municipios/Eletricidade/m_eletricidade.asp?ano={aaaa}"
req <- httr::POST(url = glue::glue(url),
body = list(Ano2 = aaaa),
encode = "form")
# converte o binario do html para string
raw_html <- rawToChar(req$content) %>%
str_replace_all(., "Ã\u008d", "Í") %>%
str_replace_all(., "ÇÃ", "ÇÃ") %>%
str_replace_all(., "Ã", "Ã") %>%
str_replace_all(., "É", "É") %>%
str_replace_all(., "çã", "çã")
# usando regex, extrai o bloco de código equivalente a tabela do html
text_inside_html_table <-
gsub('^.*<table\\s*|\\s* <\\/table>*$', '', raw_html) %>%
paste0("<table ", .)
# converte o html da tabela em dataframe
dirty_tibble <- text_inside_html_table %>%
read_html() %>%
html_table() %>%
pluck() %>%
as.data.frame() %>%
as_tibble()
}
# Função de Limpeza da Base -----------------------------------------------
clean_table <- function(tbl){
tbl %>%
slice(-1) %>%
janitor::row_to_names(row_number = 1) %>%
janitor::clean_names() %>%
rename(total_mwh = total_m_wh) %>%
separate(col = "municipios",
sep = "População",
into = c("municipio", "populacao")) %>%
separate(col = "municipio",
sep = " - ",
into = c("ranking", "municipio")) %>%
mutate(
across(ranking:populacao,
~str_squish(.)),
across(c(ranking, populacao:total_mwh),
~as.numeric(str_replace_all(., ",", "")))
)
}
# Aplicando as funções com o purrr ---------------------------------------------------
all_tbl <-
list("anos" = c(2006:2019)) %>%
as_tibble() %>%
mutate(
raw_tbl = purrr::map(anos,
scrapy_consumo_eletr_ano),
clean_but_not_tidy_tbl = purrr::map(raw_tbl,
clean_table),
clean_and_tidy_tbl = purrr::map(clean_but_not_tidy_tbl,
~unnest(.) %>%
pivot_longer(populacao:total_mwh,
names_to = "tipo_consumo",
values_to = "MWh")
)
)
# Visualizando os passos e o resultado final --------------------------------------
# todo o passo-a-passo em tabelas list-columns
all_tbl
# dados brutos
raw_tbl <-
all_tbl %>%
select(anos, raw_tbl) %>%
unnest(raw_tbl)
raw_tbl
# dados limpos, mas não tidy
clean_but_not_tidy_tbl <-
all_tbl %>%
select(anos, clean_but_not_tidy_tbl) %>%
unnest(clean_but_not_tidy_tbl)
clean_but_not_tidy_tbl
# dados limpos e tidy
clean_and_tidy_tbl <-
all_tbl %>%
select(anos, clean_and_tidy_tbl) %>%
unnest(clean_and_tidy_tbl)
clean_and_tidy_tbl
write_rds("clean_and_tidy_tbl", clean_and_tidy_tbl)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment