Created
August 18, 2022 20:25
-
-
Save lgelape/253d4037d04370511f9b48a4f9736cd8 to your computer and use it in GitHub Desktop.
Obtém handle de redes sociais de candidatos (Twitter, Facebook, Instagram e Youtube) a partir de informações do Repositório de Dados Eleitorais do TSE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################################ | |
################################################################################ | |
########## | |
########## OBTER HANDLE DE REDES SOCIAIS DE CANDIDATOS | |
########## A PARTIR DE INFORMACOES DO REPOSITORIO DO TSE | |
### Lucas Gelape | |
# Pacotes | |
library(dplyr) | |
library(readr) | |
library(stringr) | |
library(stringi) | |
################################################################################ | |
##### DOWNLOAD AND OPEN THE 2022 DATA
# Fetch the TSE archive with the candidates' social-media links and save it
# in the working directory.
download.file(
  url = "https://cdn.tse.jus.br/estatistica/sead/odsele/consulta_cand/rede_social_candidato_2022.zip",
  destfile = "redesocial2022.zip")

# Extract only the CSV named below; the other archive members are ignored.
unzip(
  zipfile = "redesocial2022.zip",
  files = "rede_social_candidato_2022.csv")

# The archive is no longer needed once the CSV has been extracted.
file.remove("redesocial2022.zip")

# read_csv2() expects ";" as the field separator; the file is read as latin1.
redes2022 <- read_csv2(
  file = "rede_social_candidato_2022.csv",
  locale = locale(encoding = "latin1"))
################################################################################ | |
##### IDENTIFY THE MAIN SOCIAL NETWORKS AND DROP ALL OTHER ROWS
# Only Instagram, Facebook, Twitter and YouTube are kept.
# Similar steps can be used for other networks.
redes2022 <- redes2022 %>%
  # Keep only the four networks above. The match is case-insensitive because
  # DS_URL is only lower-cased further down: a case-sensitive match here would
  # silently discard entries such as "WWW.FACEBOOK.COM/..." or "Instagram.com".
  filter(
    str_detect(
      DS_URL,
      regex("instagram|facebook|twitter|youtube", ignore_case = TRUE))) %>%
  mutate(
    # Label each row with its network. case_when() takes the first matching
    # branch, so a URL mentioning several networks gets the earliest one listed.
    rede = case_when(
      str_detect(DS_URL, regex("facebook", ignore_case = TRUE)) ~ "facebook",
      str_detect(DS_URL, regex("instagram", ignore_case = TRUE)) ~ "instagram",
      str_detect(DS_URL, regex("twitter", ignore_case = TRUE)) ~ "twitter",
      str_detect(DS_URL, regex("youtube", ignore_case = TRUE)) ~ "youtube"),
    # Keep the untouched URL, since DS_URL is cleaned from here on
    URL_ORIGINAL = DS_URL,
    # Initial clean-up of the URL text: collapse whitespace, lower-case and
    # transliterate accented characters to ASCII
    DS_URL = str_squish(DS_URL),
    DS_URL = tolower(DS_URL),
    DS_URL = stri_trans_general(DS_URL, "Latin-ASCII"))
################################################################################ | |
##### LIMPAR TEXTO DESSAS QUATRO REDES | |
### YOUTUBE
# Reduces each YouTube URL to the channel identifier (custom name, user name
# or channel ID). Links to individual videos ("watch") are set to NA.
youtube <- redes2022 %>%
  filter(rede == "youtube") %>%
  mutate(
    # Channel IDs (".../channel/UC...") are case-sensitive, but DS_URL has
    # already been lower-cased. Capture the ID with its original casing from
    # URL_ORIGINAL so it can be restored after the cleaning steps below.
    id_canal = str_match(
      URL_ORIGINAL,
      regex("youtube\\.com/channel/([A-Za-z0-9_-]+)", ignore_case = TRUE))[, 2],
    # General
    DS_URL = str_replace_all(DS_URL, "www,", "www\\."),
    DS_URL = str_replace_all(DS_URL, "/m\\.", "/"),
    DS_URL = str_remove_all(DS_URL, "https://"),
    DS_URL = str_remove_all(DS_URL, "http://"),
    DS_URL = str_remove_all(DS_URL, "www\\."),
    # Youtube
    # Video links do not identify a channel
    DS_URL = ifelse(
      str_detect(DS_URL, "watch"),
      NA, DS_URL),
    DS_URL = str_remove_all(DS_URL, "youtube\\.com/channel/"),
    DS_URL = str_remove_all(DS_URL, "youtube\\.com/c/"),
    DS_URL = str_remove_all(DS_URL, "youtube\\.com/"),
    DS_URL = str_remove_all(DS_URL, "youtube\\.com\\.br/"),
    DS_URL = str_remove_all(DS_URL, "youtube\\/"),
    DS_URL = str_remove_all(DS_URL, "user/"),
    DS_URL = str_remove_all(DS_URL, "/about"),
    DS_URL = str_remove_all(DS_URL, "/featured"),
    # Remove everything from "/?" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\/\\?).*"),
    # Remove everything from "?" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\?).*"),
    # Remove everything from "/" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\/).*"),
    # Remove everything up to "- "
    DS_URL = str_remove_all(DS_URL, "^.*?(?<=\\- )"),
    # Remove everything up to ": "
    DS_URL = str_remove_all(DS_URL, "^.*?(?<=\\: )"),
    # Restore the original casing of channel IDs. The swap only happens when
    # the cleaned value is exactly the lower-cased ID, so rows whose cleaning
    # produced something else are left untouched.
    DS_URL = if_else(
      !is.na(id_canal) & !is.na(DS_URL) & DS_URL == tolower(id_canal),
      id_canal, DS_URL),
    # Drop the helper column
    id_canal = NULL
  )
### INSTAGRAM
# Reduces each Instagram URL to the account handle. Links that do not point
# to a profile (posts, reels, invites, link shims) are set to NA.
instagram <- redes2022 %>%
  filter(rede == "instagram") %>%
  mutate(
    # Entries that are just the word "instagram" carry no handle: blank the
    # network label and the URL. The label is tested first, while DS_URL still
    # holds its original value.
    rede = ifelse(DS_URL == "instagram", NA, rede),
    DS_URL = ifelse(DS_URL == "instagram", NA, DS_URL),
    # General
    DS_URL = str_replace_all(DS_URL, "www,", "www\\."),
    # Drop the mobile sub-domain only when it precedes the domain, so handles
    # that start with "m." (e.g. "instagram.com/m.silva") are preserved
    DS_URL = str_replace_all(DS_URL, "/m\\.(?=instagram)", "/"),
    DS_URL = str_replace_all(DS_URL, "htpps", "https"),
    DS_URL = str_remove_all(DS_URL, "https://"),
    DS_URL = str_remove_all(DS_URL, "http://"),
    DS_URL = str_remove_all(DS_URL, "www\\."),
    # Strip ".br" only as a domain suffix ("instagram.com.br"); removing it
    # anywhere would mangle handles such as "joao.brasil"
    DS_URL = str_remove_all(DS_URL, "(?<=instagram\\.com)\\.br\\b"),
    # Links to a single post
    DS_URL = ifelse(
      str_detect(DS_URL, "instagram\\.com/p/"),
      NA, DS_URL),
    # Link-shim redirects (l.instagram.com)
    DS_URL = ifelse(
      str_detect(DS_URL, "l\\.instagram\\.com"),
      NA, DS_URL),
    DS_URL = ifelse(
      str_detect(DS_URL, "invites"),
      NA, DS_URL),
    # Links to reels: match the "/reel" or "/reels" path segment only, so
    # handles that merely contain "reel" (e.g. "reeleicao") are kept
    DS_URL = ifelse(
      str_detect(DS_URL, "/reels?(/|\\?|$)"),
      NA, DS_URL),
    DS_URL = str_remove_all(DS_URL, "instagram\\.com/accounts/login/\\?next=/"),
    DS_URL = str_remove_all(DS_URL, "instagram\\.com/account/login/\\?next=/"),
    DS_URL = str_remove_all(DS_URL, "instagram\\.com/:"),
    DS_URL = str_remove_all(DS_URL, "instagram\\.com/"),
    DS_URL = str_remove_all(DS_URL, "instagram/"),
    # Remove everything from "/?" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\/\\?).*"),
    # Remove everything from "?" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\?).*"),
    # Remove everything from "/" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\/).*"),
    # Remove everything up to "- "
    DS_URL = str_remove_all(DS_URL, "^.*?(?<=\\- )"),
    # Remove everything up to ": "
    DS_URL = str_remove_all(DS_URL, "^.*?(?<=\\: )"),
    # Remove @
    DS_URL = str_remove_all(DS_URL, "\\@")
  )
################################################################################ | |
### FACEBOOK
# Reduces each Facebook URL to the page/profile handle (or numeric id).
# Links to groups, stories and photos are set to NA.
facebook <- redes2022 %>%
  filter(rede == "facebook") %>%
  mutate(
    # Entries that are just the word "facebook" carry no handle: blank the
    # network label and the URL. The label is tested first, while DS_URL still
    # holds its original value.
    rede = ifelse(DS_URL == "facebook", NA, rede),
    DS_URL = ifelse(DS_URL == "facebook", NA, DS_URL),
    # General
    DS_URL = str_replace_all(DS_URL, "www,", "www\\."),
    # Drop the mobile sub-domain only when it precedes the domain, so handles
    # that start with "m." (e.g. "facebook.com/m.silva") are preserved
    DS_URL = str_replace_all(DS_URL, "\\/m\\.(?=facebook)", "\\/"),
    DS_URL = str_replace_all(DS_URL, "htpps", "https"),
    DS_URL = str_replace_all(DS_URL, "httos", "https"),
    DS_URL = str_remove_all(DS_URL, "https:\\/\\/"),
    DS_URL = str_remove_all(DS_URL, "http:\\/\\/"),
    DS_URL = str_remove_all(DS_URL, "https:"),
    DS_URL = str_remove_all(DS_URL, "https\\."),
    DS_URL = str_remove_all(DS_URL, "https"),
    DS_URL = str_remove_all(DS_URL, "www\\."),
    # Strip the "web." sub-domain only when it precedes the domain; removing
    # it anywhere would mangle handles that contain "web."
    DS_URL = str_remove_all(DS_URL, "web\\.(?=facebook)"),
    DS_URL = ifelse(
      str_detect(DS_URL, "groups"),
      NA, DS_URL),
    DS_URL = ifelse(
      str_detect(DS_URL, "story\\.php"),
      NA, DS_URL),
    DS_URL = ifelse(
      str_detect(DS_URL, "photo"),
      NA, DS_URL),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com/people/"),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com/search/top\\?q="),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com/search/top/\\?q="),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com/profile\\.php\\?id="),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com/pg\\/"),
    DS_URL = str_remove_all(DS_URL, "pt-br\\.facebook\\.com/"),
    DS_URL = str_remove_all(DS_URL, "web\\.facebook\\.com/"),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com\\.br/"),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com/"),
    # The backslash variant must run before the bare "facebook.com" removal
    # below; otherwise the domain is stripped first and a stray "\" is left
    # at the start of the handle
    DS_URL = str_remove_all(DS_URL, "facebook\\.com\\\\"),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com\\.br"),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com"),
    DS_URL = str_remove_all(DS_URL, "www\\.facebook/"),
    DS_URL = str_remove_all(DS_URL, "facebook\\/"),
    DS_URL = str_remove_all(DS_URL, "- facebook"),
    DS_URL = str_remove_all(DS_URL, "facebook:"),
    DS_URL = str_remove_all(DS_URL, "facebook\\\\"),
    DS_URL = str_remove_all(DS_URL, "facebook@"),
    DS_URL = str_remove_all(DS_URL, "facebook "),
    # Remove everything from "/?" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\/\\?).*"),
    # Remove everything from "?" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\?).*"),
    # Remove everything from "/" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\/).*"),
    # Remove everything up to "- "
    DS_URL = str_remove_all(DS_URL, "^.*?(?<=\\- )"),
    # Remove everything up to ": "
    DS_URL = str_remove_all(DS_URL, "^.*?(?<=\\: )"),
    # Remove everything from "&" onwards (e.g. "&ref=...")
    DS_URL = str_remove_all(DS_URL, "(?=&).*"),
    # Remove leading patterns that were identified in the data
    DS_URL = str_remove_all(DS_URL, "^:@"),
    DS_URL = str_remove_all(DS_URL, "^@"),
    DS_URL = str_remove_all(DS_URL, "^:"),
    DS_URL = str_remove_all(DS_URL, "^#!")
  )
################################################################################ | |
### TWITTER
# Reduces each Twitter URL to the account handle.
twitter <- redes2022 %>%
  filter(rede == "twitter") %>%
  mutate(
    # Entries that are just the word "twitter" carry no handle: both the
    # network label and the URL become NA
    rede = ifelse(DS_URL == "twitter", NA, rede),
    DS_URL = ifelse(DS_URL == "twitter", NA, DS_URL),
    DS_URL = DS_URL %>%
      # General: fix common typos, then drop protocol and sub-domains
      str_replace_all("www,", "www\\.") %>%
      str_replace_all("\\/m\\.", "\\/") %>%
      str_replace_all("htpps", "https") %>%
      str_replace_all("httos", "https") %>%
      str_remove_all("https:\\/\\/") %>%
      str_remove_all("http:\\/\\/") %>%
      str_remove_all("https:") %>%
      str_remove_all("https\\.") %>%
      str_remove_all("https") %>%
      str_remove_all("www\\.") %>%
      str_remove_all("mobile\\.") %>%
      # Twitter domain and label variants
      str_remove_all("twitter\\.com\\.br/") %>%
      str_remove_all("twitter\\.com/\\:") %>%
      str_remove_all("twitter\\.com/") %>%
      str_remove_all("twitter: ") %>%
      # Drop everything from "?" onwards, then everything from "/" onwards
      str_remove_all("(?=\\?).*") %>%
      str_remove_all("(?=\\/).*"),
    # What is left as "home" is the timeline, not an account
    DS_URL = ifelse(DS_URL == "home", NA, DS_URL),
    # Strip any remaining "@" and "/"
    DS_URL = DS_URL %>%
      str_remove_all("@") %>%
      str_remove_all("/")
  )
##### FINAL: THE FOUR NETWORKS
# Stack the four per-network tables (Instagram, Facebook, Twitter, YouTube)
# and apply the last clean-up steps to the extracted handle.
redes4_2022 <- list(instagram, facebook, twitter, youtube) %>%
  bind_rows() %>%
  # Handles cannot contain spaces
  mutate(DS_URL = str_remove_all(DS_URL, " ")) %>%
  # Handles that ended up empty are recorded as NA
  mutate(DS_URL = ifelse(DS_URL == "", NA, DS_URL))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Testei rapidamente aqui e ajuda demais, @schoulten. Solução bem mais simples e elegante. Vou tentar ver com calma se passa algo ou se resolve todos os problemas. Obrigado pela sugestão, não conhecia a função!