Skip to content

Instantly share code, notes, and snippets.

@lgelape
Created August 18, 2022 20:25
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lgelape/253d4037d04370511f9b48a4f9736cd8 to your computer and use it in GitHub Desktop.
Save lgelape/253d4037d04370511f9b48a4f9736cd8 to your computer and use it in GitHub Desktop.
Obtém handle de redes sociais de candidatos (Twitter, Facebook, Instagram e Youtube) a partir de informações do Repositório de Dados Eleitorais do TSE
################################################################################
################################################################################
##########
########## OBTER HANDLE DE REDES SOCIAIS DE CANDIDATOS
########## A PARTIR DE INFORMACOES DO REPOSITORIO DO TSE
### Lucas Gelape
# Pacotes
library(dplyr)
library(readr)
library(stringr)
library(stringi)
################################################################################
##### BAIXAR E ABRIR DADOS DE 2022
# Download the TSE 2022 candidate social-network archive into the working
# directory.
download.file(
  url = "https://cdn.tse.jus.br/estatistica/sead/odsele/consulta_cand/rede_social_candidato_2022.zip",
  destfile = "redesocial2022.zip"
)
# Extract only the CSV named below; any other archive member is left packed.
unzip(
  zipfile = "redesocial2022.zip",
  files = "rede_social_candidato_2022.csv"
)
# The archive is not needed once the CSV has been extracted.
file.remove("redesocial2022.zip")
# read_csv2() parses ';'-delimited files; the content is decoded as latin1.
redes2022 <- "rede_social_candidato_2022.csv" %>%
  read_csv2(locale = locale(encoding = "latin1"))
################################################################################
##### IDENTIFICAR PRINCIPAIS REDES SOCIAIS E EXCLUIR DEMAIS LINHAS
# Selecionamos somente Instagram, Facebook, Twitter e Youtube
# Procedimentos semelhantes podem ser usados para outras redes
# Keep only rows that mention Instagram, Facebook, Twitter or YouTube and tag
# each row with its network in `rede`.
#
# The text is normalised BEFORE the network is detected. Detection is
# case-sensitive, so matching against the raw value would silently drop rows
# such as "Facebook.com/x" or "WWW.INSTAGRAM.COM/x".
redes2022 <- redes2022 %>%
  mutate(
    # Keep the untouched value, because DS_URL is cleaned from here on
    URL_ORIGINAL = DS_URL,
    # Initial clean-up: collapse whitespace, lower-case, strip accents
    DS_URL = str_squish(DS_URL),
    DS_URL = tolower(DS_URL),
    DS_URL = stri_trans_general(DS_URL, "Latin-ASCII"),
    # Network identifier; the first matching branch wins, and rows matching
    # none of the four networks (or with a missing DS_URL) get NA
    rede = case_when(
      str_detect(DS_URL, "facebook") ~ "facebook",
      str_detect(DS_URL, "instagram") ~ "instagram",
      str_detect(DS_URL, "twitter") ~ "twitter",
      str_detect(DS_URL, "youtube") ~ "youtube")) %>%
  # Drop every row that is not one of the four networks above
  filter(!is.na(rede)) %>%
  # Same column order as before: ..., rede, URL_ORIGINAL
  relocate(rede, .before = URL_ORIGINAL)
################################################################################
##### LIMPAR TEXTO DESSAS QUATRO REDES
### YOUTUBE
# Extract the YouTube handle / channel id of each candidate into DS_URL.
#
# DS_URL has been lower-cased at this point, but the identifier in
# ".../channel/<id>" URLs is case-sensitive. For those rows the id is taken
# from URL_ORIGINAL, which still has its original casing.
youtube <- redes2022 %>%
  filter(rede == "youtube") %>%
  mutate(
    # Case-preserving channel id (NA when the URL is not a /channel/ URL)
    id_canal = str_extract(
      URL_ORIGINAL,
      regex("(?<=youtube\\.com/channel/)[A-Za-z0-9_-]+", ignore_case = TRUE)),
    # General: fix the "www," typo, drop the mobile prefix, protocol and www
    DS_URL = str_replace_all(DS_URL, "www,", "www\\."),
    DS_URL = str_replace_all(DS_URL, "/m\\.", "/"),
    DS_URL = str_remove_all(DS_URL, "https://"),
    DS_URL = str_remove_all(DS_URL, "http://"),
    DS_URL = str_remove_all(DS_URL, "www\\."),
    # YouTube: links to a single video ("watch") carry no handle
    DS_URL = ifelse(
      str_detect(DS_URL, "watch"),
      NA, DS_URL),
    DS_URL = str_remove_all(DS_URL, "youtube\\.com/channel/"),
    DS_URL = str_remove_all(DS_URL, "youtube\\.com/c/"),
    DS_URL = str_remove_all(DS_URL, "youtube\\.com/"),
    DS_URL = str_remove_all(DS_URL, "youtube\\.com\\.br/"),
    DS_URL = str_remove_all(DS_URL, "youtube\\/"),
    DS_URL = str_remove_all(DS_URL, "user/"),
    DS_URL = str_remove_all(DS_URL, "/about"),
    DS_URL = str_remove_all(DS_URL, "/featured"),
    # Drop everything from the first "/?" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\/\\?).*"),
    # Drop everything from the first "?" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\?).*"),
    # Drop everything from the first "/" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\/).*"),
    # Drop everything up to and including the first "- "
    DS_URL = str_remove_all(DS_URL, "^.*?(?<=\\- )"),
    # Drop everything up to and including the first ": "
    DS_URL = str_remove_all(DS_URL, "^.*?(?<=\\: )"),
    # Prefer the case-preserving channel id; rows already set to NA stay NA
    DS_URL = ifelse(is.na(DS_URL) | is.na(id_canal), DS_URL, id_canal)
  ) %>%
  # Helper column must not leak into the result
  select(-id_canal)
##### INSTAGRAM
# Extract the Instagram handle of each candidate into DS_URL.
#
# Instagram handles may contain dots, so the domain clean-up below only
# touches the domain part of the URL. Removing ".br" or "/m." anywhere in the
# text would corrupt handles such as "joao.brito" or "m.silva".
instagram <- redes2022 %>%
  filter(rede == "instagram") %>%
  mutate(
    # Rows whose whole value is the word "instagram" carry no handle.
    # `rede` is derived first, while DS_URL still holds the original text.
    rede = ifelse(DS_URL == "instagram", NA, rede),
    DS_URL = ifelse(DS_URL == "instagram", NA, DS_URL),
    # General: fix typos, drop the mobile subdomain, protocol and www
    DS_URL = str_replace_all(DS_URL, "www,", "www\\."),
    DS_URL = str_replace_all(DS_URL, "\\bm\\.instagram", "instagram"),
    DS_URL = str_replace_all(DS_URL, "htpps", "https"),
    DS_URL = str_remove_all(DS_URL, "https://"),
    DS_URL = str_remove_all(DS_URL, "http://"),
    DS_URL = str_remove_all(DS_URL, "www\\."),
    # Normalise the ".com.br" domain variant without touching the handle
    DS_URL = str_replace_all(DS_URL, "instagram\\.com\\.br", "instagram.com"),
    # Instagram: posts, l.instagram.com links, invites and reels carry no
    # handle
    DS_URL = ifelse(
      str_detect(DS_URL, "instagram\\.com/p/"),
      NA, DS_URL),
    DS_URL = ifelse(
      str_detect(DS_URL, "l\\.instagram\\.com"),
      NA, DS_URL),
    DS_URL = ifelse(
      str_detect(DS_URL, "invites"),
      NA, DS_URL),
    DS_URL = ifelse(
      str_detect(DS_URL, "reel"),
      NA, DS_URL),
    DS_URL = str_remove_all(DS_URL, "instagram\\.com/accounts/login/\\?next=/"),
    DS_URL = str_remove_all(DS_URL, "instagram\\.com/account/login/\\?next=/"),
    DS_URL = str_remove_all(DS_URL, "instagram\\.com/:"),
    DS_URL = str_remove_all(DS_URL, "instagram\\.com/"),
    DS_URL = str_remove_all(DS_URL, "instagram/"),
    # Drop everything from the first "/?" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\/\\?).*"),
    # Drop everything from the first "?" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\?).*"),
    # Drop everything from the first "/" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\/).*"),
    # Drop everything up to and including the first "- "
    DS_URL = str_remove_all(DS_URL, "^.*?(?<=\\- )"),
    # Drop everything up to and including the first ": "
    DS_URL = str_remove_all(DS_URL, "^.*?(?<=\\: )"),
    # Drop "@"
    DS_URL = str_remove_all(DS_URL, "\\@")
  )
################################################################################
##### FACEBOOK
# Extract the Facebook user name / id of each candidate into DS_URL.
#
# Facebook user names may contain dots, so subdomain clean-up ("m.", "web.")
# is restricted to the part right before "facebook" and never touches the
# user name itself. More specific domain patterns are removed before the
# generic ones; otherwise the generic removal consumes the match and leaves
# separators such as "\" behind.
facebook <- redes2022 %>%
  filter(rede == "facebook") %>%
  mutate(
    # Rows whose whole value is the word "facebook" carry no handle.
    # `rede` is derived first, while DS_URL still holds the original text.
    rede = ifelse(DS_URL == "facebook", NA, rede),
    DS_URL = ifelse(DS_URL == "facebook", NA, DS_URL),
    # General: fix typos, drop the mobile subdomain, protocol and www
    DS_URL = str_replace_all(DS_URL, "www,", "www\\."),
    DS_URL = str_replace_all(DS_URL, "\\bm\\.facebook", "facebook"),
    DS_URL = str_replace_all(DS_URL, "htpps", "https"),
    DS_URL = str_replace_all(DS_URL, "httos", "https"),
    DS_URL = str_remove_all(DS_URL, "https:\\/\\/"),
    DS_URL = str_remove_all(DS_URL, "http:\\/\\/"),
    DS_URL = str_remove_all(DS_URL, "https:"),
    DS_URL = str_remove_all(DS_URL, "https\\."),
    DS_URL = str_remove_all(DS_URL, "https"),
    DS_URL = str_remove_all(DS_URL, "www\\."),
    DS_URL = str_replace_all(DS_URL, "\\bweb\\.facebook", "facebook"),
    # Facebook: groups, stories and photos carry no handle
    DS_URL = ifelse(
      str_detect(DS_URL, "groups"),
      NA, DS_URL),
    DS_URL = ifelse(
      str_detect(DS_URL, "story\\.php"),
      NA, DS_URL),
    DS_URL = ifelse(
      str_detect(DS_URL, "photo"),
      NA, DS_URL),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com/people/"),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com/search/top\\?q="),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com/search/top/\\?q="),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com/profile\\.php\\?id="),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com/pg\\/"),
    DS_URL = str_remove_all(DS_URL, "pt-br\\.facebook\\.com/"),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com\\.br/"),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com/"),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com\\.br"),
    # Must run before the bare "facebook.com" removal, or the "\" stays
    DS_URL = str_remove_all(DS_URL, "facebook\\.com\\\\"),
    DS_URL = str_remove_all(DS_URL, "facebook\\.com"),
    DS_URL = str_remove_all(DS_URL, "facebook\\/"),
    DS_URL = str_remove_all(DS_URL, "- facebook"),
    DS_URL = str_remove_all(DS_URL, "facebook:"),
    DS_URL = str_remove_all(DS_URL, "facebook\\\\"),
    DS_URL = str_remove_all(DS_URL, "facebook@"),
    DS_URL = str_remove_all(DS_URL, "facebook "),
    # Drop everything from the first "/?" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\/\\?).*"),
    # Drop everything from the first "?" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\?).*"),
    # Drop everything from the first "/" onwards
    DS_URL = str_remove_all(DS_URL, "(?=\\/).*"),
    # Drop everything up to and including the first "- "
    DS_URL = str_remove_all(DS_URL, "^.*?(?<=\\- )"),
    # Drop everything up to and including the first ": "
    DS_URL = str_remove_all(DS_URL, "^.*?(?<=\\: )"),
    # Drop everything from the first "&" onwards (e.g. "&ref=...")
    DS_URL = str_remove_all(DS_URL, "(?=&).*"),
    # Drop leading markers seen in the data
    DS_URL = str_remove_all(DS_URL, "^:@"),
    DS_URL = str_remove_all(DS_URL, "^@"),
    DS_URL = str_remove_all(DS_URL, "^:"),
    DS_URL = str_remove_all(DS_URL, "^#!")
  )
################################################################################
##### TWITTER
# Extract the Twitter handle of each candidate into DS_URL.
twitter <- redes2022 %>%
  filter(rede == "twitter") %>%
  mutate(
    # Rows whose whole value is the word "twitter" carry no handle: blank
    # DS_URL, then blank `rede` on exactly those rows
    DS_URL = ifelse(DS_URL %in% "twitter", NA, DS_URL),
    rede = ifelse(is.na(DS_URL), NA, rede),
    # General: fix typos, drop the "/m." prefix, protocol and www
    DS_URL = DS_URL %>%
      str_replace_all("www,", "www\\.") %>%
      str_replace_all("\\/m\\.", "\\/") %>%
      str_replace_all("htpps", "https") %>%
      str_replace_all("httos", "https") %>%
      str_remove_all("https:\\/\\/") %>%
      str_remove_all("http:\\/\\/") %>%
      str_remove_all("https:") %>%
      str_remove_all("https\\.") %>%
      str_remove_all("https") %>%
      str_remove_all("www\\."),
    # Twitter-specific prefixes
    DS_URL = DS_URL %>%
      str_remove_all("mobile\\.") %>%
      str_remove_all("twitter\\.com\\.br/") %>%
      str_remove_all("twitter\\.com/\\:") %>%
      str_remove_all("twitter\\.com/") %>%
      str_remove_all("twitter: "),
    # Drop everything from the first "?" onwards, then from the first "/"
    DS_URL = DS_URL %>%
      str_remove_all("(?=\\?).*") %>%
      str_remove_all("(?=\\/).*"),
    # A value of exactly "home" is not a handle
    DS_URL = ifelse(DS_URL %in% "home", NA, DS_URL),
    # Drop "@" and "/"
    DS_URL = DS_URL %>%
      str_remove_all("@") %>%
      str_remove_all("/")
  )
##### FINAL: QUATRO REDES
# Stack the four per-network tables (Instagram, Facebook, Twitter, YouTube
# order) and do the last clean-up of the extracted handle.
redes4_2022 <- list(instagram, facebook, twitter, youtube) %>%
  bind_rows() %>%
  # Drop spaces inside the handle
  mutate(DS_URL = str_remove_all(DS_URL, " ")) %>%
  # Handles left empty by the clean-up become NA
  mutate(DS_URL = ifelse(DS_URL %in% "", NA, DS_URL))
@schoulten
Copy link

schoulten commented Aug 18, 2022

Oi @lgelape , será que httr::parse_url não te ajudaria nesse tratamento? Pelo que entendi você quer obter o id ou user do candidato na rede X e essa funçãozinha faz isso e mais algumas coisas. Já usei ela pra extrair só o hostname, por exemplo, fica como sugestão. Abraço!

@lgelape
Copy link
Author

lgelape commented Aug 23, 2022

Testei rapidamente aqui e ajuda demais, @schoulten. Solução bem mais simples e elegante. Vou tentar ver com calma se passa algo ou se resolve todos os problemas. Obrigado pela sugestão, não conhecia a função!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment