Skip to content

Instantly share code, notes, and snippets.

@lgelape
Last active March 30, 2021 16:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lgelape/01cda13cbb9ecf06e9131873d509910b to your computer and use it in GitHub Desktop.
Save lgelape/01cda13cbb9ecf06e9131873d509910b to your computer and use it in GitHub Desktop.
Código de análise de dados da matéria "De cobras a funk: a guinada na comunicação digital do Instituto Butantan", Núcleo Jornalismo
###################################################################################################
###################################################################################################
#########
######### "De cobras a funk: a guinada na comunicação digital do Instituto Butantan"
#########
######### Lucas Gelape
######### Analise de dados da materia publicada no Nucleo Jornalismo
#########
## Pacotes
library(dplyr)
library(lubridate)
library(stringr)
library(tidytext)
library(tidyr)
library(funprog)
library(purrr)
library(tm)
## Funcoes
# Infix "not in" operator: the element-wise logical negation of `%in%`.
# `lhs %nin% rhs` is TRUE for every element of lhs that has no match in rhs.
`%nin%` <- function(...) {
  !`%in%`(...)
}
# Removes unwanted words from a single string.
#
# Arguments:
#   x        character(1). It is split on single spaces (" ").
#   palavras vector of words to drop. Defaults to the global `stopwords`
#            object, which must already exist when the default is relied on.
# Returns:
#   character(1) with the remaining words joined by single spaces. The value
#   is returned visibly (the previous version ended in an assignment, so the
#   result was invisible when the function was called at top level).
limpar_texto <- function(x, palavras = stopwords) {
  # tm is attached by this script and exports a function called stopwords();
  # if the global vector has not been created yet, the default resolves to
  # that function. Fail with a clear message instead of a match() error.
  if (is.function(palavras)) {
    stop("`palavras` must be a vector of words, not a function; ",
         "create the `stopwords` vector first or pass the words explicitly.",
         call. = FALSE)
  }
  tokens <- unlist(strsplit(x, " ", fixed = TRUE))
  paste(tokens[!(tokens %in% palavras)], collapse = " ")
}
###################################################################################################
##### TWITTER
# Load the tweet database (tweets up to 17/03).
twitter <- readRDS("butantan_tweets.rds")
# Shift the timestamps back three hours (presumably UTC -> Brasilia time;
# confirm against how the tweets were collected) and derive the columns used
# by the analyses and the chart: calendar date, month, day of month, and
# engagement (retweets + favorites). `mes` is finally recoded as a factor
# ordered from June to March.
twitter <- mutate(
  twitter,
  created_at = created_at - hours(3),
  dia_mes_ano = as.Date(created_at),
  mes = month(created_at),
  dia = day(created_at),
  engajamento = retweet_count + favorite_count,
  mes = factor(
    mes,
    levels = c(6:12, 1:3),
    labels = c(
      "Jun", "Jul", "Ago", "Set", "Out",
      "Nov", "Dez", "Jan", "Fev", "Mar"
    )
  )
)
### AVERAGE TWEETS/DAY
# For each month: number of distinct day-of-month values present in the data,
# total number of tweets, and the ratio between the two (used in the chart).
tweets_dia <- ungroup(
  summarise(
    group_by(twitter, mes),
    dias = n_distinct(dia),
    total_mes = n(),
    tweets_dia = total_mes / dias
  )
)
### ENGAGEMENT
# Monthly engagement of Butantan's posts. Retweets are dropped first so that
# only posts authored by Butantan are summed.
engajamento_mes_twitter <- ungroup(
  summarise(
    group_by(filter(twitter, is_retweet != "TRUE"), mes),
    engajamento_mensal = sum(engajamento)
  )
)
# Links to the 5 non-retweet posts with the highest engagement
# (ties at the cut-off are kept, as slice_max() does by default).
twitter %>%
  filter(is_retweet != "TRUE") %>%
  slice_max(order_by = engajamento, n = 5) %>%
  transmute(
    link = paste0("https://twitter.com/butantanoficial/status/", status_id)
  )
### REPLIES TO USERS
# Absolute number and percentage of tweets, per month, that are replies.
# A tweet counts as a reply when reply_to_screen_name is filled in and is not
# the account itself ("butantanoficial").
replies <- twitter %>%
  mutate(
    resposta = as.numeric(
      !(is.na(reply_to_screen_name) | reply_to_screen_name == "butantanoficial")
    )
  ) %>%
  group_by(mes) %>%
  summarise(
    resposta = sum(resposta),
    # `resposta` here is the monthly total computed on the line above
    percentual = resposta / n() * 100
  ) %>%
  ungroup()
### WORD COUNT
# Load the general stopword list from a gist pinned to a specific revision.
# NOTE(review): the sourced script is expected to create
# `stopwords_pt_final_noaccent` (it is used below but not defined in this
# file) - confirm against the gist.
source("https://gist.githubusercontent.com/lgelape/edcc0250f21bcc5710c0a9fd0488d1ea/raw/960e11e214d6a32df5dd38e4d0f251a992b46d57/stopwords_pt.R")
# Extra stopwords identified during this analysis
stopwords_butantan <- data.frame(
  word = c(
    "https", "t.co", "butantan", "equipebutantan", "instituto",
    "10", "120", "12h45", "19", "2020", "oi", "ola", "ne"
  ),
  source = "analise_propria"
)
# Final stopword table (general + analysis-specific) and the plain vector of
# words taken from it
stopwords_pt_final_noaccent <- bind_rows(
  stopwords_pt_final_noaccent,
  stopwords_butantan
)
stopwords <- pull(stopwords_pt_final_noaccent, word)
# Normalise the tweet text: transliterate to ASCII (strips accents),
# lower-case it, then remove punctuation.
# stri_trans_general() lives in stringi, which no library() call in this file
# attaches, so it is called through its namespace to make the step
# independent of whatever else happens to be attached.
twitter <- twitter %>%
  mutate(
    text = tolower(stringi::stri_trans_general(text, "Latin-ASCII")),
    text = removePunctuation(text)
  )
# Run the stopword-removal function over every tweet text and flatten the
# per-tweet results into a single vector
textos_limpos <- unlist(lapply(twitter$text, limpar_texto))
# Store that vector as a column of the data set
twitter$textos_limpos <- textos_limpos
# Find the 5 most repeated words in each analysis period
# (period 1: before 2020-12-01; period 2: from that date on).
palavras_mais_repetidas <- twitter %>%
  unnest_tokens(output = word, input = textos_limpos) %>%
  transmute(
    word,
    periodo = if_else(dia_mes_ano < as.Date("2020-12-01"), 1, 2)
  ) %>%
  count(periodo, word) %>%
  group_by(periodo) %>%
  slice_max(order_by = n, n = 5) %>%
  arrange(desc(n)) %>%
  # rank() is ascending, so within each period the most frequent word
  # receives the highest `posicao`
  mutate(posicao = rank(n)) %>%
  ungroup()
###################################################################################################
##### FACEBOOK
# Load the Facebook post database
facebook <- readRDS("butantan_desde0106.rds")
# Derive the columns used in the analysis: calendar date, numeric month, day
# of month, month as a factor ordered from June to March, and engagement
# (sum of the share, reaction and comment counters).
facebook <- mutate(
  facebook,
  dia_redondo = as.Date(date),
  mes_lubridate = month(date),
  dia = day(date),
  mes = factor(
    mes_lubridate,
    levels = c(6:12, 1:3),
    labels = c(
      "Jun", "Jul", "Ago", "Set", "Out",
      "Nov", "Dez", "Jan", "Fev", "Mar"
    )
  ),
  engajamento = actual_shareCount +
    actual_likeCount +
    actual_loveCount +
    actual_commentCount +
    actual_wowCount +
    actual_hahaCount +
    actual_sadCount +
    actual_angryCount +
    actual_thankfulCount +
    actual_careCount
)
### POSTS/DAY
# Number of posts per day for each month: total posts divided by the number
# of days in the month. March is hard-coded to 17 days.
# NOTE(review): days_in_month() receives the bare month number here, so no
# year information is available to it - confirm February is handled as
# intended for the period covered.
posts_dia <- facebook %>%
  group_by(mes) %>%
  summarise(
    dias = unique(days_in_month(mes_lubridate)),
    total_mes = n()
  ) %>%
  ungroup()
posts_dia <- mutate(
  posts_dia,
  dias = ifelse(mes == "Mar", 17, dias),
  # the column keeps the name `tweets_dia` although it holds Facebook posts
  tweets_dia = total_mes / dias
)
###################################################################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment