Created
March 10, 2021 21:56
-
-
Save lgelape/d854f7f23a900531e3fd4977d574e492 to your computer and use it in GitHub Desktop.
Código de análise da matéria "Em meio ao caos, cientistas brasileiros florescem no Twitter", Núcleo Jornalismo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################################################### | |
##### | |
##### ENGAJAMENTO DE PERFIS NO SCIENCE PULSE AO LONGO DA COBERTURA | |
##### | |
##### | |
# Pacotes | |
library(lubridate) | |
library(stringr) | |
library(tidyr) | |
library(dplyr) | |
library(ggplot2) | |
library(zoo) | |
library(forcats) | |
################################################################################################### | |
# Abre a base de tweets e perfis | |
perfis <- readRDS("perfis_022021.rds") | |
base <- readRDS("sciencepulse_ate09032021.rds") %>% | |
select(-c(categoria, person_institution)) | |
# Cria variaveis de interesse e filtra pelo periodo da analise | |
base <- base %>% | |
select(-c(followers_count, screen_name)) %>% | |
# Transforma a variavel created_at em data | |
# (a primeira linha e pra evitar um problema na leitura de POSICXt) | |
mutate(created_at = substr(created_at, 1, 10), | |
created_at = ymd(created_at)) %>% | |
# Filtra entre junho e janeiro | |
filter(created_at > ymd("2020-05-31") & created_at < ymd("2021-03-08")) %>% | |
# Cria uma variavel semana_ano em formato character | |
mutate(semana = week(created_at), | |
ano = year(created_at), | |
semana = str_pad(semana, 2, "left", "0"), | |
semana_ano = paste0(ano, semana)) %>% | |
# Engajamento de cada tweet | |
mutate(engajamento = retweet_count + favorite_count, | |
engajamento_w = retweet_count + (0.166666*favorite_count)) %>% | |
left_join(perfis, by = "user_id") | |
# Cria banco que sinaliza o dia inicial de cada semana | |
semana_dia <- select(base, created_at, semana_ano) %>% | |
distinct() %>% | |
arrange(created_at) %>% | |
group_by(semana_ano) %>% | |
slice_min(created_at, n = 1) %>% | |
ungroup() %>% | |
rename(dia_inicial = created_at) | |
# Incorpora a informacao do dia inicial de cada semana | |
base <- left_join(base, semana_dia) | |
# Vetor com o screen_name dos perfis brasileiros | |
perfis_brasileiros <- c("AndersonBrito_", "dadourado", "marciacastrorj", "MBittencourtMD", "mellziland", "oatila", | |
"otavio_ranzani", "PauloLotufo", "TaschnerNatalia", "ThomasVConti", | |
"dogarrett", "luizacaires3") | |
################################################################################################### | |
### ANALISE DO ENGAJAMENTO DOS PRINCIPAIS DIVULGADORES - SP/IBPAD | |
# Calcula o engajamento de cada perfil por semana | |
engajamento_semana <- base %>% | |
filter(is_retweet == "FALSE") %>% | |
group_by(user_id, screen_name, categoria, semana_ano, dia_inicial) %>% | |
mutate(n_tweets = n()) %>% | |
summarise(taxa = (sum(retweet_count) + sum(favorite_count))/unique(n_tweets), | |
n_tweets = unique(n_tweets), | |
engajamento = sum(retweet_count) + sum(favorite_count)) %>% | |
ungroup() | |
################################################################################################### | |
## Total de engajamento da primeira e ultima semana | |
# Calcula o total de engajamento das 12 contas brasileiras | |
# Primeira semana que cada um esteve na base | |
primeira_semana <- engajamento_semana %>% | |
filter(screen_name %in% perfis_brasileiros) %>% | |
group_by(user_id) %>% | |
slice_min(semana_ano, n = 1) %>% | |
ungroup() %>% | |
mutate(tipo = 1) | |
# Ultima semana de fevereiro | |
ultima_semana <- engajamento_semana %>% | |
filter(screen_name %in% perfis_brasileiros, | |
dia_inicial == ymd("2021-02-26")) | |
# Empilha os dois bancos | |
semana_inicio_fim <- bind_rows(primeira_semana, ultima_semana) %>% | |
group_by(tipo) %>% | |
mutate(taxa = sum(engajamento)/sum(n_tweets), | |
engajamento_p = 100*(engajamento/sum(engajamento))) %>% | |
ungroup() | |
################################################################################################### | |
## Engajamento total dessas contas por semana (5+) | |
# Calcula o agregado de engajamento das 12 contas brasileiras, | |
# a partir da semana em que todas estavam na base | |
total_semana_brasileiros <- engajamento_semana %>% | |
filter(screen_name %in% perfis_brasileiros, | |
semana_ano > "202029") %>% | |
group_by(semana_ano) %>% | |
summarise(taxa = sum(engajamento)/sum(n_tweets), | |
engajamento = sum(engajamento)) %>% | |
ungroup() %>% | |
slice_max(engajamento, n = 5) | |
################################################################################################### | |
## Calcula a taxa e tendencia de engajamento diario de cada perfil | |
# Somente a partir do dia em que todos os perfis estavam na base | |
total_dia_brasileiros <- base %>% | |
filter(is_retweet == "FALSE") %>% | |
filter(screen_name %in% perfis_brasileiros, | |
created_at > as.Date("2020-07-26")) %>% | |
group_by(created_at) %>% | |
mutate(n_tweets = n()) %>% | |
summarise(taxa = sum(engajamento)/unique(n_tweets), | |
total = sum(engajamento)) %>% | |
ungroup() %>% | |
mutate(media_15d = zoo::rollmean(taxa, k = 15, fill = NA, align = "right")) | |
# Primeiro grafico da materia | |
total_dia_brasileiros %>% | |
ggplot() + | |
geom_point(aes(x = created_at, y = media_15d), size = 0.1) + | |
geom_line(aes(x = created_at, y = media_15d)) + | |
geom_smooth(aes(x = created_at, y = media_15d), method = "lm", color = "red", se = F) + | |
geom_smooth(aes(x = created_at, y = media_15d), method = "loess", se = F) + | |
theme_light() + | |
theme(axis.text = element_text(size = 6)) + | |
scale_x_date(breaks = c(as.Date("2020-08-01"), | |
as.Date("2020-09-01"), | |
as.Date("2020-10-01"), | |
as.Date("2020-11-01"), | |
as.Date("2020-12-01"), | |
as.Date("2021-01-01"), | |
as.Date("2021-02-01"), | |
as.Date("2021-03-01")), | |
labels = c("Ago", "Set", | |
"Out", "Nov", "Dez", | |
"Jan", "Fev", "Mar")) + | |
labs(title = "Tendência de Engajamento diário (média móvel, 15 dias)", | |
x = "", y = "\nEngajamento") | |
################################################################################################### | |
## Calcula a tendencia de engajamento para os 12 perfis individualmente | |
# Cria tabela base pra todas as combinacoes de dia-perfil | |
# (para preencher quando usuarios nao tuitam) | |
tabela_base_dia <- base %>% | |
select(created_at, user_id) %>% | |
distinct() %>% | |
complete(created_at, user_id) | |
# Calcula estatisticas sobre engajamento de cada perfil por dia | |
engajamento_dia <- base %>% | |
filter(is_retweet == "FALSE") %>% | |
group_by(user_id, created_at) %>% | |
# Conta o n. de tweets/dia do usuario | |
mutate(n_tweets = n()) %>% | |
# Calcula a taxa de engajamento e o total diario | |
summarise(taxa = sum(engajamento)/unique(n_tweets), | |
total = sum(engajamento)) %>% | |
ungroup() %>% | |
# Junta a tabela base para ter info de quando nao tuitaram | |
right_join(tabela_base_dia) %>% | |
group_by(user_id) %>% | |
# Calcula a media movel de 15 dias so dentro de cada usuario | |
mutate(media_15d = zoo::rollmean(taxa, k = 15, fill = NA, align = "right")) %>% | |
ungroup() %>% | |
left_join(perfis) | |
# Grafico base para segunda imagem da materia | |
engajamento_dia %>% | |
filter(screen_name %in% perfis_brasileiros) %>% | |
ggplot() + | |
geom_point(aes(x = created_at, y = media_15d), size = 0.1) + | |
geom_line(aes(x = created_at, y = media_15d)) + | |
geom_smooth(aes(x = created_at, y = media_15d), method = "lm", color = "red", se = F) + | |
geom_smooth(aes(x = created_at, y = media_15d), method = "loess", se = F) + | |
facet_wrap(~ screen_name, nrow = 4, scales = "free") + | |
theme_light() + | |
theme(axis.text = element_text(size = 6)) + | |
scale_x_date(breaks = c(as.Date("2020-07-01"), | |
as.Date("2020-08-01"), | |
as.Date("2020-09-01"), | |
as.Date("2020-10-01"), | |
as.Date("2020-11-01"), | |
as.Date("2020-12-01"), | |
as.Date("2021-01-01"), | |
as.Date("2021-02-01"), | |
as.Date("2021-03-01")), | |
labels = c("Jul", "Ago", "Set", | |
"Out", "Nov", "Dez", | |
"Jan", "Fev", "Mar")) + | |
labs(title = "Tendência de engajamento diário (média móvel 15 dias)", | |
x = "", y = "\nEngajamento") | |
################################################################################################### | |
## Pluralizacao e dados do Atila | |
# Calcula o percentual de engajamento de cada perfil | |
# em relacao ao total de cada semana, considerando somente | |
# semanas em que estavam todas as contas. | |
semana_brasileiros <- engajamento_semana %>% | |
filter(screen_name %in% perfis_brasileiros, | |
semana_ano > "202029") %>% | |
group_by(semana_ano) %>% | |
mutate(taxa = sum(engajamento)/sum(n_tweets), | |
engajamento_p = 100*(engajamento/sum(engajamento))) %>% | |
ungroup() | |
# Grafico base da terceira figura da materia | |
semana_brasileiros %>% | |
mutate(screen_name = factor(screen_name, | |
levels = c("oatila", "luizacaires3", "otavio_ranzani", | |
"MBittencourtMD", "mellziland", "ThomasVConti", | |
"dadourado", "TaschnerNatalia", "dogarrett", | |
"PauloLotufo", "marciacastrorj", "AndersonBrito_")), | |
screen_name = fct_rev(screen_name)) %>% | |
ggplot() + | |
geom_col(aes(x = semana_ano, y = engajamento_p, fill = screen_name)) + | |
theme_light() + | |
theme(axis.text.x = element_blank(), | |
legend.position = "bottom", | |
legend.direction = "horizontal") + | |
scale_fill_grey(start = 0.8, end = 0.2) + | |
labs(title = "Volume de engajamento entre as principais contas", | |
x = "\nSemanas", y = "\nEngajamento (em %)") | |
# Perfis com 20 semanas com mais engajamento | |
semana_brasileiros %>% | |
slice_max(engajamento, n = 20) %>% | |
group_by(screen_name) %>% | |
summarise(numero = n()) %>% | |
ungroup() | |
# Semanas com menor engajamento do atila | |
semana_brasileiros %>% | |
filter(screen_name == "oatila") %>% | |
slice_min(engajamento_p, n = 5) | |
# Ranking geral desses perfis pela taxa de engajamento | |
semana_brasileiros %>% | |
group_by(screen_name) %>% | |
summarise(soma_engajamento = sum(engajamento), | |
soma_tweets = sum(n_tweets)) %>% | |
ungroup() %>% | |
mutate(taxa = soma_engajamento/soma_tweets) %>% | |
arrange(desc(taxa)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment