# ikashnitsky/twitter-b-census.R

## twitter-b-census.R
#===============================================================================
# 2020-11-21 -- twitter
# B census in replies to the joke tweet
# https://twitter.com/rafaelotinoco/status/1329622292507267073
# Ilya Kashnitsky, ilya.kashnitsky@gmail.com
#===============================================================================

# analyse the first letter of the surname of those who replied to the meme
# the prevalence of similar jokes from B people was so obvious

library(tidyverse)
library(magrittr)
library(stringi)
library(rtweet)
library(ggdark)
library(hrbrthemes)

# for replicability the export is done at this time
##------ Sat Nov 21 19:10:05 2020 ------##
# collect up to 2000 tweets matching the joke tweet's status id in a URL
df <- search_tweets(q = "url:1329622292507267073", n = 2000)

# keep a local snapshot of the raw search result
save(list = "df", file = "tweets.rda")

# keep only the quote tweets; the logical flag is used directly rather than
# compared against `T`, which is an ordinary variable that can be reassigned
# (rtweet documents `is_quote` as a logical column)
df_quo <- df %>% filter(is_quote)

# user-level fields of the quoting authors (the `name` column is used below)
quo_auth <- users_data(df_quo)

# tally the first letter of the surnames
# Result: one row per first letter (`first`) with its frequency (`n`).
df_name <- quo_auth %>%
    transmute(
        clean = name %>%
            str_to_lower() %>%
            # transliterate accented Latin letters (e.g. "ö" -> "o") so they
            # are tallied under their base letter; whatever is still
            # non-ASCII afterwards is replaced by stri_enc_toascii()
            stri_trans_general("Latin-ASCII") %>%
            stri_enc_toascii() %>%
            # remove every piece of text in parentheses
            str_remove_all("\\s*\\([^\\)]+\\)") %>%
            # remove every hashtag, not only the first one
            str_remove_all("(?<=^|\\s)#[^\\s]+") %>%
            # remove "jr"
            str_remove(" jr") %>%
            # keep the text before the first comma (drops the common ", phd")
            str_extract("^[^,]*") %>%
            # the removals above can leave stray whitespace behind; without
            # trimming it the last-word pattern below finds nothing and the
            # initial of the first name would be tallied instead
            str_squish(),
        # the last word of the cleaned name ~ surname;
        # fall back to the whole cleaned string when there is no match
        surname = coalesce(str_extract(clean, "[^ ]+$"), clean),
        # get the first letter of the surname
        first = str_sub(surname, 1, 1)
    ) %>%
    # calculate letter frequencies
    count(first)

# get the dataset for plotting: one row per letter of the alphabet with the
# count and the share of surnames starting with that letter
df_plot <- tibble(LETTERS) %>%
    mutate(first = str_to_lower(LETTERS)) %>%
    # explicit key: no "Joining, by = ..." message and no accidental extra keys
    left_join(df_name, by = "first") %>%
    mutate(
        # letters that never occur come out of the join as NA; a single NA
        # would turn the sum inside prop.table() into NA and blank every bar
        n = replace_na(n, 0L),
        prop = prop.table(n)
    )

# visualize: share of quote tweets by surname initial, "B" highlighted
ggplot(data = df_plot, mapping = aes(x = LETTERS, y = prop)) +
    geom_col(fill = "orange", color = NA, width = .75) +
    scale_y_percent() +
    labs(
        title = "% of quote tweets",
        x = "First letter of the quote tweet author last name",
        y = NULL
    ) +
    dark_theme_minimal(base_family = "Roboto Slab") +
    theme(
        axis.text.x = element_text(
            face = 2,
            size = 14,
            # 26 axis labels, all white except the second one ("B")
            colour = replace(rep("white", 26), 2, "orange")
        )
    ) +
    # headline inside the panel
    annotate(
        geom = "text", label = "The Twitter census of (B)est jokes",
        x = 4, y = .3, hjust = 0, vjust = 1,
        family = "Roboto Slab", size = 7, color = "orange"
    ) +
    # sub-headline inside the panel
    annotate(
        geom = "text", label = "A good joke is always unexpected, isn't it?",
        x = 4, y = .23, hjust = 0, vjust = 1,
        family = "Roboto Slab", size = 5, color = "white"
    )

# no `plot` argument: ggsave() writes the most recently created plot
ggsave(filename = "out.png", width = 6, height = 3.375)
	#===============================================================================
	# 2020-11-21 -- twitter
	# B census in replies to the joke tweet
	# https://twitter.com/rafaelotinoco/status/1329622292507267073
	# Ilya Kashnitsky, ilya.kashnitsky@gmail.com
	#===============================================================================

	# analyse the first letter of the surname of those who replied to the meme
	# the prevalence of similar jokes from B people was so obvious

	library(tidyverse)
	library(magrittr)
	library(stringi)
	library(rtweet)
	library(ggdark)
	library(hrbrthemes)

	# for replicability the export is done at this time
	##------ Sat Nov 21 19:10:05 2020 ------##
	# NOTE(review): this tab-indented section repeats the script above line
	# for line; running both queries Twitter twice and overwrites tweets.rda.
	# Confirm whether the duplicate is intentional.
	# collect up to 2000 tweets matching the joke tweet's status id in a URL
	df <- search_tweets(q = "url:1329622292507267073", n = 2000)

	# keep a local snapshot of the raw search result
	save(list = "df", file = "tweets.rda")

	# keep only the quote tweets; the logical flag is used directly rather than
	# compared against `T`, which is an ordinary variable that can be reassigned
	# (rtweet documents `is_quote` as a logical column)
	df_quo <- df %>% filter(is_quote)

	# user-level fields of the quoting authors (the `name` column is used below)
	quo_auth <- users_data(df_quo)

	# tally the first letter of the surnames
	# Result: one row per first letter (`first`) with its frequency (`n`).
	df_name <- quo_auth %>%
	    transmute(
	        clean = name %>%
	            str_to_lower() %>%
	            # transliterate accented Latin letters (e.g. "ö" -> "o") so they
	            # are tallied under their base letter; whatever is still
	            # non-ASCII afterwards is replaced by stri_enc_toascii()
	            stri_trans_general("Latin-ASCII") %>%
	            stri_enc_toascii() %>%
	            # remove every piece of text in parentheses
	            str_remove_all("\\s*\\([^\\)]+\\)") %>%
	            # remove every hashtag, not only the first one; the alternation
	            # is a plain "|" because "\|" is not a valid escape in an R
	            # string and stops the file from parsing
	            str_remove_all("(?<=^|\\s)#[^\\s]+") %>%
	            # remove "jr"
	            str_remove(" jr") %>%
	            # keep the text before the first comma (drops the common ", phd")
	            str_extract("^[^,]*") %>%
	            # the removals above can leave stray whitespace behind; without
	            # trimming it the last-word pattern below finds nothing and the
	            # initial of the first name would be tallied instead
	            str_squish(),
	        # the last word of the cleaned name ~ surname;
	        # fall back to the whole cleaned string when there is no match
	        surname = coalesce(str_extract(clean, "[^ ]+$"), clean),
	        # get the first letter of the surname
	        first = str_sub(surname, 1, 1)
	    ) %>%
	    # calculate letter frequencies
	    count(first)

	# get the dataset for plotting: one row per letter of the alphabet with the
	# count and the share of surnames starting with that letter
	df_plot <- tibble(LETTERS) %>%
	    mutate(first = str_to_lower(LETTERS)) %>%
	    # explicit key: no "Joining, by = ..." message and no accidental extra keys
	    left_join(df_name, by = "first") %>%
	    mutate(
	        # letters that never occur come out of the join as NA; a single NA
	        # would turn the sum inside prop.table() into NA and blank every bar
	        n = replace_na(n, 0L),
	        prop = prop.table(n)
	    )

	# visualize: share of quote tweets by surname initial, "B" highlighted
	ggplot(data = df_plot, mapping = aes(x = LETTERS, y = prop)) +
	    geom_col(fill = "orange", color = NA, width = .75) +
	    scale_y_percent() +
	    labs(
	        title = "% of quote tweets",
	        x = "First letter of the quote tweet author last name",
	        y = NULL
	    ) +
	    dark_theme_minimal(base_family = "Roboto Slab") +
	    theme(
	        axis.text.x = element_text(
	            face = 2,
	            size = 14,
	            # 26 axis labels, all white except the second one ("B")
	            colour = replace(rep("white", 26), 2, "orange")
	        )
	    ) +
	    # headline inside the panel
	    annotate(
	        geom = "text", label = "The Twitter census of (B)est jokes",
	        x = 4, y = .3, hjust = 0, vjust = 1,
	        family = "Roboto Slab", size = 7, color = "orange"
	    ) +
	    # sub-headline inside the panel
	    annotate(
	        geom = "text", label = "A good joke is always unexpected, isn't it?",
	        x = 4, y = .23, hjust = 0, vjust = 1,
	        family = "Roboto Slab", size = 5, color = "white"
	    )

	# no `plot` argument: ggsave() writes the most recently created plot
	ggsave(filename = "out.png", width = 6, height = 3.375)