# Makes some counts of tweets and words from an archive downloaded from Twitter.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# analyse my first 10000 tweets | |
# based on https://juliasilge.com/blog/ten-thousand-tweets/ | |
library(ggplot2) | |
library(lubridate) | |
library(dplyr) | |
library(cowplot) | |
# read my tweet archive that I downloaded from Twitter
# (later code uses its timestamp, tweet_id, text, retweeted_status_id and
# in_reply_to_status_id columns)
d <- read.csv("data/tweets.csv",
              header = TRUE)

# parse the timestamp text into date-times, then express them in Bangkok
# time so every field derived below (year, hour, weekday, ...) is local
d$datetime <- ymd_hms(as.character(d$timestamp))
d$datetime <- with_tz(d$datetime, tz = "Asia/Bangkok")
d$year <- year(d$datetime)
d$month <- month(d$datetime)
d$day <- day(d$datetime)
d$hour <- hour(d$datetime)
# weeks start on Monday; weekday is stored as a labelled, ordered factor
d$weekday <- wday(d$datetime, week_start = 1,
                  label = TRUE)
d$date <- ymd(paste(d$year, d$month, d$day))

# order these
d <- d[order(d$tweet_id), ]

# select the first 10,000 tweets
# head() simply returns every row when the archive holds fewer than 10,000
# tweets; indexing with 1:1e4 would pad the data frame with all-NA rows
d <- head(d, 1e4)
# total by time
# histogram of tweet counts along the full date-time axis, labelled by year
p <- ggplot(d, aes(x = datetime))
total <- p +
  geom_histogram(fill = "skyblue1", colour = "skyblue2") +
  scale_x_datetime(date_labels = "%Y", date_breaks = "1 year") +
  labs(caption = "first 10,000 tweets from @asianturfgrass",
       y = "Number of tweets",
       x = "Time") +
  background_grid(major = "xy")
total
# write the figure to the Desktop as a PNG
save_plot("~/Desktop/total_tweets.png", total, base_aspect_ratio = 1.78)
# table of the number of tweets in each year
# (kept for inspection; the plot below counts rows of d directly)
by_year <- d %>%
  group_by(year) %>%
  summarise(total = length(text))

# total by year
# geom_bar() counts the rows at each year; it draws the same bars as
# geom_histogram(stat = "count") without the warning about ignored
# binning parameters
p <- ggplot(data = d, aes(x = year))
annual <- p + background_grid(major = "xy") +
  scale_x_continuous(breaks = 2011:2018) +
  geom_bar(colour = "skyblue2", fill = "skyblue1") +
  labs(x = "Year",
       y = "Number of tweets",
       caption = "first 10,000 tweets from @asianturfgrass")
annual
save_plot("~/Desktop/annual_tweets.png", annual, base_aspect_ratio = 1.78)
# total by weekday
# weekday is a factor, so this is a bar chart of counts per level;
# geom_bar() is the geom meant for that and avoids the warning that
# geom_histogram(stat = "count") raises about ignored binning parameters
p <- ggplot(data = d, aes(x = weekday))
week <- p + background_grid(major = "xy") +
  geom_bar(colour = "skyblue2", fill = "skyblue1") +
  labs(x = "Day of week",
       y = "Number of tweets",
       caption = "first 10,000 tweets from @asianturfgrass")
week
save_plot("~/Desktop/week_tweets.png", week, base_aspect_ratio = 1.78)
# total by month
# the month label is computed inside aes(); the call is namespaced because
# this block also creates a plot object named `month`, and the aesthetic is
# only evaluated later, when the plot is printed or saved
p <- ggplot(data = d, aes(x = lubridate::month(datetime, label = TRUE)))
# geom_bar() counts the tweets per month label; it draws the same bars as
# geom_histogram(stat = "count") without the ignored-parameter warning
month <- p + background_grid(major = "xy") +
  geom_bar(colour = "skyblue2", fill = "skyblue1") +
  labs(x = "Month",
       y = "Number of tweets",
       caption = "first 10,000 tweets from @asianturfgrass")
month
save_plot("~/Desktop/month_tweets.png", month, base_aspect_ratio = 1.78)
# total by time of day only
# seconds elapsed since local midnight for every tweet; the unit is forced
# to seconds because plain `-` lets difftime choose secs/mins/hours from the
# data, and as.numeric() would then silently return a different unit
d$timeonly <- as.numeric(difftime(d$datetime, trunc(d$datetime, "days"),
                                  units = "secs"))
# place those seconds on a dummy day (1970-01-01) so a date-time axis can be
# used; tz = "UTC" makes the axis labels read as clock time since midnight
d$timeonly <- as.POSIXct(d$timeonly, origin = "1970-01-01", tz = "UTC")
p <- ggplot(data = d, aes(x = timeonly))
time <- p + background_grid(major = "xy") +
  scale_x_datetime(date_breaks = "3 hours",
                   date_labels = "%H:%M") +
  geom_histogram(colour = "skyblue2", fill = "skyblue1") +
  labs(x = "Time of day (in the Bangkok timezone)",
       y = "Number of tweets",
       caption = "first 10,000 tweets from @asianturfgrass")
time
save_plot("~/Desktop/hour_tweets.png", time, base_aspect_ratio = 1.78)
# type of tweet
# every row starts as a plain "tweet"; rows with a retweeted_status_id become
# "retweet", and rows with an in_reply_to_status_id become "reply" (assigned
# last, so it wins when both ids are present)
# the column is addressed by name so the result does not depend on how many
# columns precede it
d$type <- "tweet"
d$type[!is.na(d$retweeted_status_id)] <- "retweet"
d$type[!is.na(d$in_reply_to_status_id)] <- "reply"
# fixed level order: reply, retweet, tweet
d$type <- factor(d$type, levels = c("reply", "retweet", "tweet"))
# show proportion
# stacked histogram over time, normalised within each bin (position = "fill")
# so the bar heights show the share of each tweet type
p <- ggplot(d, aes(x = datetime))
proportion <- p +
  geom_histogram(aes(fill = type, colour = type),
                 position = "fill") +
  scale_fill_brewer(palette = "Blues") +
  scale_colour_brewer(palette = "Blues") +
  scale_x_datetime(date_labels = "%Y",
                   date_breaks = "1 year") +
  labs(caption = "first 10,000 tweets from @asianturfgrass",
       y = "Proportion of tweets",
       x = "Date") +
  background_grid(major = "xy") +
  theme(legend.position = "top",
        legend.title = element_blank())
proportion
save_plot("~/Desktop/proportion.png", proportion, base_aspect_ratio = 1.78)
# now look at the common words
# this from the Silge and Robinson Text Mining With R book
# and vignettes of the tidytext package
# https://www.tidytextmining.com/tidytext.html
library(tm)
library(tidytext)

# make sure the tweet text is character, then keep only the two columns
# needed for the word counts
d$text <- as.character(d$text)
tweet_text <- select(d, datetime, text)

# convert to one word per row
tweets_text <- tweet_text %>%
  unnest_tokens(word, text)

# filter out the stop words
# the join key is named explicitly so it does not depend on which column
# names the two tables happen to share
data(stop_words)
tweets_text <- tweets_text %>%
  anti_join(stop_words, by = "word")

# number of occurrences of each remaining word, most frequent first
words <- tweets_text %>%
  count(word, sort = TRUE)
# manually remove some non-words
# (URL fragments, the retweet marker, the HTML-entity remnant "amp" and a
# few bare numbers); rows whose word is NA are dropped as well
cleaned <- filter(
  words,
  !is.na(word),
  !(word %in% c("t.co", "http", "https", "rt", "amp",
                "2", "1", "3", "10", "4", "5"))
)
# plot the counts of words
# sort by descending count and keep the 50 most frequent entries; head()
# returns fewer rows when fewer than 50 words are available, instead of
# padding the table with NA rows the way cleaned[1:50, ] would
cleaned <- cleaned[order(-cleaned$n), ]
forPlot <- head(cleaned, 50)
# order the factor levels by count so the axis runs from least to most used
forPlot$word <- reorder(forPlot$word, forPlot$n)
p <- ggplot(data = forPlot, aes(x = n, y = word))
p + background_grid(major = "xy") +
  geom_point(shape = 21, colour = "skyblue2", fill = "skyblue2",
             size = 3) +
  labs(x = "times used",
       y = "word or username",
       caption = "first 10,000 tweets from @asianturfgrass",
       title = "Most frequently used words")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment