Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Makes counts of tweets and words from an archive downloaded from Twitter.
# analyse my first 10000 tweets
# based on https://juliasilge.com/blog/ten-thousand-tweets/
library(ggplot2)
library(lubridate)
library(dplyr)
library(cowplot)
# read my tweet archive that I downloaded from Twitter
# stringsAsFactors = FALSE keeps text columns as character on every R
# version (pre-4.0 defaulted to TRUE, which forced as.character() calls
# on the timestamp and text columns further down)
d <- read.csv("data/tweets.csv",
              header = TRUE,
              stringsAsFactors = FALSE)
# parse the Twitter UTC timestamp and convert to the Bangkok timezone
d$datetime <- ymd_hms(as.character(d$timestamp))
d$datetime <- with_tz(d$datetime, tz = "Asia/Bangkok")
# derive the calendar components used by the plots below
d$year <- year(d$datetime)
d$month <- month(d$datetime)
d$day <- day(d$datetime)
d$hour <- hour(d$datetime)
# week_start = 1 makes Monday the first day of the week
d$weekday <- wday(d$datetime, week_start = 1,
                  label = TRUE)
d$date <- ymd(paste(d$year, d$month, d$day))
# order chronologically (tweet ids increase over time)
d <- d[with(d, order(tweet_id)), ]
# select the first 10,000 tweets; head() simply returns everything if
# the archive holds fewer rows, whereas d[1:1e4, ] would pad the data
# frame with NA rows
d <- head(d, 1e4)
# histogram of all tweets over time
total <- ggplot(data = d, aes(x = datetime)) +
  background_grid(major = "xy") +
  scale_x_datetime(date_breaks = "1 year", date_labels = "%Y") +
  geom_histogram(colour = "skyblue2", fill = "skyblue1") +
  labs(x = "Time",
       y = "Number of tweets",
       caption = "first 10,000 tweets from @asianturfgrass")
total
save_plot("~/Desktop/total_tweets.png", total, base_aspect_ratio = 1.78)
# tweets per year, for inspection at the console;
# n() is the idiomatic dplyr way to count rows per group
by_year <- d %>%
  group_by(year) %>%
  summarise(total = n())
# total by year; derive the axis breaks from the data instead of
# hard-coding 2011:2018 so the plot stays correct as years accrue
p <- ggplot(data = d, aes(x = year))
annual <- p + background_grid(major = "xy") +
  scale_x_continuous(breaks = seq(min(d$year), max(d$year))) +
  geom_histogram(colour = "skyblue2", fill = "skyblue1",
                 stat = "count") +
  labs(x = "Year",
       y = "Number of tweets",
       caption = "first 10,000 tweets from @asianturfgrass")
annual
save_plot("~/Desktop/annual_tweets.png", annual, base_aspect_ratio = 1.78)
# histogram of tweets by day of the week (Monday first, see wday above)
week <- ggplot(data = d, aes(x = weekday)) +
  background_grid(major = "xy") +
  geom_histogram(colour = "skyblue2", fill = "skyblue1",
                 stat = "count") +
  labs(x = "Day of week",
       y = "Number of tweets",
       caption = "first 10,000 tweets from @asianturfgrass")
week
save_plot("~/Desktop/week_tweets.png", week, base_aspect_ratio = 1.78)
# total by month, with labelled month names on the x axis
p <- ggplot(data = d, aes(x = month(datetime, label = TRUE)))
# named "monthly" rather than "month" so the plot object does not mask
# the lubridate month() function name in the global environment
monthly <- p + background_grid(major = "xy") +
  geom_histogram(colour = "skyblue2", fill = "skyblue1",
                 stat = "count") +
  labs(x = "Month",
       y = "Number of tweets",
       caption = "first 10,000 tweets from @asianturfgrass")
monthly
save_plot("~/Desktop/month_tweets.png", monthly, base_aspect_ratio = 1.78)
# total by time of day only
# subtract each tweet's midnight (trunc to "days") to get elapsed time
# since the start of that day; as.numeric() drops the difftime class
# NOTE(review): the difftime unit is auto-chosen by R; this assumes it
# comes out in seconds — confirm, or set units = "secs" explicitly
d$timeonly <- as.numeric(d$datetime - trunc(d$datetime, "days"))
# re-class the seconds-since-midnight number as POSIXct (a clock time
# on 1970-01-01) so scale_x_datetime() can format the axis as %H:%M
class(d$timeonly) <- "POSIXct"
p <- ggplot(data = d, aes(x = timeonly))
time <- p + background_grid(major = "xy") +
# scale_x_continuous(breaks = 1:12) +
scale_x_datetime(date_breaks = "3 hours",
date_labels = "%H:%M") +
geom_histogram(colour = "skyblue2", fill = "skyblue1") +
labs(x = "Time of day (in the Bangkok timezone)",
y = "Number of tweets",
caption = "first 10,000 tweets from @asianturfgrass")
time
save_plot("~/Desktop/hour_tweets.png", time, base_aspect_ratio = 1.78)
# classify each row as a plain tweet, a retweet, or a reply
d$type <- "tweet"
# write to the named column rather than positional index 19 (the
# original indexed column 19, which silently breaks if columns shift)
d$type[!is.na(d$retweeted_status_id)] <- "retweet"
d$type[!is.na(d$in_reply_to_status_id)] <- "reply"
# factor with default (alphabetical) level order: reply, retweet, tweet;
# the original's relevel with c(1, 2, 3) was a no-op and is dropped
d$type <- as.factor(d$type)
# show the proportion of each tweet type over time
p <- ggplot(data = d, aes(x = datetime))
proportion <- p + background_grid(major = "xy") +
  geom_histogram(position = "fill",
                 aes(colour = type, fill = type)) +
  scale_x_datetime(date_breaks = "1 year",
                   date_labels = "%Y") +
  scale_colour_brewer(palette = "Blues") +
  scale_fill_brewer(palette = "Blues") +
  labs(x = "Date",
       y = "Proportion of tweets",
       caption = "first 10,000 tweets from @asianturfgrass") +
  theme(legend.title = element_blank(),
        legend.position = "top")
proportion
save_plot("~/Desktop/proportion.png", proportion, base_aspect_ratio = 1.78)
# now look at the common words
# this from the Silge and Robinson Text Mining With R book
# and vignettes of the tidytext package
# https://www.tidytextmining.com/tidytext.html
library(tm)
library(tidytext)
# ensure the tweet text is plain character before tokenizing
d$text <- as.character(d$text)
tweet_text <- select(d, datetime, text)
# tokenize: one word per row
tweets_text <- unnest_tokens(tweet_text, word, text)
# drop stop words via an anti-join against the tidytext lexicon
data(stop_words)
tweets_text <- anti_join(tweets_text, stop_words)
# word frequencies, most common first
words <- count(tweets_text, word, sort = TRUE)
# manually remove some non-words (URL fragments, "rt", "amp", digits);
# a single %in% test replaces the original chain of eleven != clauses
non_words <- c("t.co", "http", "https", "rt", "amp",
               "2", "1", "3", "10", "4", "5")
cleaned <- filter(words, !word %in% non_words)
# plot the counts of the 50 most frequent words
cleaned <- cleaned[with(cleaned, order(-n)), ]
# head() returns fewer rows when < 50 distinct words remain, whereas
# cleaned[1:50, ] would pad the result with NA rows
forPlot <- head(cleaned, 50)
# order the factor by count so the dot plot reads top-down
forPlot$word <- reorder(forPlot$word, forPlot$n)
p <- ggplot(data = forPlot, aes(x = n, y = word))
p + background_grid(major = "xy") +
  geom_point(shape = 21, colour = "skyblue2", fill = "skyblue2",
             size = 3) +
  labs(x = "times used",
       y = "word or username",
       caption = "first 10,000 tweets from @asianturfgrass",
       title = "Most frequently used words")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment