Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Makes counts of tweets and words from an archive downloaded from Twitter.
# analyse my first 10000 tweets
# based on https://juliasilge.com/blog/ten-thousand-tweets/
library(ggplot2)
library(lubridate)
library(dplyr)
library(cowplot)
# read my tweet archive that I downloaded from Twitter
# stringsAsFactors = FALSE keeps text columns as character on every R
# version (pre-4.0 defaulted to TRUE, which forced as.character() calls
# on the timestamp and text columns further down)
d <- read.csv("data/tweets.csv",
              header = TRUE,
              stringsAsFactors = FALSE)
# parse the Twitter UTC timestamp and convert to the Bangkok timezone
d$datetime <- ymd_hms(as.character(d$timestamp))
d$datetime <- with_tz(d$datetime, tz = "Asia/Bangkok")
# derive the calendar components used by the plots below
d$year <- year(d$datetime)
d$month <- month(d$datetime)
d$day <- day(d$datetime)
d$hour <- hour(d$datetime)
# week_start = 1 makes Monday the first day of the week
d$weekday <- wday(d$datetime, week_start = 1,
                  label = TRUE)
d$date <- ymd(paste(d$year, d$month, d$day))
# order chronologically (tweet ids increase over time)
d <- d[with(d, order(tweet_id)), ]
# select the first 10,000 tweets; head() simply returns everything if
# the archive holds fewer rows, whereas d[1:1e4, ] would pad the data
# frame with NA rows
d <- head(d, 1e4)
# histogram of all tweets over time
total <- ggplot(data = d, aes(x = datetime)) +
  background_grid(major = "xy") +
  scale_x_datetime(date_breaks = "1 year", date_labels = "%Y") +
  geom_histogram(colour = "skyblue2", fill = "skyblue1") +
  labs(x = "Time",
       y = "Number of tweets",
       caption = "first 10,000 tweets from @asianturfgrass")
total
save_plot("~/Desktop/total_tweets.png", total, base_aspect_ratio = 1.78)
# tweets per year, for inspection at the console;
# n() is the idiomatic dplyr way to count rows per group
by_year <- d %>%
  group_by(year) %>%
  summarise(total = n())
# total by year; derive the axis breaks from the data instead of
# hard-coding 2011:2018 so the plot stays correct as years accrue
p <- ggplot(data = d, aes(x = year))
annual <- p + background_grid(major = "xy") +
  scale_x_continuous(breaks = seq(min(d$year), max(d$year))) +
  geom_histogram(colour = "skyblue2", fill = "skyblue1",
                 stat = "count") +
  labs(x = "Year",
       y = "Number of tweets",
       caption = "first 10,000 tweets from @asianturfgrass")
annual
save_plot("~/Desktop/annual_tweets.png", annual, base_aspect_ratio = 1.78)
# histogram of tweets by day of the week (Monday first, see wday above)
week <- ggplot(data = d, aes(x = weekday)) +
  background_grid(major = "xy") +
  geom_histogram(colour = "skyblue2", fill = "skyblue1",
                 stat = "count") +
  labs(x = "Day of week",
       y = "Number of tweets",
       caption = "first 10,000 tweets from @asianturfgrass")
week
save_plot("~/Desktop/week_tweets.png", week, base_aspect_ratio = 1.78)
# total by month, with labelled month names on the x axis
p <- ggplot(data = d, aes(x = month(datetime, label = TRUE)))
# named "monthly" rather than "month" so the plot object does not mask
# the lubridate month() function name in the global environment
monthly <- p + background_grid(major = "xy") +
  geom_histogram(colour = "skyblue2", fill = "skyblue1",
                 stat = "count") +
  labs(x = "Month",
       y = "Number of tweets",
       caption = "first 10,000 tweets from @asianturfgrass")
monthly
save_plot("~/Desktop/month_tweets.png", monthly, base_aspect_ratio = 1.78)
# total by time of day only
# subtract each tweet's midnight (trunc to "days") to get elapsed time
# since the start of that day; as.numeric() drops the difftime class
# NOTE(review): the difftime unit is auto-chosen by R; this assumes it
# comes out in seconds — confirm, or set units = "secs" explicitly
d$timeonly <- as.numeric(d$datetime - trunc(d$datetime, "days"))
# re-class the seconds-since-midnight number as POSIXct (a clock time
# on 1970-01-01) so scale_x_datetime() can format the axis as %H:%M
class(d$timeonly) <- "POSIXct"
p <- ggplot(data = d, aes(x = timeonly))
time <- p + background_grid(major = "xy") +
# scale_x_continuous(breaks = 1:12) +
scale_x_datetime(date_breaks = "3 hours",
date_labels = "%H:%M") +
geom_histogram(colour = "skyblue2", fill = "skyblue1") +
labs(x = "Time of day (in the Bangkok timezone)",
y = "Number of tweets",
caption = "first 10,000 tweets from @asianturfgrass")
time
save_plot("~/Desktop/hour_tweets.png", time, base_aspect_ratio = 1.78)
# classify each row as a plain tweet, a retweet, or a reply
d$type <- "tweet"
# write to the named column rather than positional index 19 (the
# original indexed column 19, which silently breaks if columns shift)
d$type[!is.na(d$retweeted_status_id)] <- "retweet"
d$type[!is.na(d$in_reply_to_status_id)] <- "reply"
# factor with default (alphabetical) level order: reply, retweet, tweet;
# the original's relevel with c(1, 2, 3) was a no-op and is dropped
d$type <- as.factor(d$type)
# show the proportion of each tweet type over time
p <- ggplot(data = d, aes(x = datetime))
proportion <- p + background_grid(major = "xy") +
  geom_histogram(position = "fill",
                 aes(colour = type, fill = type)) +
  scale_x_datetime(date_breaks = "1 year",
                   date_labels = "%Y") +
  scale_colour_brewer(palette = "Blues") +
  scale_fill_brewer(palette = "Blues") +
  labs(x = "Date",
       y = "Proportion of tweets",
       caption = "first 10,000 tweets from @asianturfgrass") +
  theme(legend.title = element_blank(),
        legend.position = "top")
proportion
save_plot("~/Desktop/proportion.png", proportion, base_aspect_ratio = 1.78)
# now look at the common words
# this from the Silge and Robinson Text Mining With R book
# and vignettes of the tidytext package
# https://www.tidytextmining.com/tidytext.html
library(tm)
library(tidytext)
# ensure the tweet text is plain character before tokenizing
d$text <- as.character(d$text)
tweet_text <- select(d, datetime, text)
# tokenize: one word per row
tweets_text <- unnest_tokens(tweet_text, word, text)
# drop stop words via an anti-join against the tidytext lexicon
data(stop_words)
tweets_text <- anti_join(tweets_text, stop_words)
# word frequencies, most common first
words <- count(tweets_text, word, sort = TRUE)
# manually remove some non-words (URL fragments, "rt", "amp", digits);
# a single %in% test replaces the original chain of eleven != clauses
non_words <- c("t.co", "http", "https", "rt", "amp",
               "2", "1", "3", "10", "4", "5")
cleaned <- filter(words, !word %in% non_words)
# plot the counts of the 50 most frequent words
cleaned <- cleaned[with(cleaned, order(-n)), ]
# head() returns fewer rows when < 50 distinct words remain, whereas
# cleaned[1:50, ] would pad the result with NA rows
forPlot <- head(cleaned, 50)
# order the factor by count so the dot plot reads top-down
forPlot$word <- reorder(forPlot$word, forPlot$n)
p <- ggplot(data = forPlot, aes(x = n, y = word))
p + background_grid(major = "xy") +
  geom_point(shape = 21, colour = "skyblue2", fill = "skyblue2",
             size = 3) +
  labs(x = "times used",
       y = "word or username",
       caption = "first 10,000 tweets from @asianturfgrass",
       title = "Most frequently used words")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment