Skip to content

Instantly share code, notes, and snippets.

@manualwise
Forked from ttunguz/analyze_twitter_organic.r
Last active January 30, 2017 17:08
Show Gist options
  • Save manualwise/c92fda9d884f47f43794 to your computer and use it in GitHub Desktop.
Save manualwise/c92fda9d884f47f43794 to your computer and use it in GitHub Desktop.
### ABOUT THIS SCRIPT
# This script uses your Twitter Data and will output charts informing you about Engagement Rates, Click Activities etc. by "Time of Day" and "Day of Week".
# You can get your Twitter Data if you have an official Twitter Ads account: http://ads.twitter.com/user/*/tweets
# The above link only works if you already have a Twitter Ads account.
# More information can be found in the original blog post about this script (by Tomasz Tunguz): http://tomtunguz.com/twitter-best-practices/
### CONTRIBUTIONS
# Original author: Tomasz Tunguz – https://github.com/ttunguz
# Improved labeling and configuration, extended documentation: Clemens Kofler and Manuel Weiss – https://github.com/clemens + https://github.com/manualwise
### INSTRUCTIONS
# 1. Download this file and put it in the same folder as your tweet_activity_metrics.csv (e.g. /Users/username/Documents/twitter-data/)
# 2. If you need to install R you can download it here: http://cran.r-project.org/bin/
# 3. Go into your installation folder and start R
# 4. You need the following packages to run this script: ggplot2, reshape, plyr, scales. Install them by executing install.packages(c("ggplot2", "reshape", "plyr", "scales")) in the R prompt.
# 5. Edit the CONFIGURATION VARIABLES according to your needs
# Bonus: For each top_n_for_hour or top_n_for_day result, test for statistical significance by comparing the result of the top_n function with the result of the matrix t test
### THESE ARE YOUR INSTALLED R PACKAGES
library(ggplot2)
library(reshape)
library(plyr)
library(scales)
### CONFIGURATION VARIABLES
# Time zone the tweet timestamps are converted to before the analysis.
target_time_zone = "America/New_York"
# Folder that holds tweet_activity_metrics.csv; the charts are saved here as well.
data_directory = "/Users/username/Documents/twitter-data/" # Important: This must end with a /
# Inclusive range of hours of the day that is kept in the analysis.
start_hour = 0
end_hour = 23
### FUNCTIONS
# Returns the path of `filename` inside data_directory.
full_path = function(filename){
  paste0(data_directory, filename)
}
# Axis captions shared by the hourly and the daily charts.
x_axis_label_daily = "Day of week"
x_axis_label_hourly = sprintf("Time of day (%s)", target_time_zone)
#IMPORT DATA
# Load the exported tweet metrics; the timestamps in the file are interpreted as UTC.
data = read.csv(full_path("tweet_activity_metrics.csv"))
tweet_time_utc = as.POSIXct(data$time, tz="UTC")
# Read the hour of day directly from the timestamp converted to the target time
# zone, instead of formatting it to text and re-parsing that text in the
# session's local time zone.
data$hour = as.POSIXlt(tweet_time_utc, tz=target_time_zone)$hour
# data$time stays a text timestamp in the target time zone because the
# day-of-week section of this script parses it again.
data$time = format(tweet_time_utc, tz=target_time_zone)
# Keep only tweets posted within the configured (inclusive) hour range.
data = subset(data, hour >= start_hour & hour <= end_hour)
## BASICS
## GENERAL PLOTS FOR HOUR
# Scatter plot of each tweet's engagement rate against the hour it was posted.
# Columns are referenced by bare name inside aes() so they are looked up in the
# data frame handed to ggplot().
ggplot(data) +
  geom_point(aes(hour, engagement.rate), size=5, alpha=0.7, colour="red", position="jitter") +
  xlab("") +
  ylab("Engagement Rate") +
  ggtitle("Engagement Rate by Time of Day") +
  theme(panel.grid.major.y = element_line(colour="gray"), panel.grid.minor.x = element_blank(), plot.title = element_text(size= rel(2))) +
  theme(legend.text = element_text(size = 18), axis.text = element_text(size=24)) +
  expand_limits(y=0) +
  theme(panel.background = element_rect(fill = 'white'), axis.title.y=element_text(size=24)) +
  scale_colour_manual(values = c("red", "dodgerblue4")) +
  theme(legend.position="bottom") +
  theme(strip.text.x = element_text(size = 14), strip.background = element_rect(fill='white')) +
  theme(text=element_text(family="News Gothic MT", face="bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com", hjust=1.1, vjust=-1.1, col="gray", cex=6, fontface = "bold", alpha = 0.8) +
  scale_y_continuous(labels = percent_format())
# Saved under its own file name: the hourly engagement-rate bar chart later in
# this script writes engagementrate_by_timeofday.png and would otherwise
# overwrite this scatter plot.
ggsave(full_path("engagementrate_scatter_by_timeofday.png"), dpi=300, width=9, height=6)
## TIME OF DAY FUNCTIONS
# Returns the rows of df with the highest `average`, best first.
#   df - summary data frame with an `average` column
#   n  - maximum number of rows to return
# At most n rows are returned: when df has fewer than n rows the result is not
# padded with NA rows, and n = 0 yields an empty data frame.
top_n_for_hour = function(df, n){
  ordered = df[order(df$average, decreasing=TRUE), , drop=FALSE]
  return (head(ordered, n))
}
# Pairwise two-sample t-tests (t.test defaults) of `field` between posting hours.
#   df        - tweet-level data frame with an `hour` column and a column named by `field`
#   field     - name (string) of the metric column to compare
#   top_hours - hours of interest; only pairs where both hours are in this set are returned
#   hours     - hours to compare; defaults to the configured start_hour..end_hour
# Returns a long data frame with the columns hour1, variable (the two hours of a
# pair, as factors) and value (the p-value), restricted to pairs with a p-value
# below 0.05. Pairs that cannot be tested are left out.
time_of_day_t = function(df, field, top_hours, hours = start_hour:end_hour){
  hour_labels = as.character(hours)
  n_hours = length(hours)
  # Collect each hour's observations once instead of re-subsetting inside the loops.
  samples = lapply(hours, function(h) df[[field]][which(df$hour == h)])
  # p-value of the t-test, or NA when either sample has fewer than two
  # observations or t.test() cannot be computed (e.g. the data are constant).
  p_value_or_na = function(a, b){
    if (length(a) < 2 || length(b) < 2){
      return (NA_real_)
    }
    tryCatch(t.test(a, b)$p.value, error = function(e) NA_real_)
  }
  p_values = matrix(NA_real_, nrow = n_hours, ncol = n_hours)
  for (i in seq_len(n_hours)){
    for (j in seq_len(n_hours)){
      p_values[i, j] = p_value_or_na(samples[[i]], samples[[j]])
    }
  }
  # Long format, labelled with the actual hour values (hour 0 included) so that
  # the top_hours filters below match the right rows and columns.
  x = data.frame(
    hour1 = factor(rep(hour_labels, times = n_hours)),
    variable = factor(rep(hour_labels, each = n_hours), levels = hour_labels),
    value = as.vector(p_values)
  )
  x = subset(x, value < 0.05)
  x = subset(x, hour1 %in% top_hours)
  x = subset(x, variable %in% top_hours)
  return (x)
}
# Flags the rows of a summary data frame that belong to the top hours or days.
#   hourly_data - summary data frame keyed by an `hour` or a `weekday` column
#   top_hours   - key values to flag
#   key         - name of the key column; by default `hour` when present,
#                 otherwise `weekday` (the day-of-week summaries in this script
#                 carry `weekday`, not `hour`)
# Returns hourly_data with a numeric `top` column: 1 for flagged rows, 0 otherwise.
# When no key column can be found every row keeps top = 0.
designate_top = function(hourly_data, top_hours, key = NULL){
  if (is.null(key)){
    key = intersect(c("hour", "weekday"), names(hourly_data))[1]
  }
  hourly_data$top = rep(0, nrow(hourly_data))
  if (!is.na(key) && key %in% names(hourly_data)){
    hourly_data$top[hourly_data[[key]] %in% top_hours] = 1
  }
  return (hourly_data)
}
## RETWEETS BY HOUR
# Mean retweets and number of posts for every hour of the day.
rt_by_hour = ddply(
  data, .(hour), summarise,
  average = mean(retweets),
  count_posts = length(Tweet.id)
)
# The three hours with the highest mean retweets are flagged for the chart and
# compared against each other with pairwise t-tests.
top_retweet_hours = top_n_for_hour(rt_by_hour, 3)$hour
rt_by_hour = designate_top(rt_by_hour, top_retweet_hours)
retweet_t = time_of_day_t(data, "retweets", top_retweet_hours)
## Note from the original author's data set: 8 and 9 are the best hours
# Bar chart of mean retweets per hour; the fill colour follows the `top` flag.
ggplot(rt_by_hour) +
  geom_bar(aes(hour, average, fill=top), stat="identity") +
  xlab(x_axis_label_hourly) +
  ylab("Retweets per Post") +
  ggtitle("Best Time of Day to Maximize RT") +
  theme(panel.grid.major.y = element_line(colour="gray"), panel.grid.minor.x = element_blank(), plot.title = element_text(size= rel(2))) +
  theme(legend.text = element_text(size = 18), axis.text = element_text(size=24)) +
  expand_limits(y=0) +
  theme(panel.background = element_rect(fill = 'white'), axis.title.y=element_text(size=24)) +
  scale_fill_gradient2(low="dodgerblue", high="red", mid="dodgerblue") +
  theme(legend.position="bottom") +
  theme(strip.text.x = element_text(size = 14), strip.background = element_rect(fill='white')) +
  theme(text=element_text(family="News Gothic MT", face="bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com", hjust=1.1, vjust=-1.1, col="gray", cex=6, fontface = "bold", alpha = 0.8) +
  theme(legend.position="none")
ggsave(full_path("maximum_retweets_by_timeofday.png"), dpi=300, width=12, height=9)
## IMPRESSIONS BY HOUR
# Mean impressions and number of posts for every hour of the day.
imp_by_hour = ddply(
  data, .(hour), summarise,
  average = mean(impressions),
  count = length(Tweet.id)
)
# The five hours with the highest mean impressions are flagged and t-tested.
top_imp_hours = top_n_for_hour(imp_by_hour, 5)$hour
imp_by_hour = designate_top(imp_by_hour, top_imp_hours)
imp_t = time_of_day_t(data, "impressions", top_imp_hours)
# Bar chart of mean impressions per hour; the fill colour follows the `top` flag.
ggplot(imp_by_hour) +
  geom_bar(aes(hour, average, fill=top), stat="identity") +
  xlab(x_axis_label_hourly) +
  ylab("Impressions") +
  ggtitle("Best Time of Day to Maximize Impressions") +
  theme(panel.grid.major.y = element_line(colour="gray"), panel.grid.minor.x = element_blank(), plot.title = element_text(size= rel(2))) +
  theme(legend.text = element_text(size = 18), axis.text = element_text(size=24)) +
  expand_limits(y=0) +
  theme(panel.background = element_rect(fill = 'white'), axis.title.y=element_text(size=24)) +
  scale_colour_manual(values = c("red", "dodgerblue4")) +
  theme(legend.position="bottom") +
  theme(strip.text.x = element_text(size = 14), strip.background = element_rect(fill='white')) +
  theme(text=element_text(family="News Gothic MT", face="bold")) +
  scale_fill_gradient2(low="dodgerblue", high="red", mid="dodgerblue") +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com", hjust=1.1, vjust=-1.1, col="gray", cex=6, fontface = "bold", alpha = 0.8) +
  theme(legend.position="none")
ggsave(full_path("maximum_impressions_by_timeofday.png"), dpi=300, width=12, height=9)
## ENGAGEMENT RATE BY HOUR
# Mean engagement rate for every hour of the day.
er_by_hour = ddply(
  data, .(hour), summarise,
  average = mean(engagement.rate)
)
# The eleven hours with the highest mean engagement rate are flagged and t-tested.
top_er_hours = top_n_for_hour(er_by_hour, 11)$hour
er_by_hour = designate_top(er_by_hour, top_er_hours)
er_t = time_of_day_t(data, "engagement.rate", top_er_hours)
# Bar chart of the mean engagement rate per hour, y axis labelled as percentages.
ggplot(er_by_hour) +
  geom_bar(aes(hour, average, fill=top), stat="identity") +
  xlab(x_axis_label_hourly) +
  ylab("Engagement Rate") +
  ggtitle("Best Time of Day to Maximize Engagement Rate") +
  theme(panel.grid.major.y = element_line(colour="gray"), panel.grid.minor.x = element_blank(), plot.title = element_text(size= rel(2))) +
  theme(legend.text = element_text(size = 18), axis.text = element_text(size=24)) +
  expand_limits(y=0) +
  theme(panel.background = element_rect(fill = 'white'), axis.title.y=element_text(size=24)) +
  theme(legend.position="bottom") +
  theme(strip.text.x = element_text(size = 14), strip.background = element_rect(fill='white')) +
  theme(text=element_text(family="News Gothic MT", face="bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com", hjust=1.1, vjust=-1.1, col="gray", cex=6, fontface = "bold", alpha = 0.8) +
  scale_fill_gradient2(low="dodgerblue", high="red", mid="dodgerblue") +
  theme(legend.position="none") +
  scale_y_continuous(labels = percent_format())
ggsave(full_path("engagementrate_by_timeofday.png"), dpi=300, width=12, height=9)
## URL CLICKS BY HOUR
# Per-tweet click-through rate: URL clicks divided by impressions.
data$url.ctr = with(data, url.clicks/impressions)
# Click rate per hour, pooled over all tweets of that hour
# (total clicks / total impressions); stored as both `average` and `url.ctr`.
url_by_hour = ddply(
  data, .(hour), summarise,
  average = sum(url.clicks)/sum(impressions),
  url.ctr = sum(url.clicks)/sum(impressions)
)
# The four hours with the highest pooled click rate are flagged; the t-tests
# compare the per-tweet click-through rates.
top_url_hours = top_n_for_hour(url_by_hour, 4)$hour
url_by_hour = designate_top(url_by_hour, top_url_hours)
url_t = time_of_day_t(data, "url.ctr", top_url_hours)
# Bar chart of the click rate per hour, y axis labelled as percentages.
ggplot(url_by_hour) +
  geom_bar(aes(hour, average, fill=top), stat="identity") +
  xlab(x_axis_label_hourly) +
  ylab("Click Rate") +
  ggtitle("Best Time of Day to Maximize Click Rate") +
  theme(panel.grid.major.y = element_line(colour="gray"), panel.grid.minor.x = element_blank(), plot.title = element_text(size= rel(2))) +
  theme(legend.text = element_text(size = 18), axis.text = element_text(size=24)) +
  expand_limits(y=0) +
  theme(panel.background = element_rect(fill = 'white'), axis.title.y=element_text(size=24)) +
  theme(legend.position="bottom") +
  theme(strip.text.x = element_text(size = 14), strip.background = element_rect(fill='white')) +
  theme(text=element_text(family="News Gothic MT", face="bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com", hjust=1.1, vjust=-1.1, col="gray", cex=6, fontface = "bold", alpha = 0.8) +
  scale_fill_gradient2(low="dodgerblue", high="red", mid="dodgerblue") +
  theme(legend.position="none") +
  scale_y_continuous(labels = percent_format())
ggsave(full_path("url_clicks_by_timeofday.png"), dpi=300, width=12, height=9)
## FOLLOWS BY HOUR
# Mean number of follows per tweet for every hour of the day.
follows_by_hour = ddply(data, .(hour), summarise, average = mean(follows))
# The four hours with the highest mean follows are flagged and t-tested.
top_follows_hours = top_n_for_hour(follows_by_hour, 4)$hour
follows_by_hour = designate_top(follows_by_hour, top_follows_hours)
follows_t = time_of_day_t(data, "follows", top_follows_hours)
# Bar chart of mean follows per hour. `average` is a mean count of follows, not
# a ratio, so the y axis keeps plain numeric labels (a percent scale would show
# e.g. 0.5 follows per post as "50%").
ggplot(follows_by_hour) +
  geom_bar(aes(hour, average, fill=top), stat="identity") +
  xlab(x_axis_label_hourly) +
  ylab("Follows") +
  ggtitle("Best Time of Day to New Follows") +
  theme(panel.grid.major.y = element_line(colour="gray"), panel.grid.minor.x = element_blank(), plot.title = element_text(size= rel(2))) +
  theme(legend.text = element_text(size = 18), axis.text = element_text(size=24)) +
  expand_limits(y=0) +
  theme(panel.background = element_rect(fill = 'white'), axis.title.y=element_text(size=24)) +
  theme(legend.position="bottom") +
  theme(strip.text.x = element_text(size = 14), strip.background = element_rect(fill='white')) +
  theme(text=element_text(family="News Gothic MT", face="bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com", hjust=1.1, vjust=-1.1, col="gray", cex=6, fontface = "bold", alpha = 0.8) +
  scale_fill_gradient2(low="dodgerblue", high="red", mid="dodgerblue") +
  theme(legend.position="none")
ggsave(full_path("follow_rate_by_timeofday.png"), dpi=300, width=12, height=9)
### DAY OF WEEK ANALYSIS
## DAY OF WEEK FUNCTIONS
# Calendar date of each tweet, parsed from the timestamp text in data$time.
tweet_dates = as.Date(data$time)
# Weekday name of each tweet.
data$day = weekdays(tweet_dates)
# Numeric weekday taken from the wday field of POSIXlt.
data$weekday = as.POSIXlt(data$time)$wday
# Pairwise two-sample t-tests (t.test defaults) of `field` between days of the week.
#   df       - tweet-level data frame with a numeric `weekday` column (0-6, as
#              produced by as.POSIXlt()$wday) and a column named by `field`
#   field    - name (string) of the metric column to compare
#   top_days - weekdays of interest (0-6); only pairs where both days are in
#              this set are returned
# Returns a long data frame with the columns day, variable (the two weekdays of
# a pair, as factors labelled 0-6) and value (the p-value), restricted to pairs
# with a p-value below 0.05. Pairs that cannot be tested are left out.
weekday_t = function(df, field, top_days){
  days = 0:6
  day_labels = as.character(days)
  n_days = length(days)
  # Both sides of every comparison are selected on the numeric `weekday` column.
  samples = lapply(days, function(d) df[[field]][which(df$weekday == d)])
  # p-value of the t-test, or NA when either sample has fewer than two
  # observations or t.test() cannot be computed (e.g. the data are constant).
  p_value_or_na = function(a, b){
    if (length(a) < 2 || length(b) < 2){
      return (NA_real_)
    }
    tryCatch(t.test(a, b)$p.value, error = function(e) NA_real_)
  }
  p_values = matrix(NA_real_, nrow = n_days, ncol = n_days)
  for (i in seq_len(n_days)){
    for (j in seq_len(n_days)){
      p_values[i, j] = p_value_or_na(samples[[i]], samples[[j]])
    }
  }
  # Long format, labelled 0-6 so the labels line up with the values in top_days.
  x = data.frame(
    day = factor(rep(day_labels, times = n_days)),
    variable = factor(rep(day_labels, each = n_days), levels = day_labels),
    value = as.vector(p_values)
  )
  x = subset(x, value < 0.05)
  x = subset(x, day %in% top_days)
  x = subset(x, variable %in% top_days)
  return (x)
}
# Returns the rows of df with the highest `average`, best first.
#   df - summary data frame with an `average` column
#   n  - maximum number of rows to return
# At most n rows are returned: when df has fewer than n rows (e.g. no tweets on
# some weekday) the result is not padded with NA rows, and n = 0 yields an
# empty data frame.
top_n_for_day = function(df, n){
  ordered = df[order(df$average, decreasing=TRUE), , drop=FALSE]
  return (head(ordered, n))
}
## PLOT HIGH LEVEL DATA
# Scatter plot of each tweet's engagement rate by numeric weekday.
# Columns are referenced by bare name inside aes() so they are looked up in the
# data frame handed to ggplot(). No ggsave() call follows these two plots, so
# they are not written to disk by this script.
ggplot(data) +
  geom_point(aes(weekday, engagement.rate), size=5, alpha=0.7, colour="dodgerblue", position="jitter") +
  xlab("") +
  ylab("Engagement Rate") +
  ggtitle("Engagement Rate by Day of Week") +
  theme(panel.grid.major.y = element_line(colour="gray"), panel.grid.minor.x = element_blank(), plot.title = element_text(size= rel(2))) +
  theme(legend.text = element_text(size = 18), axis.text = element_text(size=24)) +
  expand_limits(y=0) +
  theme(panel.background = element_rect(fill = 'white'), axis.title.y=element_text(size=24)) +
  scale_colour_manual(values = c("red", "dodgerblue4")) +
  theme(legend.position="bottom") +
  theme(strip.text.x = element_text(size = 14), strip.background = element_rect(fill='white')) +
  theme(text=element_text(family="News Gothic MT", face="bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com", hjust=1.1, vjust=-1.1, col="gray", cex=6, fontface = "bold", alpha = 0.8) +
  scale_y_continuous(labels = percent_format())
# Box plot of impressions per weekday. The outlier colour is a fixed setting of
# the geom (outlier.colour), not an aesthetic mapping, so it is given only
# outside aes().
ggplot(data) +
  geom_boxplot(aes(weekday, impressions, group=weekday), fill="orange", colour="gray", outlier.colour="gray50", outlier.size=3) +
  xlab("") +
  ylab("Impressions") +
  ggtitle("Impressions by Day of Week") +
  theme(panel.grid.major.y = element_line(colour="gray"), panel.grid.minor.x = element_blank(), plot.title = element_text(size= rel(2))) +
  theme(legend.text = element_text(size = 18), axis.text = element_text(size=24)) +
  expand_limits(y=0) +
  theme(panel.background = element_rect(fill = 'white'), axis.title.y=element_text(size=24)) +
  scale_colour_manual(values = c("red", "dodgerblue4")) +
  theme(legend.position="bottom") +
  theme(strip.text.x = element_text(size = 14), strip.background = element_rect(fill='white')) +
  theme(text=element_text(family="News Gothic MT", face="bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com", hjust=1.1, vjust=-1.1, col="gray", cex=6, fontface = "bold", alpha = 0.8) +
  scale_y_continuous(labels = comma_format())
## URL CLICKS BY DAY
# Per-tweet click-through rate: URL clicks divided by impressions.
data$url.ctr = with(data, url.clicks/impressions)
# Click rate per weekday, pooled over all tweets of that day
# (total clicks / total impressions), plus the number of tweets per day.
url_by_day = ddply(
  data, .(weekday), summarise,
  average = sum(url.clicks)/sum(impressions),
  url.ctr = sum(url.clicks)/sum(impressions),
  count = length(url.clicks)
)
# Up to seven days are requested, ordered by click rate.
top_url_day = top_n_for_day(url_by_day, 7)$weekday
url_by_day = designate_top(url_by_day, top_url_day)
day_url_t = weekday_t(data, "url.ctr", top_url_day)
# Bar chart of the click rate per weekday, y axis labelled as percentages.
ggplot(url_by_day) +
  geom_bar(aes(weekday, average, fill=top), stat="identity") +
  xlab(x_axis_label_daily) +
  ylab("Click Rate") +
  ggtitle("Best Day of Week to Maximize Click Rate") +
  theme(panel.grid.major.y = element_line(colour="gray"), panel.grid.minor.x = element_blank(), plot.title = element_text(size= rel(2))) +
  theme(legend.text = element_text(size = 18), axis.text = element_text(size=24)) +
  expand_limits(y=0) +
  theme(panel.background = element_rect(fill = 'white'), axis.title.y=element_text(size=24)) +
  theme(legend.position="bottom") +
  theme(strip.text.x = element_text(size = 14), strip.background = element_rect(fill='white')) +
  theme(text=element_text(family="News Gothic MT", face="bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com", hjust=1.1, vjust=-1.1, col="gray", cex=6, fontface = "bold", alpha = 0.8) +
  scale_fill_gradient2(low="dodgerblue", high="red", mid="dodgerblue") +
  theme(legend.position="none") +
  scale_y_continuous(labels = percent_format())
ggsave(full_path("url_clicks_by_day.png"), dpi=300, width=12, height=9)
## ENGAGEMENT RATE BY DAY
# Mean engagement rate for every weekday.
er_by_day = ddply(
  data, .(weekday), summarise,
  average = mean(engagement.rate)
)
# Up to seven days are requested, ordered by mean engagement rate.
top_er_day = top_n_for_day(er_by_day, 7)$weekday
er_by_day = designate_top(er_by_day, top_er_day)
day_er_t = weekday_t(data, "engagement.rate", top_er_day)
# Bar chart of the mean engagement rate per weekday, y axis labelled as percentages.
ggplot(er_by_day) +
  geom_bar(aes(weekday, average, fill=top), stat="identity") +
  xlab(x_axis_label_daily) +
  ylab("Engagement Rate") +
  ggtitle("Best Day of Week to Maximize Engagement Rate") +
  theme(panel.grid.major.y = element_line(colour="gray"), panel.grid.minor.x = element_blank(), plot.title = element_text(size= rel(2))) +
  theme(legend.text = element_text(size = 18), axis.text = element_text(size=24)) +
  expand_limits(y=0) +
  theme(panel.background = element_rect(fill = 'white'), axis.title.y=element_text(size=24)) +
  theme(legend.position="bottom") +
  theme(strip.text.x = element_text(size = 14), strip.background = element_rect(fill='white')) +
  theme(text=element_text(family="News Gothic MT", face="bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com", hjust=1.1, vjust=-1.1, col="gray", cex=6, fontface = "bold", alpha = 0.8) +
  scale_fill_gradient2(low="dodgerblue", high="red", mid="dodgerblue") +
  theme(legend.position="none") +
  scale_y_continuous(labels = percent_format())
ggsave(full_path("engagementrate_by_day.png"), dpi=300, width=12, height=9)
## IMPRESSIONS BY DAY
# Mean impressions for every weekday.
imp_by_day = ddply(
  data, .(weekday), summarise,
  average = mean(impressions)
)
# Up to seven days are requested, ordered by mean impressions.
top_imp_day = top_n_for_day(imp_by_day, 7)$weekday
imp_by_day = designate_top(imp_by_day, top_imp_day)
day_imp_t = weekday_t(data, "impressions", top_imp_day)
# Bar chart of mean impressions per weekday, y axis labelled with thousands separators.
ggplot(imp_by_day) +
  geom_bar(aes(weekday, average, fill=top), stat="identity") +
  xlab(x_axis_label_daily) +
  ylab("Impressions") +
  ggtitle("Best Day of Week to Maximize Impressions") +
  theme(panel.grid.major.y = element_line(colour="gray"), panel.grid.minor.x = element_blank(), plot.title = element_text(size= rel(2))) +
  theme(legend.text = element_text(size = 18), axis.text = element_text(size=24)) +
  expand_limits(y=0) +
  theme(panel.background = element_rect(fill = 'white'), axis.title.y=element_text(size=24)) +
  theme(legend.position="bottom") +
  theme(strip.text.x = element_text(size = 14), strip.background = element_rect(fill='white')) +
  theme(text=element_text(family="News Gothic MT", face="bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com", hjust=1.1, vjust=-1.1, col="gray", cex=6, fontface = "bold", alpha = 0.8) +
  scale_fill_gradient2(low="dodgerblue", high="red", mid="dodgerblue") +
  theme(legend.position="none") +
  scale_y_continuous(labels = comma_format())
ggsave(full_path("maximum_impressions_by_day.png"), dpi=300, width=12, height=9)
### CORRELATIONS
# Pairwise correlations between reach and engagement metrics.
cor(data$impressions, data$engagement.rate)
cor(data$retweets, data$impressions)
cor(data$replies, data$impressions)
# Character count of every tweet's own text (not just the first tweet's length
# repeated for all rows). as.character() covers the case where read.csv
# imported the text column as a factor.
data$tweet.length = nchar(as.character(data$Tweet.text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment