Skip to content

Instantly share code, notes, and snippets.

@jwinternheimer
Created July 13, 2015 07:34
Show Gist options
  • Save jwinternheimer/3ccdf7c07fc0f1f524a3 to your computer and use it in GitHub Desktop.
Save jwinternheimer/3ccdf7c07fc0f1f524a3 to your computer and use it in GitHub Desktop.
Update Text Analysis
library(data.table); library(dplyr); library(tidyr)
library(ggplot2); library(scales); library(grid); library(RColorBrewer)
## Import Dataset
## Read the raw export, then overwrite the CSV's own header with the 18
## column names that the rest of this script refers to.
updates <- read.csv("~/Downloads/updates.csv", header = TRUE)
names(updates) <- c(
  "update_id", "profile_id", "num_chars", "num_explanation", "num_hashtags",
  "hour_sent", "num_followers", "follower_tier", "link", "link_no_content",
  "photo", "text", "interactions", "retweets", "favorites", "mentions",
  "clicks", "avg_interaction_length"
)
## Clean the text and count characters
## NOTE: clean.text() is defined near the bottom of this file, so that
## definition has to be evaluated before this statement can run.
## engagement_per_follower divides by num_followers without a guard, so rows
## with num_followers == 0 come out as Inf or NaN; the grouped summaries later
## in the script filter on num_followers > 0 before averaging.
updates <- mutate(
  updates,
  clean_text = clean.text(text),
  text_length = nchar(as.character(text)),
  clean_text_length = nchar(as.character(clean_text)),
  engagement = retweets + favorites + mentions,
  engagement_per_follower = engagement / num_followers
)
## Set hour variable as a factor
updates[["hour_sent"]] <- as.factor(updates[["hour_sent"]])
## Manually create text length tiers
## Buckets text_length into 20-character-wide tiers T1..T7 covering 0-140.
## Anything longer than 140 characters belongs to none of the tiers and is
## labelled "Undefined" (the same fallback clean_text_length_tier uses) rather
## than being folded into the shortest tier. NA lengths stay NA.
updates <- updates %>%
  mutate(text_length_tier =
    ifelse(text_length < 20, "T1 (0,20)",
    ifelse(text_length < 40, "T2 [20,40)",
    ifelse(text_length < 60, "T3 [40,60)",
    ifelse(text_length < 80, "T4 [60,80)",
    ifelse(text_length < 100, "T5 [80,100)",
    ifelse(text_length < 120, "T6 [100,120)",
    ifelse(text_length <= 140, "T7 [120,140]", "Undefined"))))))))
## Manually create tiers for clean text length
## Lengths above 140 are labelled "Undefined". Every other length is looked up
## in the tier table: findInterval() counts how many of the lower bounds
## 20, 40, ..., 120 the length has reached (0-6), and adding one turns that
## count into an index into the seven labels. NA lengths stay NA.
updates <- updates %>%
  mutate(clean_text_length_tier = ifelse(
    clean_text_length > 140,
    "Undefined",
    c("T1 (0,20)", "T2 [20,40)", "T3 [40,60)", "T4 [60,80)",
      "T5 [80,100)", "T6 [100,120)", "T7 [120,140]")[
      findInterval(clean_text_length, seq(20, 120, by = 20)) + 1L
    ]
  ))
## Add Clean Text Length Tier as Factor
## Both tier columns are converted from character labels to factors.
updates$text_length_tier <- as.factor(updates$text_length_tier)
updates$clean_text_length_tier <- as.factor(updates$clean_text_length_tier)
## Save Data
## Persist the enriched data frame under the object name `updates`.
save(list = "updates", file = "~/Google Drive/R_data/updates.Rda")
#################################################
## Engagement Visualization
#################################################
## Engagement CDF
## Empirical CDF of total engagement over all updates; coord_cartesian() only
## zooms the x axis to 0-200, it does not drop rows.
## NOTE: `engagement_cdf` is assigned again in the Links section further down,
## so this plot is replaced when the whole script is run top to bottom.
engagement_cdf <- ggplot(data = updates, mapping = aes(x = engagement)) +
  stat_ecdf(color = "#547c9f", size = 1) +
  scale_y_continuous(breaks = seq(0, 1, by = 0.2)) +
  coord_cartesian(xlim = c(0, 200)) +
  labs(title = "Engagement CDF", x = "Engagement", y = "Percent of Tweets") +
  theme_minimal()
## Engagement CDF by Follower Tier
## One ECDF curve per follower tier; rows in the "T00 (-inf,1)" tier are
## excluded. The x axis is zoomed (not filtered) to 0-200.
engagement_cdf_followers <- ggplot(
  filter(updates, follower_tier != "T00 (-inf,1)"),
  aes(x = engagement, color = follower_tier)
) +
  stat_ecdf(size = 1) +
  theme_minimal() +
  coord_cartesian(xlim = c(0, 200)) +
  scale_y_continuous(breaks = seq(0, 1, 0.2)) +
  labs(x = "Engagement", y = "Percent of Tweets", title = "Engagement CDF")
## Engagement Per Follower CDF
## ECDF of engagement_per_follower over all updates, zoomed to 0-0.015.
EpF_cdf <- ggplot(data = updates, mapping = aes(x = engagement_per_follower)) +
  stat_ecdf(color = "#547c9f", size = 1) +
  scale_y_continuous(breaks = seq(0, 1, by = 0.2)) +
  coord_cartesian(xlim = c(0, 0.015)) +
  labs(
    title = "Engagement Per Follower CDF",
    x = "Engagement Per Follower",
    y = "Percent of Tweets"
  ) +
  theme_minimal()
## Engagement Per Follower Density
## Uses buffer_theme(), which is defined at the bottom of this file and must
## be evaluated first. The trailing theme() call overrides the base text size
## set by buffer_theme(), so it has to stay after it.
EpF_density <- ggplot(data = updates, mapping = aes(x = engagement_per_follower)) +
  geom_density(color = "#547c9f", fill = "#547c9f", alpha = 0.8) +
  scale_x_continuous(limits = c(0, 0.005), expand = c(0, 0.0001)) +
  scale_y_continuous(breaks = seq(0, 1500, by = 500), expand = c(0, 0)) +
  labs(
    title = "Engagement Per Follower Density",
    x = "Engagement Per Follower",
    y = "Number of Tweets"
  ) +
  buffer_theme() +
  theme(text = element_text(size = 15))
#################################################
## Text Length
#################################################
## CDF of Tweet Lengths
## One ECDF curve of cleaned text length per value of `link`.
tweet_length_cdf <- ggplot(
  data = updates,
  mapping = aes(x = clean_text_length, color = link)
) +
  stat_ecdf(size = 1) +
  scale_x_continuous(breaks = seq(0, 150, by = 25)) +
  coord_cartesian(xlim = c(0, 150)) +
  labs(
    x = "Clean Text Length",
    y = "Percent of Tweets",
    color = "Includes Link?"
  ) +
  theme_minimal()
## Density Plot of Tweet Lengths
## Density of the raw text length. The x scale limits of 0-140 drop rows
## outside that range before the density is computed.
tweet_length_density <- ggplot(data = updates, mapping = aes(x = text_length)) +
  geom_density(fill = "#547c9f", color = "#547c9f", alpha = 0.9) +
  scale_x_continuous(
    limits = c(0, 140),
    breaks = seq(0, 140, by = 10),
    expand = c(0, 2)
  ) +
  scale_y_continuous(limits = c(0, 0.02), expand = c(0, 0)) +
  labs(
    x = "Tweet Length",
    y = "Percent of Tweets",
    title = "Tweet Length Density",
    fill = "Includes Link?"
  ) +
  buffer_theme()
## Clean Text Length Density
## Same plot as above, but on the cleaned text length and with a free y range.
clean_tweet_length_density <- ggplot(
  data = updates,
  mapping = aes(x = clean_text_length)
) +
  geom_density(fill = "#547c9f", color = "#547c9f", alpha = 0.9) +
  scale_x_continuous(
    limits = c(0, 140),
    breaks = seq(0, 140, by = 10),
    expand = c(0, 2)
  ) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(
    x = "Tweet Length",
    y = "Percent of Tweets",
    title = "Clean Tweet Length Density",
    fill = "Includes Link?"
  ) +
  buffer_theme()
## Tweet Length vs. Engagement Scatter Plot
## Jittered scatter of engagement per follower against raw text length, with a
## single smoothed trend line. Only rows with non-empty text and at least one
## follower are plotted. Stored under its own name so the by-link plot below
## does not overwrite it.
tweet_length_scatter_overall <- ggplot(
  filter(updates, text_length > 0 & num_followers > 0),
  aes(x = text_length, y = engagement_per_follower)
) +
  geom_point(size = 1, alpha = 0.3, position = "jitter", colour = "grey60") +
  scale_x_continuous(limits = c(0, 140), breaks = seq(0, 140, 20), expand = c(0, 2)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.015)) +
  geom_smooth(aes(group = 1), size = 1, se = TRUE, color = "#547c9f") +
  labs(x = "Tweet Length", y = "", title = "Tweet Engagement by Character Count") +
  buffer_theme()
## Tweet Length vs. Engagement Scatter Plot by Link
## Same data, coloured by `link`, with one smoothed trend line per colour.
tweet_length_scatter_by_link <- ggplot(
  filter(updates, text_length > 0 & num_followers > 0),
  aes(x = text_length, y = engagement_per_follower, color = link)
) +
  geom_point(size = 1, alpha = 0.3, position = "jitter") +
  scale_x_continuous(limits = c(0, 140), breaks = seq(0, 140, 20)) +
  ylim(0, 0.015) +
  #scale_y_continuous(limits=c(0,0.015),breaks=seq(0,0.015,0.005))
  stat_smooth(size = 2, se = TRUE) +
  labs(
    x = "Tweet Length",
    y = "Engagement",
    title = "Tweet Engagement by Character Count",
    color = "Includes Link?"
  ) +
  theme_minimal()
## Backward-compatible alias: `tweet_length_scatter` ends up holding the
## by-link plot, exactly as it did when both plots shared that name.
tweet_length_scatter <- tweet_length_scatter_by_link
## Group Data by Text Length Tier
## One row per text_length_tier with the tweet count and the mean of each
## engagement metric. Rows without followers, or with a missing
## engagement_per_follower, are excluded first.
by_text_length_tier <- updates %>%
  filter(num_followers > 0 & !is.na(engagement_per_follower)) %>%
  group_by(text_length_tier) %>%
  summarise(
    tweets = n(),
    avg_interactions = mean(interactions),
    avg_retweets = mean(retweets),
    avg_favorites = mean(favorites),
    avg_mentions = mean(mentions),
    avg_clicks = mean(clicks),
    avg_engagement = mean(engagement),
    avg_engagement_per_follower = mean(engagement_per_follower, na.rm = TRUE)
  )
## Bar Plot of Average Engagement by Text Length Tier
engagement_by_text_length_tier <- ggplot(
  by_text_length_tier,
  aes(x = text_length_tier, y = avg_engagement_per_follower)
) +
  geom_bar(stat = "identity", fill = "#547c9f") +
  labs(
    x = "Text Length Tier",
    y = "Average Engagement Per Follower",
    title = "Average Engagement Per Follower"
  ) +
  buffer_theme() +
  scale_y_continuous(expand = c(0, 0))
#################################################
## Links
#################################################
## Group Data by Inclusion of Link
## One row per value of `link` with the tweet count, its share, and the mean of
## each engagement metric. Rows without followers, or with a missing
## engagement_per_follower, are excluded first.
## NOTE(review): `percent` divides by the hard-coded constant 750776,
## presumably the row count of the dataset this was written against. Confirm,
## and consider deriving it from the data so the shares stay correct when the
## input changes.
by_link <- updates %>%
  filter(num_followers > 0 & !is.na(engagement_per_follower)) %>%
  group_by(link) %>%
  summarise(
    tweets = n(),
    percent = round(n() / 750776, 3),
    avg_interactions = mean(interactions),
    avg_retweets = mean(retweets),
    avg_favorites = mean(favorites),
    avg_mentions = mean(mentions),
    avg_clicks = mean(clicks),
    avg_engagement = mean(engagement),
    avg_engagement_per_follower = mean(engagement_per_follower, na.rm = TRUE)
  )
## Group by Link and Follower Tier
## Same summary, broken down by every (link, follower_tier) combination.
by_link_tier <- updates %>%
  filter(num_followers > 0 & !is.na(engagement_per_follower)) %>%
  group_by(link, follower_tier) %>%
  summarise(
    tweets = n(),
    percent = round(n() / 750776, 3),
    avg_interactions = mean(interactions),
    avg_retweets = mean(retweets),
    avg_favorites = mean(favorites),
    avg_mentions = mean(mentions),
    avg_clicks = mean(clicks),
    avg_engagement = mean(engagement),
    avg_engagement_per_follower = mean(engagement_per_follower, na.rm = TRUE)
  )
## Plot Pie Chart of Inclusion of Link
## A single stacked bar of the `percent` shares bent into a pie with
## coord_polar(). Not assigned, so it is drawn immediately when run at top level.
ggplot(by_link, aes(x = factor(1), y = percent, fill = link)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  theme_minimal() +
  #geom_text(aes(y = percent/2,2 + c(0, cumsum(percent)[-length(percent)]), label = percent), size=8) +
  theme(
    axis.ticks = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_blank()
  ) +
  labs(x = "", y = "", title = "Percent of Tweets With Links", fill = "Includes Link?")
## Plot CDF of Engagement Per Follower, by Link
link_engagement_cdf <- ggplot(updates, aes(x = engagement_per_follower, color = link)) +
  stat_ecdf(size = 1) +
  coord_cartesian(xlim = c(0, 0.010)) +
  scale_x_continuous(expand = c(0, 0)) +
  buffer_theme() +
  scale_y_continuous(breaks = seq(0, 1, 0.2), expand = c(0, 0)) +
  labs(
    x = "Engagement Per Follower",
    y = "Percent of Tweets",
    title = "Engagement Per Follower CDF",
    color = "Includes Link?"
  )
## Plot CDF of Engagement, by Link
## NOTE: this reuses the name `engagement_cdf` from the Engagement
## Visualization section, so the overall (un-split) CDF built there is replaced.
engagement_cdf <- ggplot(updates, aes(x = engagement, color = link)) +
  stat_ecdf(size = 1) +
  coord_cartesian(xlim = c(0, 500)) +
  theme_minimal() +
  scale_y_continuous(breaks = seq(0, 1, 0.2)) +
  labs(
    x = "Total Engagement",
    y = "Percent of Tweets",
    title = "Total Engagement CDF",
    color = "Includes Link?"
  )
## Plot Average Engagement Per Follower by Inclusion of Link
## Stored under its own name so the by-tier plot below does not overwrite it.
engagement_by_link_per_follower <- ggplot(
  by_link,
  aes(x = link, y = avg_engagement_per_follower)
) +
  geom_bar(stat = "identity", fill = "#547c9f") +
  labs(
    x = "Includes Link",
    y = "",
    title = "Average Engagement Per Follower",
    fill = "Includes Link?"
  ) +
  buffer_theme() +
  scale_y_continuous(limits = c(0, 0.0025), expand = c(0, 0))
## Plot Average Total Engagement by Inclusion of Link
## Not assigned, so it is drawn immediately when run at top level.
ggplot(by_link, aes(x = link, y = avg_engagement)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(limits = c(0, 400), expand = c(0, 0)) +
  labs(
    x = "Includes Link",
    y = "",
    title = "Average Total Engagement",
    fill = "Includes Link?"
  ) +
  buffer_theme()
## Plot Average Engagement by Inclusion of Link, by Follower Tier
## Dodged bars: follower tier on the x axis, one bar per value of `link`.
## The "T12 [200000,inf)" tier is left out of the chart.
engagement_by_link_tier <- ggplot(
  filter(by_link_tier, follower_tier != "T12 [200000,inf)"),
  aes(x = follower_tier, y = avg_engagement, fill = link)
) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    x = "Follower Tier",
    y = "Average Engagement",
    title = "Average Engagement by Inclusion of Link",
    fill = "Includes Link?"
  ) +
  theme_minimal() +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Backward-compatible alias: `engagement_by_link` ends up holding the by-tier
## plot, exactly as it did when both plots shared that name.
engagement_by_link <- engagement_by_link_tier
#################################################
## Images
#################################################
## Group Data by Inclusion of Image
## One row per value of `photo` with the tweet count, its share, and the mean
## of each engagement metric. Rows without followers, or with a missing
## engagement_per_follower, are excluded first.
## NOTE(review): `percent` divides by the hard-coded constant 750776,
## presumably the row count of the dataset this was written against. Confirm
## before reusing this on other data.
by_image <- updates %>%
  filter(num_followers > 0 & !is.na(engagement_per_follower)) %>%
  group_by(photo) %>%
  summarise(
    tweets = n(),
    percent = round(n() / 750776, 3),
    avg_interactions = mean(interactions),
    avg_retweets = mean(retweets),
    avg_favorites = mean(favorites),
    avg_mentions = mean(mentions),
    avg_clicks = mean(clicks),
    avg_engagement = mean(engagement),
    avg_engagement_per_follower = mean(engagement_per_follower, na.rm = TRUE)
  )
## Group by Image and Follower Tier
## Same summary, broken down by every (photo, follower_tier) combination.
by_image_tier <- updates %>%
  filter(num_followers > 0 & !is.na(engagement_per_follower)) %>%
  group_by(photo, follower_tier) %>%
  summarise(
    tweets = n(),
    percent = round(n() / 750776, 3),
    avg_interactions = mean(interactions),
    avg_retweets = mean(retweets),
    avg_favorites = mean(favorites),
    avg_mentions = mean(mentions),
    avg_clicks = mean(clicks),
    avg_engagement = mean(engagement),
    avg_engagement_per_follower = mean(engagement_per_follower, na.rm = TRUE)
  )
## Plot Pie Chart of Inclusion of Image
## A single stacked bar of the `percent` shares bent into a pie with
## coord_polar(). Not assigned, so it is drawn immediately when run at top level.
ggplot(by_image, aes(x = factor(1), y = percent, fill = photo)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  theme_minimal() +
  theme(
    axis.ticks = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_blank()
  ) +
  labs(x = "", y = "", title = "Percent of Tweets With Image", fill = "Includes Image?")
## Plot CDF of Engagement Per Follower, by Image
image_engagement_cdf <- ggplot(updates, aes(x = engagement_per_follower, color = photo)) +
  stat_ecdf(size = 1) +
  coord_cartesian(xlim = c(0, 0.0075)) +
  theme_minimal() +
  theme(text = element_text(size = 25)) +
  scale_y_continuous(breaks = seq(0, 1, 0.2)) +
  labs(
    x = "Engagement Per Follower",
    y = "Percent of Tweets",
    title = "Engagement Per Follower CDF",
    color = "Includes Image?"
  )
## Plot CDF of Total Engagement, by Image
total_image_engagement_cdf <- ggplot(updates, aes(x = engagement, color = photo)) +
  stat_ecdf(size = 1) +
  coord_cartesian(xlim = c(0, 400)) +
  theme_minimal() +
  scale_y_continuous(breaks = seq(0, 1, 0.2)) +
  labs(
    x = "Total Engagement",
    y = "Percent of Tweets",
    title = "Total Engagement CDF",
    color = "Includes Image?"
  )
## Plot Average Engagement Per Follower by Inclusion of Image
avg_engagement_image <- ggplot(by_image, aes(x = photo, y = avg_engagement_per_follower)) +
  geom_bar(stat = "identity", fill = "#547c9f") +
  labs(
    x = "Includes Image",
    y = "",
    title = "Average Engagement Per Follower",
    fill = "Includes Image?"
  ) +
  buffer_theme() +
  scale_y_continuous(limits = c(0, 0.0025), expand = c(0, 0))
## Plot Average Total Engagement by Inclusion of Image
## Not assigned, so it is drawn immediately when run at top level.
ggplot(by_image, aes(x = photo, y = avg_engagement)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(limits = c(0, 200), expand = c(0, 0)) +
  labs(
    x = "Includes Image",
    y = "",
    title = "Average Total Engagement",
    fill = "Includes Image?"
  ) +
  buffer_theme()
#################################################
## Hashtags
#################################################
## Histogram of hashtags per tweet (bin width 1), zoomed to 0-10 hashtags.
hashtag_hist <- ggplot(updates, aes(x = num_hashtags)) +
  geom_histogram(color = "white", fill = "#547c9f", binwidth = 1) +
  coord_cartesian(xlim = c(0, 10)) +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = "Hashtags", y = "Number of Tweets", title = "Distribution of Hashtags") +
  buffer_theme()
## Group Data by Number of Hashtags
## One row per hashtag count with the tweet count, its share, and the mean of
## each engagement metric. Rows without followers, or with a missing
## engagement_per_follower, are excluded first.
## NOTE(review): `percent` divides by the hard-coded constant 750776,
## presumably the row count of the dataset this was written against. Confirm
## before reusing this on other data.
by_hash <- updates %>%
  filter(num_followers > 0 & !is.na(engagement_per_follower)) %>%
  group_by(num_hashtags) %>%
  summarise(
    tweets = n(),
    percent = round(n() / 750776, 3),
    avg_interactions = mean(interactions),
    avg_retweets = mean(retweets),
    avg_favorites = mean(favorites),
    avg_mentions = mean(mentions),
    avg_clicks = mean(clicks),
    avg_engagement = mean(engagement),
    avg_engagement_per_follower = mean(engagement_per_follower, na.rm = TRUE)
  )
## Plot Average Engagement Per Follower by Number of Hashtags
## Limited to tweets with at most 10 hashtags. Stored under its own name so
## the total-engagement plot below does not overwrite it.
avg_engagement_per_follower_hash <- ggplot(
  filter(by_hash, num_hashtags < 11),
  aes(x = num_hashtags, y = avg_engagement_per_follower)
) +
  geom_bar(stat = "identity", fill = "#547c9f") +
  labs(
    x = "Hashtags",
    y = "",
    title = "Average Engagement Per Follower",
    fill = "Number of Hashtags"
  ) +
  buffer_theme() +
  scale_y_continuous(limits = c(0, 0.0035), expand = c(0, 0)) +
  scale_x_continuous(breaks = seq(0, 10, 1))
## Plot Average Total Engagement by Number of Hashtags
avg_total_engagement_hash <- ggplot(
  filter(by_hash, num_hashtags < 11),
  aes(x = num_hashtags, y = avg_engagement)
) +
  geom_bar(stat = "identity", fill = "#547c9f") +
  labs(
    x = "Hashtags",
    y = "",
    title = "Average Total Engagement",
    fill = "Number of Hashtags"
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(0, 10, 1))
## Backward-compatible alias: `avg_engagement_hash` ends up holding the
## total-engagement plot, exactly as it did when both plots shared that name.
avg_engagement_hash <- avg_total_engagement_hash
#################################################
## Clean Text Function
#################################################
## Strip tweet markup from a character vector so that only plain words remain.
##
## Argument:
##   some_txt  character vector (or anything gsub() can coerce to one) of raw
##             tweet texts.
## Returns a character vector of the same length with, in this order:
##   1. the HTML entity "&amp;" removed,
##   2. retweet / "via" attributions ("RT @user", "via @user") removed,
##   3. any remaining @mentions removed,
##   4. punctuation and digits removed,
##   5. URLs removed,
##   6. runs of spaces/tabs collapsed to a single space,
##   7. leading and trailing whitespace trimmed.
clean.text <- function(some_txt) {
  # Remove the entity while it is still intact. Deleting the bare letters
  # "amp" after punctuation stripping would also mangle ordinary words such
  # as "example" or "camp".
  some_txt <- gsub("&amp;", "", some_txt, fixed = TRUE)
  some_txt <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
  some_txt <- gsub("@\\w+", "", some_txt)
  some_txt <- gsub("[[:punct:]]", "", some_txt)
  some_txt <- gsub("[[:digit:]]", "", some_txt)
  # Punctuation is already gone at this point, so a URL has been squashed into
  # one "http..." word that this pattern removes whole.
  some_txt <- gsub("http\\w+", "", some_txt)
  # Collapse to ONE space; replacing with "" would fuse neighbouring words.
  some_txt <- gsub("[ \t]{2,}", " ", some_txt)
  some_txt <- gsub("^\\s+|\\s+$", "", some_txt)
  some_txt
}
#################################################
## ggplot Theme
#################################################
## Brand blues, darkest to lightest.
buffer_palette <- c("#3c5a72", "#547c9f", "#6295c0", "#72b0e3")

## Build the chart theme shared by the plots in this script.
##
## Takes no arguments and returns a ggplot2 theme to be added to a plot with
## `+`. It starts from theme_bw() and then: makes the panel and plot
## backgrounds transparent, keeps only horizontal major grid lines, removes
## tick marks and the panel border, draws axis lines, and sets text colours
## from the RColorBrewer "Greys" palette. Needs ggplot2, RColorBrewer and grid
## (for unit()) to be attached.
buffer_theme <- function() {
  # Greys run from light (1) to dark (9).
  greys <- brewer.pal(n = 9, name = "Greys")
  color.grid.major <- greys[3]
  color.axis.text <- greys[6]
  color.axis.title <- greys[7]
  color.title <- greys[9]

  theme_bw(base_size = 9) +
    # Transparent backgrounds; no border around the panel.
    theme(
      panel.background = element_rect(fill = NA, color = NA),
      plot.background = element_rect(fill = NA, color = NA),
      panel.border = element_blank()
    ) +
    # Grid: light horizontal major lines only; axis lines in the same grey.
    theme(
      panel.grid.major = element_line(color = color.grid.major, size = .25),
      panel.grid.minor = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank(),
      axis.ticks = element_blank(),
      axis.line = element_line(color = color.grid.major)
    ) +
    # Legend: transparent background, text styled like the axis titles.
    theme(
      legend.background = element_rect(fill = NA),
      legend.text = element_text(size = 15, color = color.axis.title)
    ) +
    # Title, axis titles and axis tick labels.
    theme(
      plot.title = element_text(color = color.title, size = 20, vjust = 1.25),
      axis.text.x = element_text(size = 10, color = color.axis.text),
      axis.text.y = element_text(size = 10, color = color.axis.text),
      axis.title.x = element_text(size = 15, color = color.axis.title, vjust = 0),
      axis.title.y = element_text(size = 15, color = color.axis.title, vjust = 1.25)
    ) +
    # Plot margins, and the base text size for anything not sized above.
    theme(
      plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm"),
      text = element_text(size = 25)
    )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment