# GitHub Gist jwinternheimer/6139c3a490da48640bc7 by @jwinternheimer
# (last active August 29, 2015 14:18).
library(data.table); library(dplyr); library(tidyr)
library(ggplot2); library(scales); library(grid); library(RColorBrewer)
## Import and Tidy Data
## NOTE: clean.text() is defined near the bottom of this file; that definition
## has to be evaluated before this section is run.
updates <- read.csv("~/Downloads/pst_updates.csv", header = TRUE)
names(updates) <- c(
  "update_id", "num_chars", "num_explanation", "num_hashtags",
  "hour_sent", "num_followers", "follower_tier", "link", "link_no_content",
  "photo", "text", "interactions", "retweets", "favorites", "mentions",
  "clicks", "avg_interaction_length"
)

## Bucket a vector of character counts into 20-character tiers.
##   len: numeric vector of text lengths (NA is allowed and stays NA).
## Returns a factor with labels "T1 (0,20)" ... "T7 [120,140]"; any length
## above 140 is labelled "Undefined". Both the raw and the cleaned text use
## this one helper so the two tier columns always follow the same rules
## (previously raw lengths above 140 were filed under "T1 (0,20)").
length_tier <- function(len) {
  tier_labels <- c(
    "T1 (0,20)", "T2 [20,40)", "T3 [40,60)", "T4 [60,80)",
    "T5 [80,100)", "T6 [100,120)", "T7 [120,140]", "Undefined"
  )
  # findInterval() counts how many lower bounds are <= len:
  # 0 -> T1, 1 -> T2, ..., 6 -> T7
  tier_index <- findInterval(len, c(20, 40, 60, 80, 100, 120)) + 1L
  # T7 is closed at 140; everything beyond that is out of range
  tier_index[!is.na(len) & len > 140] <- length(tier_labels)
  as.factor(tier_labels[tier_index])
}

updates <- updates %>%
  mutate(clean_text = clean.text(text)) %>%
  mutate(
    text_length = nchar(as.character(text)),
    clean_text_length = nchar(as.character(clean_text)),
    # engagement is the sum of the three per-tweet reaction counts
    engagement = retweets + favorites + mentions,
    # hour of day is treated as a categorical predictor
    hour_sent = as.factor(hour_sent),
    text_length_tier = length_tier(text_length),
    clean_text_length_tier = length_tier(clean_text_length)
  )
#################################################
## Tweet Length Plots
#################################################
## Density of raw tweet lengths, x axis restricted to 0-140 characters
tweet_length_density <- ggplot(data = updates, mapping = aes(x = text_length)) +
  geom_density(fill = "deepskyblue1", color = "deepskyblue1", alpha = 0.4) +
  scale_x_continuous(breaks = seq(0, 140, 10), limits = c(0, 140)) +
  labs(title = "Tweet Length Density", x = "Tweet Length", y = "Density") +
  fte_theme()
## Same density, computed on the length of the cleaned text
clean_tweet_length_density <- ggplot(
  data = updates,
  mapping = aes(x = clean_text_length)
) +
  geom_density(fill = "deepskyblue1", color = "deepskyblue1", alpha = 0.4) +
  scale_x_continuous(breaks = seq(0, 140, 10), limits = c(0, 140)) +
  labs(title = "Clean Tweet Length Density", x = "Tweet Length", y = "Density") +
  fte_theme()
## Tweet Length vs. Engagement Scatter Plot
## `engagement` (retweets + favorites + mentions) and `interactions` are
## different columns, so each plot's y label names the column it actually maps.
## NOTE(review): y limits set on the scale drop out-of-range rows before
## geom_smooth() is fitted, so the trend lines describe only the visible
## points; use coord_cartesian() instead if the fit should use every row.
tweet_length_plot <- ggplot(
  filter(updates, clean_text_length > 0),
  aes(x = clean_text_length, y = engagement, color = follower_tier)
) +
  fte_theme() +
  geom_point(size = 1, alpha = 0.3, position = "jitter") +
  scale_x_continuous(limits = c(0, 140), breaks = seq(0, 140, 10)) +
  ylim(0, 250) +
  # group = 1 fits a single smoother across all follower tiers
  geom_smooth(aes(group = 1), size = 3) +
  labs(x = "Tweet Length", y = "Engagement", title = "Tweet Engagement by Character Count")
## Filter Scatter Plot by Follower Tier
follower_tier_filter <- c("T01 [1,100)", "T02 [100,500)", "T03 [500,1000)", "T04 [1000,2000)")
## Scatter Plot Colored by Filtered Follower Tier
filtered_tweet_length <- ggplot(
  filter(updates, follower_tier %in% follower_tier_filter),
  aes(x = clean_text_length, y = interactions, color = follower_tier)
) +
  fte_theme() +
  geom_point(size = 1, alpha = 0.3, position = "jitter") +
  scale_x_continuous(limits = c(0, 140), breaks = seq(0, 140, 10)) +
  scale_y_continuous(limits = c(0, 30), breaks = seq(0, 30, 10)) +
  geom_smooth(aes(group = 1), size = 3) +
  labs(x = "Tweet Length", y = "Interactions", title = "Tweet Interactions by Character Count")
## Scatter Plot Colored by Link
linked_tweet_length <- ggplot(
  updates,
  aes(x = clean_text_length, y = engagement, color = link)
) +
  fte_theme() +
  geom_point(size = 1, alpha = 0.3, position = "jitter") +
  scale_x_continuous(limits = c(0, 140), breaks = seq(0, 140, 10)) +
  scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, 10)) +
  geom_smooth(size = 3) +
  labs(x = "Tweet Length", y = "Engagement", title = "Tweet Engagement by Character Count")
## Scatter Plot Colored by Image
photo_tweet_length <- ggplot(
  updates,
  aes(x = clean_text_length, y = engagement, color = photo)
) +
  fte_theme() +
  geom_point(size = 1, alpha = 0.3, position = "jitter") +
  scale_x_continuous(limits = c(0, 140), breaks = seq(0, 140, 10)) +
  scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, 10)) +
  geom_smooth(size = 3) +
  labs(x = "Tweet Length", y = "Engagement", title = "Tweet Engagement by Character Count")
## Boxplots
## The y axis is zoomed to 0-50 with coord_cartesian() rather than with scale
## limits: limits on the scale discard every row above 50 before the box
## statistics are computed, which pulls the medians and quartiles downward.
tweet_length_boxplots <- ggplot(updates, aes(x = text_length_tier, y = interactions)) +
  geom_boxplot() +
  fte_theme() +
  scale_y_continuous(breaks = seq(0, 50, 10)) +
  coord_cartesian(ylim = c(0, 50)) +
  labs(x = "Tweet Length Tier", y = "Interaction", title = "Interaction by Tweet Length")
## Same boxplots restricted to the follower tiers in follower_tier_filter
filtered_tweet_boxplots <- ggplot(
  filter(updates, follower_tier %in% follower_tier_filter),
  aes(x = text_length_tier, y = interactions)
) +
  geom_boxplot() +
  fte_theme() +
  scale_y_continuous(breaks = seq(0, 50, 10)) +
  coord_cartesian(ylim = c(0, 50)) +
  labs(x = "Tweet Length Tier", y = "Interaction", title = "Interaction by Tweet Length")
## NOTE(review): feb_updates is not created in the visible part of this file;
## it has to exist in the session already - presumably a subset of `updates`
## with the same columns; confirm before running.
## Plot Number of Hashtags vs Interaction Time
## Both boxplots zoom with coord_cartesian(): limits on the scale would drop
## rows above 50 before the box statistics are computed and bias them downward.
hashtags_plot <- ggplot(
  feb_updates,
  aes(x = as.factor(num_hashtags), y = avg_interaction_length)
) +
  geom_boxplot() +
  fte_theme() +
  scale_y_continuous(breaks = seq(0, 50, 10)) +
  coord_cartesian(ylim = c(0, 50)) +
  labs(
    x = "Number of Hashtags", y = "Average Interaction Time",
    title = "Interaction Time by Number of Hashtags"
  )
## Plot Number of Hashtags vs Total Interaction
hashtags_interaction_plot <- ggplot(
  feb_updates,
  aes(x = as.factor(num_hashtags), y = interactions)
) +
  geom_boxplot() +
  fte_theme() +
  scale_y_continuous(breaks = seq(0, 50, 10)) +
  coord_cartesian(ylim = c(0, 50)) +
  labs(
    x = "Number of Hashtags", y = "Interactions",
    title = "Interaction by Number of Hashtags"
  )
## Tweet length vs interactions, points colored by hashtag count
hashtags_interaction_scatter <- ggplot(
  feb_updates,
  aes(x = clean_text_length, y = interactions, color = num_hashtags)
) +
  geom_point(alpha = 0.3, size = 1, position = "jitter") +
  fte_theme() +
  scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, 10)) +
  labs(
    x = "Tweet Length", y = "Interactions",
    title = "Interaction by Tweet Length and Hashtags"
  )
#################################################
## Linear Regression Models
#################################################
## The photo indicator column is called `photo` (see the names() assignment
## where the data is imported); the models previously referred to a
## non-existent `with_photo` column.
## Raw text length as a numeric predictor
mod1 <- lm(
  interactions ~ num_explanation + num_hashtags + num_followers + link +
    link_no_content + photo + text_length,
  data = updates
)
## Text length as a tier factor
mod2 <- lm(
  interactions ~ num_explanation + num_hashtags + num_followers + link +
    photo + text_length_tier,
  data = updates
)
## From here on the follower filter also includes the T05 tier
follower_tier_filter <- c(
  "T01 [1,100)", "T02 [100,500)", "T03 [500,1000)",
  "T04 [1000,2000)", "T05 [2000,4000)"
)
mod3 <- lm(
  interactions ~ num_explanation + num_hashtags + num_followers + link +
    photo + text_length_tier,
  data = filter(updates, follower_tier %in% follower_tier_filter)
)
## Engagement as the response; hour_sent is listed once (it was duplicated)
mod4 <- lm(
  engagement ~ num_explanation + num_hashtags + num_followers + hour_sent + link +
    photo + clean_text_length_tier,
  data = filter(updates, clean_text_length > 0)
)
mod6 <- lm(
  engagement ~ num_explanation + num_hashtags + num_followers + link + photo +
    hour_sent + clean_text_length_tier,
  data = filter(updates, follower_tier %in% follower_tier_filter & clean_text_length > 0)
)
## Stored as mod5: this fit used to be assigned to mod7, which the
## log-transformed model further down reuses, so this result was overwritten.
## NOTE(review): feb_updates is not created in the visible part of this file;
## it must already exist in the session.
mod5 <- lm(
  interactions ~ num_explanation + num_hashtags + num_followers + link + photo +
    clean_text_length_tier,
  data = filter(feb_updates, follower_tier %in% follower_tier_filter & clean_text_length > 0)
)
## Log and Square Root Transformations
## log1p() (log(1 + x)) is used so updates with zero interactions stay finite;
## plain log() maps them to -Inf, which lm() rejects as a response value.
updates$logInteractions <- log1p(updates$interactions)
## The photo indicator column is `photo`; there is no `with_photo` column.
mod7 <- lm(
  logInteractions ~ num_explanation + num_hashtags + num_followers + link + photo +
    hour_sent + clean_text_length_tier,
  data = filter(updates, follower_tier %in% follower_tier_filter)
)
updates$rootInteractions <- sqrt(updates$interactions)
mod8 <- lm(
  rootInteractions ~ num_explanation + num_hashtags + num_followers + link + photo +
    hour_sent + clean_text_length_tier,
  data = filter(updates, follower_tier %in% follower_tier_filter)
)
## Using Retweets and Favorites as Responses
## NOTE(review): feb_updates is not created in the visible part of this file;
## it must already exist in the session.
mod9 <- lm(
  retweets ~ num_explanation + num_hashtags + num_followers + link + photo +
    clean_text_length_tier,
  data = filter(feb_updates, follower_tier %in% follower_tier_filter)
)
mod10 <- lm(
  favorites ~ num_explanation + num_hashtags + num_followers + link + photo +
    clean_text_length_tier,
  data = filter(feb_updates, follower_tier %in% follower_tier_filter)
)
## Regression for Users with < 1000 Followers
## follower_tier_filter reaches up to T05 [2000,4000) at this point in the
## file, so this model selects the three tiers below 1000 followers explicitly.
under_1000_tiers <- c("T01 [1,100)", "T02 [100,500)", "T03 [500,1000)")
mod11 <- lm(
  interactions ~ num_explanation + num_hashtags + num_followers + link + photo +
    clean_text_length_tier,
  data = filter(feb_updates, follower_tier %in% under_1000_tiers)
)
#################################################
## Machine Learning Regression
#################################################
library(caret)
## NOTE(review): feb_updates is not created in the visible part of this file;
## it must already exist in the session.
## Fix the RNG seed so the random 70/30 split (and caret's resampling inside
## train()) is reproducible from run to run.
set.seed(1234)
## list = FALSE returns the selected row indices as a matrix, not a list
inTrain <- createDataPartition(y = feb_updates$interactions, p = 0.7, list = FALSE)
trainUpdates <- feb_updates[inTrain, ]
testUpdates <- feb_updates[-inTrain, ]
## Linear model fitted through caret on the training rows only
modFit1 <- train(
  interactions ~ num_explanation + num_hashtags + num_followers + link + photo +
    clean_text_length_tier,
  data = trainUpdates, method = "lm"
)
#################################################
## Clean Text Function
#################################################
## Strip retweet markers, @mentions, HTML "&amp;" entities, punctuation,
## digits and links from tweet text so that only the words remain.
##   some_txt: character vector (or anything gsub() can coerce to one).
## Returns a character vector of the same length; words stay separated by a
## single space and leading/trailing whitespace is removed.
clean.text <- function(some_txt) {
  # Drop the HTML entity while it is still recognisable. Deleting the bare
  # letters "amp" after punctuation removal also mangled ordinary words
  # such as "example" or "camp".
  some_txt <- gsub("&amp;", "", some_txt, fixed = TRUE)
  # "RT @user" / "via @user" attributions, then any remaining @mentions
  some_txt <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
  some_txt <- gsub("@\\w+", "", some_txt)
  some_txt <- gsub("[[:punct:]]", "", some_txt)
  some_txt <- gsub("[[:digit:]]", "", some_txt)
  # With punctuation and digits gone a URL is one "http..." token
  some_txt <- gsub("http\\w+", "", some_txt)
  # Collapse whitespace runs to ONE space; replacing them with "" glued the
  # neighbouring words together wherever a mention or link had been removed.
  some_txt <- gsub("[ \t]{2,}", " ", some_txt)
  some_txt <- gsub("^\\s+|\\s+$", "", some_txt)
  some_txt
}
#################################################
## ggplot Theme
#################################################
## Shared chart theme: theme_bw() at base size 9 with a grey canvas, only the
## horizontal major grid lines, no tick marks, and enlarged title, axis and
## legend text. Takes no arguments and returns a theme to add to a ggplot.
fte_theme <- function() {
  # All chart colors are picked from RColorBrewer's 9-step "Greys" palette
  greys <- brewer.pal(n = 9, name = "Greys")
  bg_col <- greys[2]
  grid_col <- greys[3]
  tick_label_col <- greys[6]
  axis_title_col <- greys[7]
  title_col <- greys[9]

  theme_bw(base_size = 9) +
    theme(
      # Canvas: panel, plot area and panel border all use the background color
      panel.background = element_rect(fill = bg_col, color = bg_col),
      plot.background = element_rect(fill = bg_col, color = bg_col),
      panel.border = element_rect(color = bg_col),
      # Grid: thin major lines, then the vertical (x) ones and all minor
      # lines are blanked, together with the axis ticks
      panel.grid.major = element_line(color = grid_col, size = .25),
      panel.grid.minor = element_blank(),
      axis.ticks = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank(),
      # Legend blends into the canvas
      legend.background = element_rect(fill = bg_col),
      legend.text = element_text(size = 15, color = axis_title_col),
      # Title, tick labels and axis titles
      plot.title = element_text(color = title_col, size = 20, vjust = 1.25),
      axis.text.x = element_text(size = 10, color = tick_label_col),
      axis.text.y = element_text(size = 10, color = tick_label_col),
      axis.title.x = element_text(size = 15, color = axis_title_col, vjust = 0),
      axis.title.y = element_text(size = 15, color = axis_title_col, vjust = 1.25),
      # Plot margins
      plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm")
    )
}
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment