Skip to content

Instantly share code, notes, and snippets.

@gsk3
Last active August 29, 2015 14:01
Show Gist options
  • Save gsk3/626694b307a8b01d63b0 to your computer and use it in GitHub Desktop.
Save gsk3/626694b307a8b01d63b0 to your computer and use it in GitHub Desktop.
SAEM 2014 twitterlytics code
# Twitterlytics from SAEM14
# Work from the project folder so the relative paths used further down in this
# file (the CSV export and the NetworkPersonPlots/ images) resolve there.
# NOTE(review): `.db` is not defined anywhere in this file -- presumably a base
# path set in the author's startup profile; confirm before running elsewhere.
setwd( file.path(.db,"misc","SAEMtwitter") )
library(twitteR)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(magrittr)
library(ggplot2)
library(igraph)
# --- Config --- #
# --- Register with twitter --- #
# The handshake is wired up by hand against the https endpoints because
# getTwitterOAuth doesn't work (it doesn't use https).
reqURL <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL <- "https://api.twitter.com/oauth/authorize"
# Consumer key and secret are read from R options, which must be set before
# this script runs (they are not set in this file).
twitCred <- OAuthFactory$new(
  consumerKey = getOption( "twitteRconsumerKey" ),
  consumerSecret = getOption( "twitteRconsumerSecret" ),
  requestURL = reqURL,
  accessURL = accessURL,
  authURL = authURL
)
# Run the OAuth handshake, then register the credentials with twitteR
twitCred$handshake()
registerTwitterOAuth( twitCred )
# --- Analysis --- #
# -- Extract data -- #
# Search for up to 5000 tweets carrying the conference hashtag
tweet <- searchTwitter( "#SAEM14", n = 5000 )
# vapply() instead of sapply(): the result type is fixed up front, so an empty
# search gives zero-length vectors (not an empty list) and a status object with
# a missing field fails loudly instead of silently producing a list column.
tweetText <- vapply( tweet, function(x) x$getText(), character(1) )
tweetText <- gsub( "…", "", tweetText ) # strip ellipsis characters
# Build data.frame with all metadata
retweeted <- vapply( tweet, function(x) x$getRetweetCount(), numeric(1) )
favorited <- vapply( tweet, function(x) x$getFavoriteCount(), numeric(1) )
author <- vapply( tweet, function(x) x$getScreenName(), character(1) )
# Sorted with the most retweeted first.
# NOTE(review): arrange() is not defined in this file and no library() call here
# obviously provides it (plyr/dplyr?) -- confirm it is attached before running.
tweetDFmult <- arrange(
  data.frame( author = author, tweet = tweetText, retweeted = retweeted, favorited = favorited ),
  -retweeted
)
# One row per distinct tweet text (the first, i.e. most retweeted, copy is kept),
# although that makes the true original author ambiguous
tweetDF <- tweetDFmult[ !duplicated( tweetDFmult$tweet ), ]
write.csv( tweetDF, file="SAEMtweetDF.csv", row.names=FALSE )
# Vector of tweeps in body of tweets
# tweepList has one entry per tweet; each entry holds, for every space-separated
# word of that tweet, the text matched from an "@" to the end of the word.
# NOTE(review): str_split()/str_extract_all() and arrange() are not defined in
# this file and no library() call here obviously provides them (stringr and
# plyr/dplyr?) -- confirm they are attached before running.
tweepList <- lapply(
  str_split( tweetText, " " ),
  function(words) str_extract_all( words, pattern = "@.+" )
)
# Flatten to one vector and drop the first ":" of each match
tweep <- sub( ":", "", unlist( tweepList ) )
# Case-insensitive counts per tweep, most frequent first
tweepTab <- as.data.frame( table( tolower( tweep ) ) )
tweepTab <- arrange( tweepTab, -Freq )
names( tweepTab ) <- c( "tweep", "Freq" )
# Levels follow the sorted row order, so factor order equals frequency order
tweepTab$tweep <- factor( tweepTab$tweep, levels = as.character( tweepTab$tweep ) )
# Network graph of tweeps: one row per (author -> mentioned tweep) pair.
# The authors are taken from `author`, which is in the same order as tweetText
# and therefore as tweepList. tweetDFmult$author is sorted by retweet count, so
# pairing it with tweepList attached mentions to the wrong authors.
# The edge list is assembled from plain vectors rather than by stacking a list
# of per-tweet data.frames: utils::stack() does not accept such a list, and this
# form also copes with zero tweets.
tweepDirectedDF <- local({
  stopifnot( length( author ) == length( tweepList ) )
  recipients <- lapply( tweepList, function(recip) {
    recip <- sub( ":", "", unlist(recip) )
    recip <- sub( "^\\@", "", recip )
    # A tweet without mentions still contributes one row, with an empty target
    if( length(recip)==0 ) recip <- ""
    tolower( recip )
  } )
  data.frame(
    # Repeat each author once per recipient found in that author's tweet
    from = rep( tolower( as.character( author ) ), times = vapply( recipients, length, integer(1) ) ),
    to = as.character( unlist( recipients ) ),
    stringsAsFactors = FALSE
  )
})
rownames( tweepDirectedDF ) <- NULL
# Text without tweeps
# Inner call: remove each "@" together with the shortest run of characters up to
# and including the next space. Outer call: remove what is left from an "@" to
# the end of the text.
tweetTextNoTweeps <- gsub(
  "\\@.+?$", "",
  gsub( "\\@.+? ", "", tweetText, perl=TRUE ),
  perl=TRUE
)
# Term doc matrix (without tweeps)
saemCorpus <- Corpus(VectorSource(tweetTextNoTweeps))
saemTDM <- TermDocumentMatrix(saemCorpus,
  control = list(
    removePunctuation = TRUE,
    # tm lower-cases the terms before it filters stopwords (tolower = TRUE
    # below), so the custom stopwords have to be lower case as well: an
    # upper-case "RT" can never match the lower-cased term "rt".
    stopwords = tolower( c( "saem14", "RT", stopwords("english") ) ),
    removeNumbers = FALSE,
    tolower = TRUE
  )
)
# Dense term x document count matrix
saemTDMmat <- as.matrix( saemTDM )
# Total count per term, most frequent first
saemFreq <- sort(rowSums(saemTDMmat), decreasing=TRUE)
saemDM <- data.frame(word=names(saemFreq), freq=saemFreq)
# -- Analyze data -- #
# All people: one bar per tweep, bar height = number of mentions
ggplot( data = tweepTab, mapping = aes( x = tweep, y = Freq ) ) +
  geom_bar( stat = "identity" )
# Word cloud of term frequencies, most frequent words placed first
wordcloud(
  saemDM$word,
  saemDM$freq,
  random.order = FALSE,
  colors = brewer.pal( 8, "Dark2" )
)
# Correlation between retweets and favorites (one row per distinct tweet)
cor( tweetDF$retweeted, tweetDF$favorited )
# Plot network graph and analyze
#' Plot network graph
#'
#' Builds a directed graph from an edge list and plots it. When `pplDF` is
#' given, each vertex is shaded in grey according to its `freqCut` bin, the
#' tweep with the highest `freq` is named in a label, and a legend of the bins
#' that occur is drawn.
#'
#' @param df The data.frame with columns from and to (the first two columns are used)
#' @param pplDF If not null, the data.frame of tweep, frequency of interactions with person (`freq`), and categorized frequencies (`freqCut`). Used for coloring. The label is skipped when there is no `freq` column.
#' @param ... Further arguments passed on to the graph's plot call
networkPlot <- function( df, pplDF = NULL, ... ) {
  # as.character() so that factor columns contribute their labels rather than
  # their integer codes when the two columns are combined
  vtx <- data.frame(
    tweep = unique( c( as.character( df[[1]] ), as.character( df[[2]] ) ) ),
    stringsAsFactors = FALSE
  )
  grph <- graph.data.frame( df, directed = TRUE, vertices = vtx )
  if( is.null(pplDF) ) {
    plot( grph, ... )
  } else { # Color by number of interactions with person
    # Look every vertex up by name, in vertex order. merge(sort = FALSE) leaves
    # its rows in an unspecified order, which would pair colors with the wrong
    # vertices.
    idx <- match( vtx$tweep, as.character( pplDF[["tweep"]] ) )
    freqCut <- as.factor( pplDF[["freqCut"]] )[ idx ]
    # One grey per bin, lightest for the first level
    colorscheme <- rev( grey.colors( nlevels( freqCut ) ) )
    plot( grph, vertex.color = colorscheme[ as.integer( freqCut ) ], ... )
    # Assumes that person is the one with the most interactions
    freq <- pplDF[["freq"]]
    if( !is.null(freq) ) {
      top <- vtx$tweep[ which.max( freq[ idx ] ) ]
      if( length(top) == 1 ) {
        text( -1,1 , labels = paste0("Tweets to/from\n @", top) )
      }
    }
    # Show only the bins that occur, each with the color its vertices received,
    # so labels and fills stay aligned even when some bins are empty
    shown <- sort( unique( as.integer( freqCut ) ) )
    legend( "topright",
      legend = levels( freqCut )[ shown ],
      fill = colorscheme[ shown ],
      title = paste("No. of tweets")
    )
  }
}
#' Plot the graph emanating from a particular person
#'
#' Keeps the rows of `df` in which `person` is sender or recipient, counts how
#' often every tweep occurs in those rows, bins the counts at their quantiles
#' and hands the result to networkPlot() for a colored plot.
#'
#' @param df The data.frame with columns from and to
#' @param person Character: The person
#' @param ... Further arguments passed on to networkPlot()
networkPersonPlot <- function( df, person, ... ) {
  # Filter the edge list that was passed in (not a global); which() drops rows
  # whose comparison is NA
  keep <- which( as.character( df[["from"]] ) == person | as.character( df[["to"]] ) == person )
  if( length(keep) == 0 ) {
    stop( "No tweets to or from '", person, "'", call. = FALSE )
  }
  personDF <- df[ keep, , drop = FALSE ]
  # Make vertex colors
  # as.character() so factor columns are combined by label, not integer code
  ppl <- c( as.character( personDF[[1]] ), as.character( personDF[[2]] ) )
  pplTab <- sort( table( ppl ) )
  # Plain integer counts: a table passed to data.frame() expands into two
  # columns, so no column named `freq` would exist
  pplFreq <- as.vector( pplTab )
  # Bin edges at 0 and at the distinct quantiles of the counts
  pplCut <- cut( pplFreq, breaks = c( 0, unique( quantile( pplFreq, c(0,.25,.5,.75, .85, .95,1) ) ) ) )
  pplDF <- data.frame( tweep = names(pplTab), freq = pplFreq, freqCut = pplCut, stringsAsFactors = FALSE )
  # Plot
  networkPlot( personDF, pplDF = pplDF, ... )
}
# Plot the whole network, without vertex labels
networkPlot( tweepDirectedDF, vertex.label=NA )
# One PNG per tweep mentioned more than 5 times.
# png() cannot open a file in a folder that does not exist, so create it first
dir.create( "NetworkPersonPlots", showWarnings = FALSE )
for( p in sub( "^\\@", "", as.character( tweepTab$tweep[ which( tweepTab$Freq > 5 ) ] ) ) ) {
  cat(p,"\n")
  png( paste0( "NetworkPersonPlots/NetworkPersonPlot_",p,".png" ) )
  # A failure for one person must not stop the loop: report it and move on.
  # `finally` closes the graphics device however the plot call ends.
  tryCatch(
    networkPersonPlot( tweepDirectedDF, p ),
    error = function(e) message( "Plot failed for ", p, ": ", conditionMessage(e) ),
    finally = dev.off()
  )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment