Last active
August 29, 2015 14:01
-
-
Save gsk3/626694b307a8b01d63b0 to your computer and use it in GitHub Desktop.
SAEM 2014 twitterlytics code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Twitterlytics from SAEM14
# Work inside the project's data directory. `.db` is a base path that is not
# defined anywhere in this file (presumably set in the author's own R startup
# profile -- TODO confirm), so fail with an explicit message when it is absent
# instead of an opaque "object '.db' not found".
if (!exists(".db")) {
  stop(
    "`.db` (base data directory) is not defined; set it before running this script",
    call. = FALSE
  )
}
setwd(file.path(.db, "misc", "SAEMtwitter"))
library(twitteR) | |
library(tm) | |
library(wordcloud) | |
library(RColorBrewer) | |
library(magrittr) | |
library(ggplot2) | |
library(igraph) | |
# --- Config --- #
# --- Register with twitter --- #
# getTwitterOAuth doesn't work because it doesn't use https, so the three
# OAuth endpoints are spelled out and the credential object is built by hand.
reqURL    <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL   <- "https://api.twitter.com/oauth/authorize"
# The consumer key/secret are read from R options rather than hard-coded.
twitCred <- OAuthFactory$new(
  consumerKey    = getOption("twitteRconsumerKey"),
  consumerSecret = getOption("twitteRconsumerSecret"),
  requestURL     = reqURL,
  accessURL      = accessURL,
  authURL        = authURL
)
twitCred$handshake()
registerTwitterOAuth(twitCred)
# --- Analysis --- #
# -- Extract data -- #
tweet <- searchTwitter("#SAEM14", n = 5000)
# vapply() pins the result type, so an empty search yields character(0) /
# numeric(0) instead of the empty list sapply() would return.
tweetText <- vapply(tweet, function(x) x$getText(), character(1))
tweetText <- gsub("…", "", tweetText)
# Build data.frame with all metadata
retweeted <- vapply(tweet, function(x) x$getRetweetCount(), numeric(1))
favorited <- vapply(tweet, function(x) x$getFavoriteCount(), numeric(1))
author <- vapply(tweet, function(x) x$getScreenName(), character(1))
tweetDFmult <- data.frame(
  author = author,
  tweet = tweetText,
  retweeted = retweeted,
  favorited = favorited,
  stringsAsFactors = FALSE
)
# Most-retweeted first. Base order() is used because arrange() belongs to
# plyr/dplyr, neither of which this script loads. Note that after this step
# tweetDFmult is no longer in the same row order as tweetText/author.
tweetDFmult <- tweetDFmult[order(-tweetDFmult$retweeted), , drop = FALSE]
rownames(tweetDFmult) <- NULL
# one per tweet, although that makes the true original author ambiguous
tweetDF <- tweetDFmult[!duplicated(tweetDFmult$tweet), ]
write.csv(tweetDF, file = "SAEMtweetDF.csv", row.names = FALSE)
# Vector of tweeps in body of tweets
# tweepList holds one character vector of "@mention" tokens per tweet; a token
# runs from the "@" up to the next whitespace. Base regmatches()/gregexpr()
# are used because stringr (str_split / str_extract_all) is never loaded.
tweepList <- regmatches(tweetText, gregexpr("@[^[:space:]]+", tweetText))
# Drop the colon that follows the handle in "RT @user: ..." style tweets
tweep <- sub(":", "", unlist(tweepList, use.names = FALSE))
# Mention counts per (lower-cased) tweep, most mentioned first
tweepTab <- as.data.frame(table(tolower(tweep)), stringsAsFactors = FALSE)
colnames(tweepTab) <- c("tweep", "Freq")
tweepTab <- tweepTab[order(-tweepTab$Freq), , drop = FALSE]
rownames(tweepTab) <- NULL
# Fix the factor levels in frequency order so plots keep this ordering
tweepTab$tweep <- factor(tweepTab$tweep, levels = as.character(tweepTab$tweep))
# Network graph of tweeps
# One edge per (author, mentioned tweep). Tweets without any mention keep a
# single edge to "" so that every tweet contributes at least one row.
tweepRecipients <- lapply(tweepList, function(recip) {
  recip <- sub(":", "", unlist(recip))
  recip <- sub("^\\@", "", recip)
  if (length(recip) == 0) recip <- ""
  tolower(recip)
})
# `author` is in the same order as tweetText/tweepList. tweetDFmult$author must
# not be used here: tweetDFmult is sorted by retweet count, which would attach
# each tweet's mentions to the wrong author.
tweepDirectedDF <- data.frame(
  from = rep(tolower(author), vapply(tweepRecipients, length, integer(1))),
  to = as.character(unlist(tweepRecipients, use.names = FALSE)),
  stringsAsFactors = FALSE
)
rownames(tweepDirectedDF) <- NULL
# Text without tweeps
# Strip @mentions: first those followed by a space, then whatever mention is
# left at the end of the text.
tweetTextNoTweeps <- gsub("\\@.+? ", "", tweetText, perl = TRUE)
tweetTextNoTweeps <- gsub("\\@.+?$", "", tweetTextNoTweeps, perl = TRUE)
# Term doc matrix (without tweeps)
saemCorpus <- Corpus(VectorSource(tweetTextNoTweeps))
saemTDM <- TermDocumentMatrix(
  saemCorpus,
  control = list(
    removePunctuation = TRUE,
    # Terms are lower-cased (tolower = TRUE) before stopwords are matched, so
    # custom stopwords must be lower case: "RT" would never match "rt".
    stopwords = c("saem14", "rt", stopwords("english")),
    removeNumbers = FALSE,
    tolower = TRUE
  )
)
# Total frequency of each term across all tweets, most frequent first
saemTDMmat <- as.matrix(saemTDM)
saemFreq <- sort(rowSums(saemTDMmat), decreasing = TRUE)
saemDM <- data.frame(word = names(saemFreq), freq = saemFreq)
# -- Analyze data -- #
# All people: bar chart of the Freq column for every tweep in tweepTab
ggplot(data = tweepTab, mapping = aes(x = tweep, y = Freq)) +
  geom_bar(stat = "identity")
# Word cloud of term frequencies, most frequent terms placed first
wordcloud(
  saemDM$word,
  saemDM$freq,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)
# Correlation between retweets and favorites
cor(tweetDF$retweeted, tweetDF$favorited)
# Plot network graph and analyze
#' Plot network graph
#'
#' Builds a directed igraph graph from an edge list and plots it. When `pplDF`
#' is supplied, vertices are shaded in grey by their binned frequency and a
#' legend plus a "Tweets to/from" caption are added.
#'
#' @param df The data.frame with columns from and to. The first two columns
#'   are used as the edge endpoints.
#' @param pplDF If not null, the data.frame of tweep, frequency of interactions
#'   with person, and categorized frequencies (columns `tweep`, `freq`,
#'   `freqCut`). Used for coloring.
#' @param ... Passed on to `plot()` for the graph.
networkPlot <- function(df, pplDF = NULL, ...) {
  # Convert to character before combining: c() on factor columns can return
  # integer codes rather than the tweep names.
  endpoints <- unique(c(as.character(df[, 1]), as.character(df[, 2])))
  vtx <- data.frame(tweep = endpoints, stringsAsFactors = FALSE)
  grph <- graph.data.frame(df, directed = TRUE, vertices = vtx)
  # The former bare V(grph) / E(grph) / sort(degree(grph)) statements were
  # dropped: inside a function nothing is auto-printed, so they had no effect.
  if (is.null(pplDF)) {
    plot(grph, ...)
  } else { # Color by number of interactions with person
    # Look frequencies up with match() so row i describes vertex i. merge()
    # does not guarantee the vertex order, which would put colors on the
    # wrong vertices.
    idx <- match(vtx$tweep, as.character(pplDF$tweep))
    vtxDF <- data.frame(
      tweep = vtx$tweep,
      freq = pplDF$freq[idx],
      freqCut = as.factor(pplDF$freqCut)[idx],
      stringsAsFactors = FALSE
    )
    binLabels <- levels(vtxDF$freqCut)
    # One grey per bin, darkest for the highest bin
    colorscheme <- rev(grey.colors(length(binLabels)))
    plot(grph, vertex.color = colorscheme[as.integer(vtxDF$freqCut)], ...)
    # Assumes that person is the one with the most interactions
    topTweep <- vtxDF$tweep[which.max(vtxDF$freq)]
    text(-1, 1, labels = paste0("Tweets to/from\n @", topTweep))
    # Label every bin (not only the observed ones) so that the legend entries
    # and the fill colors always line up one-to-one.
    legend(
      "topright",
      legend = binLabels,
      fill = colorscheme,
      title = paste("No. of tweets")
    )
  }
}
#' Plot the graph emanating from a particular person
#'
#' Keeps the edges of `df` that start or end at `person`, counts how often
#' each tweep appears on those edges, bins the counts by quantile and hands
#' everything to `networkPlot()` for a shaded plot.
#'
#' @param df The data.frame with columns from and to
#' @param person Character: The person
#' @param ... Passed on to `networkPlot()`.
networkPersonPlot <- function(df, person, ...) {
  # Filter the `df` argument itself; the previous version silently ignored it
  # and always read the global tweepDirectedDF.
  involved <- as.character(df$from) == person | as.character(df$to) == person
  personDF <- df[which(involved), , drop = FALSE]
  if (nrow(personDF) == 0) {
    stop("No tweets to or from '", person, "' in df", call. = FALSE)
  }
  # Make vertex colors
  # as.character() guards against factor columns, whose c() can give codes
  ppl <- c(as.character(personDF[, 1]), as.character(personDF[, 2]))
  pplTab <- sort(table(ppl))
  # Plain integer counts: passing the table itself to data.frame() would
  # expand it into two columns (freq.ppl / freq.Freq) and leave no `freq`.
  pplCount <- as.vector(pplTab)
  # Quantile-based bins; 0 is prepended so the lowest count falls in a bin
  pplBreaks <- c(0, unique(quantile(pplCount, c(0, .25, .5, .75, .85, .95, 1))))
  pplCut <- cut(pplCount, breaks = pplBreaks)
  pplDF <- data.frame(
    tweep = names(pplTab),
    freq = pplCount,
    freqCut = pplCut,
    stringsAsFactors = FALSE
  )
  # Plot
  networkPlot(personDF, pplDF = pplDF, ...)
}
# Whole network, without vertex labels
networkPlot(tweepDirectedDF, vertex.label = NA)
# One PNG per tweep mentioned more than 5 times
plotDir <- "NetworkPersonPlots"
# png() cannot write into a directory that does not exist yet
dir.create(plotDir, showWarnings = FALSE, recursive = TRUE)
frequentTweeps <- as.character(tweepTab$tweep[which(tweepTab$Freq > 5)])
for (p in sub("^\\@", "", frequentTweeps)) {
  cat(p, "\n")
  png(file.path(plotDir, paste0("NetworkPersonPlot_", p, ".png")))
  # A failure for one tweep is reported and the loop carries on; `finally`
  # closes the PNG device whether or not the plot succeeded.
  tryCatch(
    networkPersonPlot(tweepDirectedDF, p),
    error = function(e) {
      message("Could not plot network for ", p, ": ", conditionMessage(e))
    },
    finally = dev.off()
  )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment