R function to create a comparative word cloud of two Twitter hashtags, as introduced here: http://www.drewconway.com/zia/?p=2624
# File-Name: twitter_word_cloud.R
# Date: 2011-01-30
# Author: Drew Conway
# Email: drew.conway@nyu.edu
# Purpose: Create a comparative word cloud of two twitter hashtags
# Data Used:
# Packages Used: twitteR, tm, ggplot2
# Output File: Hashtag word cloud
# Data Output:
# Machine: Drew Conway's MacBook Pro
# Edit: by HB; wrapped hash.corpus definition in a function to robustify against non-UTF-8 encoding problems per http://tm.r-forge.r-project.org/faq.html#Encoding
# Copyright (c) 2011, under the Simplified BSD License.
# For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php
# All rights reserved.
# This function takes two Twitter hashtags and creates a comparative hashtag word
# cloud, as first introduced here: http://www.drewconway.com/zia/?p=2624
comparative.wordcloud <- function(hashtag1, hashtag2, file.path, n=100, add.stops=c()) {
    ### Parameters ###
    # hashtag1:  First Twitter hashtag to search for (do not include '#' or spaces)
    # hashtag2:  Second Twitter hashtag to search for (do not include '#' or spaces)
    # file.path: File path for saving the word cloud (uses ggsave, so the file type
    #            must be supported by ggplot2; for supported types see ?ggsave)
    # n:         Number of tweets to download for each hashtag (default is 100)
    # add.stops: Additional stop words to purge from the corpora

    # This helper function takes some number of spaces and returns a vector of
    # continuous values, evenly spaced and centered around zero. It is used to
    # minimize over-plotting on the y-axis and maximize the readability of the plot.
    optimal.spacing <- function(spaces) {
        if(spaces > 1) {
            spacing <- 1/spaces
            if(spaces %% 2 > 0) {
                lim <- spacing * floor(spaces/2)
                return(seq(-lim, lim, spacing))
            }
            else {
                lim <- spacing * (spaces-1)
                return(seq(-lim, lim, spacing*2))
            }
        }
        else {
            return(0)
        }
    }
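    # A quick check of the helper, traced through the branches above:
    #   optimal.spacing(1) -> 0
    #   optimal.spacing(3) -> -1/3, 0, 1/3             (odd count: centered on zero)
    #   optimal.spacing(4) -> -0.75, -0.25, 0.25, 0.75 (even count: straddling zero)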
    ### 0) First check for all required packages, and load
    if (!require(twitteR)) install.packages('twitteR', dependencies=TRUE)
    library(twitteR)
    if (!require(tm)) install.packages('tm', dependencies=TRUE)
    library(tm)
    if (!require(ggplot2)) install.packages('ggplot2', dependencies=TRUE)
    library(ggplot2)
    ### 1) Download recent tweets with each hashtag and create a text Corpus
    # with a separate document to text-mine for each hashtag
    hashtag1 <- gsub("[# ]", "", hashtag1)  # Strip out the '#' and spaces to
    hashtag2 <- gsub("[# ]", "", hashtag2)  # make searching work properly
    hash1.search <- searchTwitter(paste("#", hashtag1, sep=""), n=n)
    hash2.search <- searchTwitter(paste("#", hashtag2, sep=""), n=n)
    # Check that both searches returned some results; test the length first so an
    # empty result list does not throw a subscript error before the class check
    if(length(hash1.search) < 1 || class(hash1.search[[1]]) != "status") {
        stop(paste("No search results returned for: ", hashtag1, sep=""))
    }
    if(length(hash2.search) < 1 || class(hash2.search[[1]]) != "status") {
        stop(paste("No search results returned for: ", hashtag2, sep=""))
    }
    cat("Tweets downloaded\n")
    hash1.text <- unique(sapply(hash1.search, statusText))  # Due to retweeting,
    hash2.text <- unique(sapply(hash2.search, statusText))  # we strip repeats
    # Combine texts into a single vector and create a corpus
    text.vec <- c(paste(hash1.text, collapse=" "), paste(hash2.text, collapse=" "))
    hash.corpus <- tm_map(Corpus(VectorSource(text.vec)), function(x) iconv(enc2utf8(x), sub="byte"))
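    # (sub="byte" makes iconv substitute any byte it cannot convert with its hex
    # code, e.g. "<e9>", rather than returning NA and breaking the term-document
    # matrix construction below)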
    ### 2) Clean data, create a Term-Document matrix, and convert it to a data frame
    add.stops <- c(add.stops, "RT", hashtag1, hashtag2)
    hash.control <- list(stopwords=c(stopwords(), add.stops), removeNumbers=TRUE, removePunctuation=TRUE)
    hash.matrix <- TermDocumentMatrix(hash.corpus, control=hash.control)
    # Create a data frame from the matrix, keeping only terms used under both hashtags
    # (as.matrix avoids inspect()'s side effect of printing the whole matrix)
    hash.df <- as.data.frame(as.matrix(hash.matrix))
    names(hash.df) <- c("hash1.freq", "hash2.freq")
    hash.df <- subset(hash.df, hash1.freq > 0 & hash2.freq > 0)
    hash.df <- transform(hash.df, freq.dif=hash1.freq-hash2.freq)
    cat("Text analysis complete\n")
    ### 3) Set up data for visualization
    # Create separate data frames for each frequency type
    hash1.df <- subset(hash.df, freq.dif > 0)   # Said more often in the first hashtag
    hash2.df <- subset(hash.df, freq.dif < 0)   # Said more often in the second hashtag
    equal.df <- subset(hash.df, freq.dif == 0)  # Said equally often
    # Check that there is some overlap
    if(nrow(hash1.df) < 1 | nrow(hash2.df) < 1) {
        stop("These two hashtags are too dissimilar; there would be no data to plot :(")
    }
    # Get spacing for each frequency type
    hash1.spacing <- sapply(table(hash1.df$freq.dif), function(x) optimal.spacing(x))
    hash2.spacing <- sapply(table(hash2.df$freq.dif), function(x) optimal.spacing(x))
    equal.spacing <- sapply(table(equal.df$freq.dif), function(x) optimal.spacing(x))
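    # table() counts how many terms share each distinct freq.dif value, so each
    # element above is a vector of y-offsets for the terms that stack up at the
    # same x position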
    # Add spacing to the data frames ('nm' iterates over the freq.dif values,
    # avoiding a clash with the tweet-count parameter n)
    hash1.optim <- rep(0, nrow(hash1.df))
    for(nm in names(hash1.spacing)) {
        hash1.optim[which(hash1.df$freq.dif == as.numeric(nm))] <- hash1.spacing[[nm]]
    }
    hash1.df <- transform(hash1.df, Spacing=hash1.optim, Term=row.names(hash1.df))
    hash2.optim <- rep(0, nrow(hash2.df))
    for(nm in names(hash2.spacing)) {
        hash2.optim[which(hash2.df$freq.dif == as.numeric(nm))] <- hash2.spacing[[nm]]
    }
    hash2.df <- transform(hash2.df, Spacing=hash2.optim, Term=row.names(hash2.df))
    equal.df <- transform(equal.df, Spacing=as.vector(equal.spacing), Term=row.names(equal.df))
    ### 4) Create visualization with ggplot2
    # Set up x-axis scaling and labels
    x.break.min <- min(hash2.df$freq.dif)
    x.break.max <- max(hash1.df$freq.dif)
    x.min <- x.break.min - (.1 * min(hash1.df$freq.dif))
    x.max <- x.break.max + (.1 * max(hash1.df$freq.dif))
    x.labs <- c(paste("Tweeted more in #", hashtag2, sep=""), "Tweeted equally",
                paste("Tweeted more in #", hashtag1, sep=""))
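    # NB: the scales below use the ggplot2 0.8.x API current in 2011; in modern
    # ggplot2 the equivalents are scale_size(range=...),
    # scale_colour_gradient(guide="none"), and theme()/element_blank()/ggtitle()
    # in place of opts(), theme_blank(), and the title= argument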
    # Create the ggplot2 object and save the plot
    word.cloud <- ggplot(hash1.df, aes(x=freq.dif, y=Spacing)) +
        geom_text(aes(size=hash1.freq, label=Term, colour=freq.dif)) +
        geom_text(data=hash2.df, aes(x=freq.dif, y=Spacing, label=Term, size=hash2.freq, color=freq.dif)) +
        geom_text(data=equal.df, aes(x=freq.dif, y=Spacing, label=Term, size=hash1.freq, color=freq.dif)) +
        scale_size(to=c(3,11), name="Word Frequency") +
        scale_colour_gradient(low="darkred", high="darkblue", legend=FALSE) +
        scale_x_continuous(limits=c(x.min, x.max), breaks=c(x.break.min, 0, x.break.max), labels=x.labs) +
        scale_y_continuous(breaks=c(0), labels=c("")) +
        xlab("") + ylab("") + theme_bw() +
        opts(panel.grid.major=theme_blank(), panel.grid.minor=theme_blank(),
             title=paste("Twitter Hashtag Word Cloud 2.0: #", hashtag1, " vs. #", hashtag2, sep=""))
    ggsave(plot=word.cloud, filename=file.path, width=13, height=7)
    cat(paste("Word cloud saved to:", file.path, "\n"))
    # Return the term-frequency data frame
    return(hash.df)
}
# Example comparing #strataconf and #rstats
ht1 <- "strataconf"  # Hashtags to compare
ht2 <- "rstats"
hash.data <- comparative.wordcloud(ht1, ht2, paste(ht1, "_", ht2, ".png", sep=""), n=100)
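# A second, sketched call (these hashtags and the extra stop word are purely
# illustrative), showing add.stops for filtering noise words shared by both tags:
# hash.data2 <- comparative.wordcloud("bigdata", "datascience",
#                                     "bigdata_vs_datascience.pdf",
#                                     n=150, add.stops=c("data"))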