Skip to content

Instantly share code, notes, and snippets.

@emhart
Created August 6, 2012 08:10
Show Gist options
  • Save emhart/3272216 to your computer and use it in GitHub Desktop.
Save emhart/3272216 to your computer and use it in GitHub Desktop.
Code for ESA text mining
#include libraries
require(twitteR)
require(ggplot2)
require(tm)
require(wordcloud)
require(RColorBrewer)
require(reshape)
# Without OAuth only 1500 tweets are accessible, so request exactly that cap.
twit.data <- searchTwitter("#ESA2012", n = 1500)
# Pull the text of every tweet into a character vector. vapply() pins the
# result to character even when the search returns nothing, whereas sapply()
# would hand back an empty list in that case.
twit.vec <- vapply(twit.data, function(tweet) tweet$getText(), character(1))
# Build a tm corpus with one document per tweet.
esa.corpus <- Corpus(VectorSource(twit.vec))
# Clean the text: strip punctuation, lower-case, then drop the conference
# tags and English stop words. The word list is forwarded to removeWords()
# through tm_map()'s extra arguments.
# NOTE(review): newer tm releases may need content_transformer(tolower) for
# the lower-casing step -- confirm against the installed tm version.
esa.corpus <- tm_map(esa.corpus, removePunctuation)
esa.corpus <- tm_map(esa.corpus, tolower)
esa.corpus <- tm_map(
  esa.corpus,
  removeWords,
  c("esa", "esaorg", "esa2012", "#esa2012", stopwords("english"))
)
# Term-document matrix -> dense matrix -> per-term totals, largest first.
esa.tdm <- TermDocumentMatrix(esa.corpus)
esa.mat <- as.matrix(esa.tdm)
word_freqs <- sort(rowSums(esa.mat), decreasing = TRUE)
# One row per term together with its total count across all tweets.
dm <- data.frame(word = names(word_freqs), freq = word_freqs)
### Draw the word cloud from the term/frequency table built above.
### Only terms counted at least 5 times are passed the min.freq cut-off.
wordcloud(
  words = dm$word,
  freq = dm$freq,
  scale = c(4, 0.5),
  min.freq = 5,
  max.words = Inf,
  random.order = FALSE,
  rot.per = 0.15,
  colors = brewer.pal(8, "Dark2")
)
#### Extract the per-tweet user data
# Number of tweets returned by the search.
d.size <- length(twit.data)
##### Build the data frame column by column from twit.data (the same list
##### d.size was measured on). vapply() fixes each column's type and copes
##### with an empty result, which a 1:d.size loop would not.
tweet.df <- data.frame(
  TweetID = vapply(twit.data, function(tw) as.character(tw$id), character(1)),
  ScreenName = vapply(twit.data, function(tw) tw$screenName, character(1)),
  Text = vapply(twit.data, function(tw) tw$text, character(1)),
  # Collect the creation times as epoch seconds, then rebuild them as POSIXct
  # in GMT so the column keeps a real date-time class instead of being
  # coerced to character alongside the text fields.
  Timestamp = as.POSIXct(
    vapply(twit.data, function(tw) as.numeric(tw$created), numeric(1)),
    origin = "1970-01-01",
    tz = "GMT"
  ),
  stringsAsFactors = FALSE
)
# Count tweets per user and keep only users with more than 3 tweets.
# The column is spelled out in full rather than relying on `$` partial
# matching of "Screen" against "ScreenName".
tweet.count <- table(tweet.df$ScreenName)
tweet.count <- tweet.count[tweet.count > 3]
# Reshape the count table into a two-column data frame.
tc.df <- melt(tweet.count)
colnames(tc.df) <- c("ScreenName", "Count")
# Show the most active users first.
tc.df[order(tc.df$Count, decreasing = TRUE), ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment