# R Tweets Example
# download tweets using the Twitter search API, then count positive and negative words in the text to get a sentiment score
# minor modifications to the code taken from
# http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/
library(RCurl)
library(twitteR)
library(ROAuth)
requestURL <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL <- "https://api.twitter.com/oauth/authorize"
consumerKey = "someKey"
consumerSecret = "someSecret"
Cred <- OAuthFactory$new(consumerKey=consumerKey,
                         consumerSecret=consumerSecret,
                         requestURL=requestURL,
                         accessURL=accessURL,
                         authURL=authURL)
# The next command prints a URL which you will need to copy and paste into your favourite browser
# Assuming you are logged into Twitter you will then be given a PIN to type into the R command line
# This doesn't work in RStudio at the time of writing - use RGui and, once done, save the credentials;
# from then on you can use RStudio (thank goodness)
Cred$handshake(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl") )
# Checks that you are authorised
registerTwitterOAuth(Cred)
# from http://stackoverflow.com/questions/9916283/twitter-roauth-and-windows-register-ok-but-certificate-verify-failed
# save to reuse credentials in future
save(list="Cred", file="twitteR_credentials")
load("twitteR_credentials")
registerTwitterOAuth(Cred)
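# a minimal sketch for later sessions, assuming the credentials file above sits
# in the working directory - no handshake needed:
if (file.exists("twitteR_credentials")) {
  load("twitteR_credentials")
  registerTwitterOAuth(Cred)
}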
# notice you need to add the full path to the cacert file - at least I did...
cacert <- "C:\\Users\\Bohdan\\Documents\\R\\win-library\\3.0\\RCurl\\CurlSSL\\cacert.pem"
va.tweets = searchTwitter('@virginatlantic', n = 1500, cainfo = cacert)
aa.tweets = searchTwitter('@AmericanAir', n = 1500, cainfo = cacert)
library(plyr)
va.text = laply(va.tweets, function(t) t$getText() )
aa.text = laply(aa.tweets, function(t) t$getText() )
# hmmm, R doesn't like funny characters so strip them out
va.text = gsub("[^[:alnum:][:space:]]", "", va.text)
aa.text = gsub("[^[:alnum:][:space:]]", "", aa.text)
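# an alternative sketch: strip the characters by converting encoding instead
# (base R's iconv; sub="" drops anything unrepresentable):
# va.text <- iconv(va.text, to = "ASCII", sub = "")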
# positive and negative word lists from Hu & Liu's opinion lexicon
hu.liu.pos.words = scan('c:\\wd\\positive-words.txt', what='character', comment.char=';')
hu.liu.neg.words = scan('c:\\wd\\negative-words.txt', what='character', comment.char=';')
# some multi-word phrases I've noticed that could be substituted in (a sketch follows this list):
# lift your game
# i am unable
# oh gee thats just fantastic
# site down again
# still waiting on response
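# a rough sketch of folding those phrases in, in the same style as
# substitute.emoticons below - the phrase-to-token mapping and the 'badphrase'
# token are just my guesses, not part of the original lexicon:
substitute.phrases = function(text) {
  text <- gsub('lift your game', 'badphrase', text, ignore.case = TRUE)
  text <- gsub('i am unable', 'badphrase', text, ignore.case = TRUE)
  text <- gsub('still waiting on response', 'badphrase', text, ignore.case = TRUE)
  return(text)
}
# 'badphrase' would then need appending to neg.words, alongside 'unsmiley'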
# add in emoticons
pos.words = c(hu.liu.pos.words, 'smiley')
neg.words = c(hu.liu.neg.words, 'unsmiley')
substitute.emoticons = function(text) {
  smiled <- gsub(':\\)', 'smiley', text)
  return(gsub(':\\(', 'unsmiley', smiled))
}
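# quick check of the emoticon substitution:
# substitute.emoticons("love the upgrade :) but not the delay :(")
# [1] "love the upgrade smiley but not the delay unsmiley"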
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
  require(plyr)
  require(stringr)
  # we got a vector of sentences. plyr will handle a list
  # or a vector as an "l" for us
  # we want a simple array ("a") of scores back, so we use
  # "l" + "a" + "ply" = "laply":
  scores = laply(sentences, function(sentence, pos.words, neg.words) {
    # clean up sentences with R's regex-driven global substitute, gsub():
    sentence = gsub('[[:punct:]]', '', sentence)
    sentence = gsub('[[:cntrl:]]', '', sentence)
    sentence = gsub('\\d+', '', sentence)
    # and convert to lower case:
    sentence = tolower(sentence)
    # split into words. str_split is in the stringr package
    word.list = str_split(sentence, '\\s+')
    # sometimes a list() is one level of hierarchy too much
    words = unlist(word.list)
    # compare our words to the dictionaries of positive & negative terms
    pos.matches = match(words, pos.words)
    neg.matches = match(words, neg.words)
    # match() returns the position of the matched term or NA
    # we just want a TRUE/FALSE:
    pos.matches = !is.na(pos.matches)
    neg.matches = !is.na(neg.matches)
    # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
    score = sum(pos.matches) - sum(neg.matches)
    return(score)
  }, pos.words, neg.words, .progress=.progress )
  scores.df = data.frame(score=scores, text=sentences)
  return(scores.df)
}
test = c("you're awesome and I love you :)", "you're awful and I hate you. This is terrible service.", "I'm both impressed and amazed")
result = score.sentiment(substitute.emoticons(test), pos.words, neg.words)
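# the first and third test sentences should score positive, the second negative
result$score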
# hmmm, I've had to clean the punctuation characters out - some tweets were throwing errors otherwise
# they didn't have to do this in the original...
va.scores = score.sentiment(substitute.emoticons(va.text), pos.words, neg.words, .progress='text')
aa.scores = score.sentiment(substitute.emoticons(aa.text), pos.words, neg.words, .progress='text')
library(ggplot2)
# add identifying variable for each airline
va.scores["Company"] <- "Virgin Atlantic" # one way
aa.scores$Company = 'American Airlines' # another
# combine scores and plot stacked
all.scores = rbind( aa.scores, va.scores)
ggplot(data = all.scores) +
geom_bar(mapping = aes(x=score, fill=Company), binwidth=1 ) +
facet_grid(Company~.) +
theme_bw()
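# and a quick numeric summary per airline - a sketch using plyr's ddply
# (plyr is already loaded):
ddply(all.scores, .(Company), summarise, mean.score = mean(score), n = length(score))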
# let's explore word associations using tm: http://rdatamining.wordpress.com/2011/11/09/using-text-mining-to-find-out-what-rdatamining-tweets-are-about/
library(tm)
# build corpus
cp <- Corpus(VectorSource(all.scores$text))
# clean
cp <- tm_map(cp, tolower)
cp <- tm_map(cp, removePunctuation)
cp <- tm_map(cp, removeNumbers)
# tm includes stopwords lists for languages but it misses some important common examples
customStopwords <- c(stopwords('english'), "available", "via")
# feel free to view stopwords for some other languages eg dutch, french, italian, german
cp <- tm_map(cp, removeWords, customStopwords)
# stem if necessary but you get some oddities eg mining stems to mine
# requires packages Snowball, RWeka, rJava, RWekajars
# example of what it can do for better or worse: stemDocument(c("mining", "miners"))
#cp <- tm_map(cp, stemDocument)
# build the term document matrix
td <- TermDocumentMatrix(cp, control = list(minWordLength=1))
# we have a LOT of sparse terms... let's get rid of them
tds <- removeSparseTerms(td, 0.99)
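# sanity check: compare term counts before and after dropping sparse terms
dim(td)
dim(tds)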
# what's getting mentioned most?
findFreqTerms(tds, lowfreq = 10)
# let's uncover some associations
findAssocs(tds, 'awesome', 0.05)
library(wordcloud)
m <- as.matrix(tds)
v <- sort(rowSums(m), decreasing=TRUE)
cloudNames <- names(v)
d <- data.frame(word=cloudNames, freq=v)
wordcloud(d$word, d$freq, min.freq=10)
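# optionally a tidier cloud - a sketch assuming the RColorBrewer package is installed:
library(RColorBrewer)
wordcloud(d$word, d$freq, min.freq=10, max.words=100,
          random.order=FALSE, colors=brewer.pal(8, "Dark2"))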
inspect(tds[1:10,1:10])
# let's examine term adjacency and plot a graph of connectivity: http://rdatamining.wordpress.com/2012/05/17/an-example-of-social-network-analysis-with-r-using-package-igraph/
# transform sparse termDocumentMatrix to an actual matrix, make boolean, then transform to adjacency matrix
tdsm <- as.matrix(tds)
tdsm[tdsm>=1] <- 1
termMatrix <- tdsm %*% t(tdsm)
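# toy illustration of the co-occurrence step (hypothetical 3 terms x 2 docs):
# after the boolean step, (m %*% t(m))[i,j] counts the documents in which
# terms i and j appear together
toy <- matrix(c(1,0, 1,1, 0,1), nrow=3, byrow=TRUE,
              dimnames=list(c("delay","bags","thanks"), c("doc1","doc2")))
toy %*% t(toy)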
library(igraph)
g <- graph.adjacency(termMatrix, weighted=TRUE, mode="undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
plot(g, layout=layout.kamada.kawai)
tkplot(g, layout=layout.kamada.kawai)
V(g)$label.cex <- 2.2 * V(g)$degree / max(V(g)$degree) + .2
V(g)$label.color <- rgb(0, 0, .2, .8)
V(g)$frame.color <- NA
egam <- (log(E(g)$weight)+.4) / max(log(E(g)$weight)+.4)
E(g)$color <- rgb(.5, .5, 0, egam)
E(g)$width <- egam
plot(g, layout=layout1)
tkplot(g, layout=layout1)
# plot correlations between frequent terms (tm's plot method for a TermDocumentMatrix - it needs the Rgraphviz package)
plot(tds, terms = findFreqTerms(tds, lowfreq = 5), corThreshold = 0.5)