Last active
December 19, 2015 00:29
-
-
Save duhaime/5868981 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import a dictionary of words with positive valence and another dictionary
# of words with negative valence. Get access to Twitter. Search Twitter for
# tweets that discuss 4 topics. Grab the text from those tweets. Combine the
# tweets into one data set. Look for words with positive and negative valence
# in those tweets. Subtract the count of words with negative valence from the
# count of words with positive valence to get the net subjective response to
# each topic of interest. Create graphs.
# Code by Gaston Sanchez, edited by Douglas Duhaime
library(twitteR) | |
library(plyr) | |
library(stringr) | |
library(ROAuth) | |
library(ggplot2) | |
# load the two sentiment word lists; readLines() returns a character vector
# with one element per line of the file
pos <- readLines(con = "positive_words.txt")
neg <- readLines(con = "negative_words.txt")
# get login credentials
# endpoints handed to ROAuth for the OAuth handshake with the Twitter API
requestURL <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL <- "https://api.twitter.com/oauth/authorize"
# placeholders: fill in the key and secret of your own Twitter application
consumerKey <- "*******"
consumerSecret <- "*******"
twitCred <- OAuthFactory$new(
  consumerKey = consumerKey,
  consumerSecret = consumerSecret,
  requestURL = requestURL,
  accessURL = accessURL,
  authURL = authURL
)
# download the CA certificate bundle that is passed as `cainfo` below.
# The bundle is fetched over HTTPS: a set of trust anchors retrieved over
# plain HTTP could be replaced in transit, which would defeat the
# certificate verification it is meant to enable.
download.file(
  url = "https://curl.se/ca/cacert.pem",
  destfile = "cacert.pem"
)
twitCred$handshake(cainfo = "cacert.pem")
# NOTE(review): registerTwitterOAuth() may be unavailable in newer twitteR
# releases -- confirm against the installed version
registerTwitterOAuth(twitCred)
# tweets with drinks
# reduced sample size to take it easy on ol' bessie
wine_tweets <- searchTwitter("wine", n = 50, lang = "en", cainfo = "cacert.pem")
beer_tweets <- searchTwitter("beer", n = 50, lang = "en", cainfo = "cacert.pem")
cofe_tweets <- searchTwitter("coffee", n = 50, lang = "en", cainfo = "cacert.pem")
soda_tweets <- searchTwitter("soda", n = 50, lang = "en", cainfo = "cacert.pem")
# get text
# vapply() is used instead of sapply() so each result is a character vector
# even when a search comes back empty; sapply() would return an empty list
# there, which would turn the joined `drinks` vector below into a list
wine_txt <- vapply(wine_tweets, function(x) x$getText(), character(1))
beer_txt <- vapply(beer_tweets, function(x) x$getText(), character(1))
cofe_txt <- vapply(cofe_tweets, function(x) x$getText(), character(1))
soda_txt <- vapply(soda_tweets, function(x) x$getText(), character(1))
# how many tweets of each drink
nd <- c(length(wine_txt), length(beer_txt), length(cofe_txt), length(soda_txt))
# join texts
drinks <- c(wine_txt, beer_txt, cofe_txt, soda_txt)
# strip joined texts of non-standard characters
# The texts have to stay a plain character vector, because score.sentiment()
# runs gsub() on each element and stores its input in a data frame. Bytes
# that cannot be represented in ASCII are therefore dropped with iconv()
# instead of wrapping the texts in a tm Corpus (tm is not loaded here).
# NOTE(review): from = "UTF-8" assumes twitteR returns UTF-8 text -- confirm
drinks <- iconv(drinks, from = "UTF-8", to = "ASCII", sub = "")
# function score.sentiment
# Scores each sentence as (number of its words found in pos.words) minus
# (number of its words found in neg.words).
#
# Parameters
# sentences: vector of text to score
# pos.words: vector of words of positive sentiment
# neg.words: vector of words of negative sentiment
# .progress: passed to laply() to control the progress bar
#
# Returns
# a data frame with one row per sentence and two columns: `text` (the
# sentence as it was passed in) and `score` (positive minus negative hits)
score.sentiment <- function(sentences, pos.words, neg.words, .progress = "none")
{
  # lower-case x, yielding NA instead of stopping when tolower() fails
  try_tolower <- function(x) {
    tryCatch(tolower(x), error = function(e) NA_character_)
  }

  # score a single sentence against the two dictionaries
  score_one <- function(sentence, pos.words, neg.words) {
    # remove punctuation
    sentence <- gsub("[[:punct:]]", "", sentence)
    # control characters (newlines, tabs, ...) separate words, so they are
    # turned into a space; deleting them would glue neighbouring words
    # together ("great\nwine" -> "greatwine") and hide dictionary matches
    sentence <- gsub("[[:cntrl:]]", " ", sentence)
    # remove digits
    sentence <- gsub("\\d+", "", sentence)
    sentence <- vapply(sentence, try_tolower, character(1), USE.NAMES = FALSE)
    # split sentence into words with str_split (stringr package)
    words <- unlist(str_split(sentence, "\\s+"))
    # leading/trailing whitespace produces empty tokens; drop them (and NA
    # tokens from a failed tolower) so that a blank line in a dictionary
    # read with readLines() can never count as a match
    words <- words[!is.na(words) & nzchar(words)]
    # final score: positive hits minus negative hits
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }

  # create simple array of scores with laply
  scores <- laply(sentences, score_one, pos.words, neg.words,
                  .progress = .progress)
  # data frame with scores for each sentence
  data.frame(text = sentences, score = scores)
}
# score every tweet, showing a text progress bar while doing so
scores <- score.sentiment(drinks, pos, neg, .progress = "text")
# label each row with its drink; nd holds the tweet count per drink, in the
# same wine / beer / coffee / soda order in which the texts were joined
scores[["drink"]] <- factor(rep(c("wine", "beer", "coffee", "soda"), times = nd))
# 0/1 flags for strongly positive (>= 2) and strongly negative (<= -2) tweets
scores[["very.pos"]] <- as.numeric(scores[["score"]] >= 2)
scores[["very.neg"]] <- as.numeric(scores[["score"]] <= -2)
# totals of strongly positive and strongly negative tweets
numpos <- sum(scores[["very.pos"]])
numneg <- sum(scores[["very.neg"]])
# fill colour for each drink, keyed by drink name
cols <- c(
  beer = "#7CAE00",
  coffee = "#00BFC4",
  soda = "#F8766D",
  wine = "#C77CFF"
)
# boxplot of the per-tweet scores, one box per drink, with the individual
# tweets drawn on top as jittered points
# (Duhaime: title is set with labs() rather than opts())
ggplot(data = scores, mapping = aes(x = drink, y = score, group = drink)) +
  geom_boxplot(mapping = aes(fill = drink)) +
  geom_jitter(
    alpha = 0.3,
    colour = "gray40",
    position = position_jitter(width = 0.2)
  ) +
  scale_fill_manual(values = cols) +
  labs(title = "Boxplot - Drink's Sentiment Scores")
# average score
# mean sentiment score per drink
meanscore <- tapply(scores$score, scores$drink, mean)
df <- data.frame(drink = names(meanscore), meanscore = meanscore)
# order the drinks by their mean score so the bars are sorted
df$drinks <- reorder(df$drink, df$meanscore)
# plot
# stat = "identity" makes the bar height the supplied y value; without it
# geom_bar() counts rows and does not accept a y aesthetic.
# labs() + theme() replace opts(), which is defunct in ggplot2.
ggplot(df, aes(x = drinks, y = meanscore, fill = drinks)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = cols[order(df$meanscore)]) +
  labs(title = "Average Sentiment Score") +
  theme(legend.position = "none")
# average very positive
# share of tweets per drink flagged as very positive (mean of the 0/1 flag)
drink_pos <- ddply(scores, .(drink), summarise, mean_pos = mean(very.pos))
# order the drinks by that share so the bars are sorted
drink_pos$drinks <- reorder(drink_pos$drink, drink_pos$mean_pos)
# plot
# stat = "identity" makes the bar height the supplied y value; without it
# geom_bar() counts rows and does not accept a y aesthetic.
# labs() + theme() replace opts(), which is defunct in ggplot2.
ggplot(drink_pos, aes(x = drinks, y = mean_pos, fill = drinks)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = cols[order(drink_pos$mean_pos)]) +
  labs(title = "Average Very Positive Sentiment Score") +
  theme(legend.position = "none")
# average very negative
# share of tweets per drink flagged as very negative (mean of the 0/1 flag)
drink_neg <- ddply(scores, .(drink), summarise, mean_neg = mean(very.neg))
# order the drinks by that share so the bars are sorted
drink_neg$drinks <- reorder(drink_neg$drink, drink_neg$mean_neg)
# plot
# stat = "identity" makes the bar height the supplied y value; without it
# geom_bar() counts rows and does not accept a y aesthetic.
# labs() + theme() replace opts(), which is defunct in ggplot2.
ggplot(drink_neg, aes(x = drinks, y = mean_neg, fill = drinks)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = cols[order(drink_neg$mean_neg)]) +
  labs(title = "Average Very Negative Sentiment Score") +
  theme(legend.position = "none")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment