# duhaime/twitter_sentiment_classification_in_R

## twitter_sentiment_classification_in_R
# Import a dictionary of words with positive valence and another dictionary
# of words with negative valence. Get access to Twitter. Search Twitter for
# tweets that discuss 4 topics. Grab the text from those tweets. Combine
# tweets into database. Look for words with positive and negative valence
# in those tweets. Subtract words with negative valence from words with
# positive valence to get net subjective response to topic of interest.
# Create graphs. Code by Gaston Sanchez, edited by Douglas Duhaime

library(twitteR)
library(plyr)
library(stringr)
library(ROAuth)
library(ggplot2)

# import positive and negative words
# (readLines() returns one element per line of each dictionary file)
pos <- readLines("positive_words.txt")
neg <- readLines("negative_words.txt")

# get login credentials
requestURL <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL <- "https://api.twitter.com/oauth/authorize"
# Prefer credentials from the environment so real keys need not be saved in
# this file; the placeholder strings remain the fallback when the variables
# are not set.
consumerKey <- Sys.getenv("TWITTER_CONSUMER_KEY", unset = "*******")
consumerSecret <- Sys.getenv("TWITTER_CONSUMER_SECRET", unset = "*******")
twitCred <- OAuthFactory$new(consumerKey = consumerKey,
                             consumerSecret = consumerSecret,
                             requestURL = requestURL,
                             accessURL = accessURL,
                             authURL = authURL)
# Fetch the CA certificate bundle over HTTPS: a bundle obtained over plain
# HTTP could be tampered with in transit, which would undermine every TLS
# check that later relies on it.
download.file(url = "https://curl.se/ca/cacert.pem",
              destfile = "cacert.pem")
twitCred$handshake(cainfo = "cacert.pem")
registerTwitterOAuth(twitCred)

# tweets with drinks
# reduced sample size to take it easy on ol' bessie
wine_tweets <- searchTwitter("wine", n = 50, lang = "en", cainfo = "cacert.pem")
beer_tweets <- searchTwitter("beer", n = 50, lang = "en", cainfo = "cacert.pem")
cofe_tweets <- searchTwitter("coffee", n = 50, lang = "en", cainfo = "cacert.pem")
soda_tweets <- searchTwitter("soda", n = 50, lang = "en", cainfo = "cacert.pem")

# get text
# vapply() always yields a character vector, even for an empty result list
# (sapply() would return list() there and turn `drinks` into a list).
tweet_text <- function(tweets) {
  vapply(tweets, function(x) x$getText(), character(1))
}
wine_txt <- tweet_text(wine_tweets)
beer_txt <- tweet_text(beer_tweets)
cofe_txt <- tweet_text(cofe_tweets)
soda_txt <- tweet_text(soda_tweets)

# how many tweets of each drink
nd <- c(length(wine_txt), length(beer_txt), length(cofe_txt), length(soda_txt))

# join texts
drinks <- c(wine_txt, beer_txt, cofe_txt, soda_txt)

# strip joined texts of non-standard characters
# Keeps `drinks` a plain character vector (which score.sentiment() expects)
# and drops every byte that cannot be represented in ASCII.
# NOTE(review): assumes the tweet text is UTF-8 encoded -- confirm.
drinks <- iconv(drinks, from = "UTF-8", to = "ASCII", sub = "")

# function score.sentiment
#
# Scores every element of `sentences` as
#   (number of its words found in pos.words) -
#   (number of its words found in neg.words)
#
# Parameters
# sentences: vector of text to score
# pos.words: vector of words of positive sentiment
# neg.words: vector of words of negative sentiment
# .progress: passed to laply() to control the progress bar
#
# Returns a data frame with one row per element of `sentences`:
#   text:  the input text
#   score: positive matches minus negative matches
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none')
{
  # lower-case one string, giving NA instead of stopping when tolower()
  # signals an error; tolower() is evaluated only once
  try_tolower <- function(x) {
    tryCatch(tolower(x), error = function(e) NA_character_)
  }

  # score a single piece of text
  score_one <- function(sentence, pos.words, neg.words) {
    # remove punctuation
    sentence <- gsub("[[:punct:]]", "", sentence)
    # remove control characters
    sentence <- gsub("[[:cntrl:]]", "", sentence)
    # remove digits
    sentence <- gsub("\\d+", "", sentence)

    sentence <- vapply(sentence, try_tolower, character(1), USE.NAMES = FALSE)

    # split sentence into words with str_split (stringr package)
    words <- unlist(str_split(sentence, "\\s+"))
    # Leading/trailing whitespace produces "" tokens and a failed tolower()
    # produces NA; neither is a word, and "" would otherwise match any blank
    # line present in a dictionary.
    words <- words[!is.na(words) & nzchar(words)]

    # count dictionary hits; %in% gives the TRUE/FALSE we want directly
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }

  # create simple array of scores with laply
  scores <- laply(sentences, score_one, pos.words, neg.words,
                  .progress = .progress)

  # data frame with scores for each sentence
  data.frame(text = sentences, score = scores)
}


# score every collected tweet (text progress bar while it runs)
scores <- score.sentiment(drinks, pos, neg, .progress = "text")

# label each row with its drink: labels are repeated by the per-drink
# tweet counts held in nd
scores$drink <- factor(rep(c("wine", "beer", "coffee", "soda"), times = nd))
# 1/0 indicators for scores of at least +2 / at most -2
scores$very.pos <- as.numeric(scores$score >= 2)
scores$very.neg <- as.numeric(scores$score <= -2)

# totals of very positive and very negative tweets
numpos <- sum(scores$very.pos)
numneg <- sum(scores$very.neg)

# fill colour per drink, keyed by drink name
cols <- c(beer = "#7CAE00",
          coffee = "#00BFC4",
          soda = "#F8766D",
          wine = "#C77CFF")

# boxplot of scores per drink with the individual tweets jittered on top
# (title set with labs() rather than opts())
ggplot(scores, aes(x = drink, y = score, group = drink)) +
  geom_boxplot(aes(fill = drink)) +
  scale_fill_manual(values = cols) +
  geom_jitter(colour = "gray40",
              position = position_jitter(width = 0.2),
              alpha = 0.3) +
  labs(title = "Boxplot - Drink's Sentiment Scores")

# Bar charts of per-drink summaries.
# Notes shared by the three plots below:
#  - stat = "identity" makes geom_bar() draw the precomputed y values; its
#    default stat counts rows and rejects a mapped y aesthetic.
#  - opts() is no longer part of ggplot2; the title goes through labs() and
#    the legend setting through theme().
#  - `cols` is a named vector, so scale_fill_manual() pairs colours with
#    drinks by name and no manual reordering is required.

# average score
meanscore <- tapply(scores$score, scores$drink, mean)
df <- data.frame(drink = names(meanscore), meanscore = meanscore)
# order the bars by ascending mean score
df$drinks <- reorder(df$drink, df$meanscore)
# plot
ggplot(df, aes(y = meanscore)) +
  geom_bar(aes(x = drinks, fill = drinks), stat = "identity") +
  scale_fill_manual(values = cols) +
  labs(title = "Average Sentiment Score") +
  theme(legend.position = "none")

# average very positive
drink_pos <- ddply(scores, .(drink), summarise, mean_pos = mean(very.pos))
drink_pos$drinks <- reorder(drink_pos$drink, drink_pos$mean_pos)
# plot
ggplot(drink_pos, aes(y = mean_pos)) +
  geom_bar(aes(x = drinks, fill = drinks), stat = "identity") +
  scale_fill_manual(values = cols) +
  labs(title = "Average Very Positive Sentiment Score") +
  theme(legend.position = "none")

# average very negative
drink_neg <- ddply(scores, .(drink), summarise, mean_neg = mean(very.neg))
drink_neg$drinks <- reorder(drink_neg$drink, drink_neg$mean_neg)
# plot
ggplot(drink_neg, aes(y = mean_neg)) +
  geom_bar(aes(x = drinks, fill = drinks), stat = "identity") +
  scale_fill_manual(values = cols) +
  labs(title = "Average Very Negative Sentiment Score") +
  theme(legend.position = "none")
	# Import a dictionary of words with positive valence and another dictionary
	# of words with negative valence. Get access to Twitter. Search Twitter for
	# tweets that discuss 4 topics. Grab the text from those tweets. Combine
	# tweets into database. Look for words with positive and negative valence
	# in those tweets. Subtract words with negative valence from words with
	# positive valence to get net subjective response to topic of interest.
	# Create graphs. Code by Gaston Sanchez, edited by Douglas Duhaime

	library(twitteR)
	library(plyr)
	library(stringr)
	library(ROAuth)
	library(ggplot2)

	# import positive and negative words
	# (readLines() returns one element per line of each dictionary file)
	pos <- readLines("positive_words.txt")
	neg <- readLines("negative_words.txt")

	# get login credentials
	requestURL <- "https://api.twitter.com/oauth/request_token"
	accessURL <- "https://api.twitter.com/oauth/access_token"
	authURL <- "https://api.twitter.com/oauth/authorize"
	# Prefer credentials from the environment so real keys need not be saved in
	# this file; the placeholder strings remain the fallback when the variables
	# are not set.
	consumerKey <- Sys.getenv("TWITTER_CONSUMER_KEY", unset = "*******")
	consumerSecret <- Sys.getenv("TWITTER_CONSUMER_SECRET", unset = "*******")
	twitCred <- OAuthFactory$new(consumerKey = consumerKey,
	                             consumerSecret = consumerSecret,
	                             requestURL = requestURL,
	                             accessURL = accessURL,
	                             authURL = authURL)
	# Fetch the CA certificate bundle over HTTPS: a bundle obtained over plain
	# HTTP could be tampered with in transit, which would undermine every TLS
	# check that later relies on it.
	download.file(url = "https://curl.se/ca/cacert.pem",
	              destfile = "cacert.pem")
	twitCred$handshake(cainfo = "cacert.pem")
	registerTwitterOAuth(twitCred)

	# tweets with drinks
	# reduced sample size to take it easy on ol' bessie
	wine_tweets <- searchTwitter("wine", n = 50, lang = "en", cainfo = "cacert.pem")
	beer_tweets <- searchTwitter("beer", n = 50, lang = "en", cainfo = "cacert.pem")
	cofe_tweets <- searchTwitter("coffee", n = 50, lang = "en", cainfo = "cacert.pem")
	soda_tweets <- searchTwitter("soda", n = 50, lang = "en", cainfo = "cacert.pem")

	# get text
	# vapply() always yields a character vector, even for an empty result list
	# (sapply() would return list() there and turn `drinks` into a list).
	tweet_text <- function(tweets) {
	  vapply(tweets, function(x) x$getText(), character(1))
	}
	wine_txt <- tweet_text(wine_tweets)
	beer_txt <- tweet_text(beer_tweets)
	cofe_txt <- tweet_text(cofe_tweets)
	soda_txt <- tweet_text(soda_tweets)

	# how many tweets of each drink
	nd <- c(length(wine_txt), length(beer_txt), length(cofe_txt), length(soda_txt))

	# join texts
	drinks <- c(wine_txt, beer_txt, cofe_txt, soda_txt)

	# strip joined texts of non-standard characters
	# Keeps `drinks` a plain character vector (which score.sentiment() expects)
	# and drops every byte that cannot be represented in ASCII.
	# NOTE(review): assumes the tweet text is UTF-8 encoded -- confirm.
	drinks <- iconv(drinks, from = "UTF-8", to = "ASCII", sub = "")

	# function score.sentiment
	#
	# Scores every element of `sentences` as
	#   (number of its words found in pos.words) -
	#   (number of its words found in neg.words)
	#
	# Parameters
	# sentences: vector of text to score
	# pos.words: vector of words of positive sentiment
	# neg.words: vector of words of negative sentiment
	# .progress: passed to laply() to control the progress bar
	#
	# Returns a data frame with one row per element of `sentences`:
	#   text:  the input text
	#   score: positive matches minus negative matches
	score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none')
	{
	  # lower-case one string, giving NA instead of stopping when tolower()
	  # signals an error; tolower() is evaluated only once
	  try_tolower <- function(x) {
	    tryCatch(tolower(x), error = function(e) NA_character_)
	  }

	  # score a single piece of text
	  score_one <- function(sentence, pos.words, neg.words) {
	    # remove punctuation
	    sentence <- gsub("[[:punct:]]", "", sentence)
	    # remove control characters
	    sentence <- gsub("[[:cntrl:]]", "", sentence)
	    # remove digits
	    sentence <- gsub("\\d+", "", sentence)

	    sentence <- vapply(sentence, try_tolower, character(1), USE.NAMES = FALSE)

	    # split sentence into words with str_split (stringr package)
	    words <- unlist(str_split(sentence, "\\s+"))
	    # Leading/trailing whitespace produces "" tokens and a failed tolower()
	    # produces NA; neither is a word, and "" would otherwise match any blank
	    # line present in a dictionary.
	    words <- words[!is.na(words) & nzchar(words)]

	    # count dictionary hits; %in% gives the TRUE/FALSE we want directly
	    sum(words %in% pos.words) - sum(words %in% neg.words)
	  }

	  # create simple array of scores with laply
	  scores <- laply(sentences, score_one, pos.words, neg.words,
	                  .progress = .progress)

	  # data frame with scores for each sentence
	  data.frame(text = sentences, score = scores)
	}


	# score every collected tweet (text progress bar while it runs)
	scores <- score.sentiment(drinks, pos, neg, .progress = "text")

	# label each row with its drink: labels are repeated by the per-drink
	# tweet counts held in nd
	scores$drink <- factor(rep(c("wine", "beer", "coffee", "soda"), times = nd))
	# 1/0 indicators for scores of at least +2 / at most -2
	scores$very.pos <- as.numeric(scores$score >= 2)
	scores$very.neg <- as.numeric(scores$score <= -2)

	# totals of very positive and very negative tweets
	numpos <- sum(scores$very.pos)
	numneg <- sum(scores$very.neg)

	# fill colour per drink, keyed by drink name
	cols <- c(beer = "#7CAE00",
	          coffee = "#00BFC4",
	          soda = "#F8766D",
	          wine = "#C77CFF")

	# boxplot of scores per drink with the individual tweets jittered on top
	# (title set with labs() rather than opts())
	ggplot(scores, aes(x = drink, y = score, group = drink)) +
	  geom_boxplot(aes(fill = drink)) +
	  scale_fill_manual(values = cols) +
	  geom_jitter(colour = "gray40",
	              position = position_jitter(width = 0.2),
	              alpha = 0.3) +
	  labs(title = "Boxplot - Drink's Sentiment Scores")

	# Bar charts of per-drink summaries.
	# Notes shared by the three plots below:
	#  - stat = "identity" makes geom_bar() draw the precomputed y values; its
	#    default stat counts rows and rejects a mapped y aesthetic.
	#  - opts() is no longer part of ggplot2; the title goes through labs() and
	#    the legend setting through theme().
	#  - `cols` is a named vector, so scale_fill_manual() pairs colours with
	#    drinks by name and no manual reordering is required.

	# average score
	meanscore <- tapply(scores$score, scores$drink, mean)
	df <- data.frame(drink = names(meanscore), meanscore = meanscore)
	# order the bars by ascending mean score
	df$drinks <- reorder(df$drink, df$meanscore)
	# plot
	ggplot(df, aes(y = meanscore)) +
	  geom_bar(aes(x = drinks, fill = drinks), stat = "identity") +
	  scale_fill_manual(values = cols) +
	  labs(title = "Average Sentiment Score") +
	  theme(legend.position = "none")

	# average very positive
	drink_pos <- ddply(scores, .(drink), summarise, mean_pos = mean(very.pos))
	drink_pos$drinks <- reorder(drink_pos$drink, drink_pos$mean_pos)
	# plot
	ggplot(drink_pos, aes(y = mean_pos)) +
	  geom_bar(aes(x = drinks, fill = drinks), stat = "identity") +
	  scale_fill_manual(values = cols) +
	  labs(title = "Average Very Positive Sentiment Score") +
	  theme(legend.position = "none")

	# average very negative
	drink_neg <- ddply(scores, .(drink), summarise, mean_neg = mean(very.neg))
	drink_neg$drinks <- reorder(drink_neg$drink, drink_neg$mean_neg)
	# plot
	ggplot(drink_neg, aes(y = mean_neg)) +
	  geom_bar(aes(x = drinks, fill = drinks), stat = "identity") +
	  scale_fill_manual(values = cols) +
	  labs(title = "Average Very Negative Sentiment Score") +
	  theme(legend.position = "none")