# mkulakowski2/sentiment_analysis_hotel.r

## sentiment_analysis_hotel.r
# load twitter library
# NOTE(review): no authentication call is visible in this script; presumably
# the Twitter session is authorised before it runs -- confirm.
library(twitteR)

# search for up to 1500 tweets mentioning @hilton
hilton.tweets <- searchTwitter('@hilton', n = 1500)

# the inspection below indexes the first tweet, so stop with a clear message
# when the search came back empty instead of a "subscript out of bounds" error
if (length(hilton.tweets) == 0) {
  stop("searchTwitter('@hilton') returned no tweets", call. = FALSE)
}

# quick look at what came back: size and class of the result, then the
# author and text of the first tweet
length(hilton.tweets)
class(hilton.tweets)
tweet <- hilton.tweets[[1]]
class(tweet)
tweet$getScreenName()
tweet$getText()

# pull the text out of every tweet object into one character vector
library("plyr")
hilton.text <- laply(hilton.tweets, function(t) t$getText())
length(hilton.text)
head(hilton.text, 5)

# load list of positive and negative words for SIMPLE sentiment analysis.
# The two word-list files have to be downloaded beforehand and placed in the
# directory named here; change this one path to point at your own copy.
lexicon.dir <- '/Users/marcinkulakowski/r/hotel'
# comment.char = ';' makes scan() ignore everything after a ';' on a line
hu.liu.pos <- scan(file.path(lexicon.dir, 'positive-words.txt'),
                   what = 'character', comment.char = ';')
hu.liu.neg <- scan(file.path(lexicon.dir, 'negative-words.txt'),
                   what = 'character', comment.char = ';')

# extend both lists with a few extra terms
pos.words <- c(hu.liu.pos, 'upgrade')
neg.words <- c(hu.liu.neg, 'wtf', 'wait', 'waiting', 'epicfail', 'mechanical')

# sampling: three hand-written sentences used to sanity-check the scorer.
# The words must be separated by spaces -- the scorer splits on whitespace, so
# a sentence written without spaces is a single token and always scores 0.
# (The variable shares its name with base::sample(); the name is kept because
# the scoring call further down refers to it.)
sample <- c(
  "You're awesome and I love you",
  "I hate and hate and hate. So angry. Die!",
  "Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity."
)

# Score sentences with a simple word-count sentiment measure.
#
# Each sentence is cleaned (punctuation and digits removed, control characters
# turned into spaces, lower-cased), split on whitespace, and scored as
#   (number of words found in pos.words) - (number of words found in neg.words).
#
# Arguments:
#   sentences  vector (or list) of sentences to score
#   pos.words  character vector of positive terms
#   neg.words  character vector of negative terms
#   .progress  progress-bar name handed to plyr::laply(), e.g. 'none' or 'text'
#
# Returns a data.frame with one row per sentence and the columns
#   score  the sentiment score
#   text   the original, uncleaned sentence
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  # require() only returns FALSE when a package is missing, which shows up
  # later as "could not find function"; fail immediately with a clear message.
  for (pkg in c('plyr', 'stringr')) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop("score.sentiment() needs the '", pkg, "' package to be installed",
           call. = FALSE)
    }
  }

  score.one <- function(sentence) {
    # drop byte sequences that are not valid UTF-8; gsub() and tolower()
    # raise an error on such input and would abort the whole run
    sentence <- iconv(enc2utf8(as.character(sentence)),
                      from = 'UTF-8', to = 'UTF-8', sub = '')

    # clean up sentences with R's regex-driven global substitute, gsub():
    sentence <- gsub('[[:punct:]]', '', sentence)
    # newlines and tabs are control characters: replace them with a space so
    # that words separated only by a line break are not glued together
    sentence <- gsub('[[:cntrl:]]', ' ', sentence)
    sentence <- gsub('\\d+', '', sentence)
    # and convert to lower case:
    sentence <- tolower(sentence)

    # split into words; str_split() returns a list, so flatten it
    words <- unlist(stringr::str_split(sentence, '\\s+'))

    # %in% gives TRUE/FALSE per word, and sum() counts the TRUEs
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }

  # we got a vector of sentences and want a simple array of scores back:
  # "l" (list/vector in) + "a" (array out) + "ply" = laply
  scores <- plyr::laply(sentences, score.one, .progress = .progress)

  data.frame(score = scores, text = sentences)
}

# sanity-check the scorer on the hand-written sample sentences
result <- score.sentiment(sentences = sample,
                          pos.words = pos.words,
                          neg.words = neg.words)
class(result)
result$score

# score every Hilton tweet (with a text progress bar), then tag each row with
# the hotel name and its two-letter code
hilton.scores <- score.sentiment(sentences = hilton.text,
                                 pos.words = pos.words,
                                 neg.words = neg.words,
                                 .progress = 'text')
hilton.scores$hotel <- 'Hilton'
hilton.scores$code <- 'HL'

# hilton histogram, base graphics
hist(hilton.scores$score)

# the same distribution drawn with ggplot2's qplot
library(ggplot2)
qplot(hilton.scores$score)

# next: run the same search and scoring for the other major hotels
# Repeat the Hilton steps (search, extract text, score, label) for the other
# major hotel brands. Two small helpers hold the steps that are identical for
# every brand.

# Collect the text of every tweet in a searchTwitter() result.
# An empty result gives an empty character vector.
tweet.texts <- function(tweets) {
  if (length(tweets) == 0) {
    return(character(0))
  }
  laply(tweets, function(t) t$getText())
}

# Score one brand's tweet texts and tag every row with the brand name and its
# two-letter code. rep() keeps the labelling valid when no tweets were found:
# assigning a single value to a column of a zero-row data frame is an error.
score.hotel <- function(text, hotel, code) {
  scores <- score.sentiment(text, pos.words, neg.words, .progress = 'text')
  scores$hotel <- rep(hotel, nrow(scores))
  scores$code <- rep(code, nrow(scores))
  scores
}

# Intercontinental
intercontinental.tweets <- searchTwitter('@intercontinental', n = 1500)
length(intercontinental.tweets)  # how many tweets this search returned
intercontinental.text <- tweet.texts(intercontinental.tweets)
intercontinental.scores <- score.hotel(intercontinental.text, 'Intercontinental', 'IC')

# Wyndham
wyndham.tweets <- searchTwitter('@wyndham', n = 1500)
length(wyndham.tweets)
wyndham.text <- tweet.texts(wyndham.tweets)
wyndham.scores <- score.hotel(wyndham.text, 'Wyndham', 'WY')

# Marriott
marriott.tweets <- searchTwitter('@marriott', n = 1500)
length(marriott.tweets)
marriott.text <- tweet.texts(marriott.tweets)
marriott.scores <- score.hotel(marriott.text, 'Marriott', 'MI')

# BestWestern
bestwestern.tweets <- searchTwitter('@bestwestern', n = 1500)
length(bestwestern.tweets)
bestwestern.text <- tweet.texts(bestwestern.tweets)
bestwestern.scores <- score.hotel(bestwestern.text, 'Bestwestern', 'BW')

# Starwood
starwood.tweets <- searchTwitter('@starwood', n = 1500)
length(starwood.tweets)
starwood.text <- tweet.texts(starwood.tweets)
starwood.scores <- score.hotel(starwood.text, 'Starwood', 'SW')

# Hyatt
hyatt.tweets <- searchTwitter('@hyatt', n = 1500)
length(hyatt.tweets)
hyatt.text <- tweet.texts(hyatt.tweets)
hyatt.scores <- score.hotel(hyatt.text, 'Hyatt', 'HY')

# stack the per-hotel score tables into one data frame; every row already
# carries its hotel name and code
all.scores <- rbind(intercontinental.scores, wyndham.scores, hilton.scores,
                    marriott.scores, bestwestern.scores, starwood.scores,
                    hyatt.scores)

# Make separate plot for each hotel.
# binwidth is a binning parameter, so the histogram geom is used: geom_bar()
# in current ggplot2 counts distinct x values and does not accept binwidth.
ggplot(data = all.scores) +  # ggplot works on data.frames, always
  geom_histogram(mapping = aes(x = score, fill = hotel), binwidth = 1) +
  facet_grid(hotel ~ .) +  # make a separate plot for each hotel
  theme_bw() + scale_fill_brewer()  # plain display, nicer colors

# Plot
# flag the strongly positive and strongly negative tweets
all.scores$very.pos <- as.numeric(all.scores$score >= 2)
# NOTE(review): the comparison on this line was lost in the original source;
# `<= -2` mirrors the `>= 2` cut-off used for very.pos -- confirm.
all.scores$very.neg <- as.numeric(all.scores$score <= -2)

# count both kinds of tweet per hotel
twitter.df <- ddply(all.scores, c('hotel', 'code'), summarise,
                    pos.count = sum(very.pos), neg.count = sum(very.neg))

# score = percentage of the strongly opinionated tweets that are positive
twitter.df$all.count <- twitter.df$pos.count + twitter.df$neg.count
twitter.df$score <- round(100 * twitter.df$pos.count / twitter.df$all.count)

# install doBy only when it is missing instead of on every run
if (!requireNamespace("doBy", quietly = TRUE)) {
  install.packages("doBy")
}
library("doBy")

# print the hotels ordered by descending score
orderBy(~-score, twitter.df)

# columns of the table printed above:
# hotel code pos.count neg.count all.count score

# install XML only when it is missing instead of on every run
if (!requireNamespace("XML", quietly = TRUE)) {
  install.packages("XML")
}
library(XML)
acsi.url <- 'http://www.theacsi.org/index.php?option=com_content&view=article&id=147&catid=&Itemid=212&i=Hotels'

# scrape acsi website for scores: read the first HTML table on the page
acsi.df <- readHTMLTable(acsi.url, header = TRUE, which = 1,
                         stringsAsFactors = FALSE)
# keep only the first and the 18th column
# NOTE(review): presumably the hotel name and the most recent score column --
# verify against the current page layout
acsi.df <- acsi.df[, c(1, 18)]
head(acsi.df, 1)
colnames(acsi.df) <- c('hotel', 'score')
acsi.df$score <- as.numeric(acsi.df$score)
# View() opens a data viewer, so only call it in an interactive session
if (interactive()) {
  View(acsi.df)
}

# hand-assigned codes, one per scraped row, matching the codes used for the
# Twitter scores; the string 'NA' marks rows with no Twitter counterpart.
# This assumes the scraped table has exactly 13 rows.
acsi.df$code <- c('HL', 'SW', 'MI', 'NA', 'HY', 'NA', 'IC', 'BW', 'NA', 'WY', 'NA', 'NA', 'NA')

# join the Twitter and ACSI scores on the hotel code, then keep only hotels
# with more than 100 strongly opinionated tweets. The merge must not be
# repeated after the subset, or the filter is thrown away.
compare.df <- merge(twitter.df, acsi.df, by = 'code',
                    suffixes = c('.twitter', '.acsi'))
compare.df <- subset(compare.df, all.count > 100)

if (interactive()) {
  View(compare.df)
}

# scatter plot of Twitter score against ACSI score with a linear fit.
# theme() replaces opts(), which is no longer part of ggplot2.
ggplot(compare.df) +
  geom_point(aes(x = score.twitter, y = score.acsi, color = hotel.twitter),
             size = 6) +
  geom_smooth(aes(x = score.twitter, y = score.acsi, group = 1),
              se = FALSE, method = "lm") +
  theme_bw() +
  theme(legend.position = c(0.85, 0.85))
	# NOTE: what used to follow here was a verbatim, tab-indented second copy of
	# the whole script above (library loading, score.sentiment(), the per-hotel
	# Twitter searches, the summary table and the ACSI comparison). Running it
	# would repeat every Twitter search, package installation and plot and
	# redefine the same objects with freshly fetched data, so the copy has been
	# removed. score.sentiment() and every variable it defined are still
	# defined by the first copy of the script.