Created
March 9, 2016 18:36
-
-
Save sriyoda/aec02c9c32936432ecf3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the opinion lexicons: one word per entry. Lines starting with ';'
# in the lexicon files are treated as comments by scan() and skipped.
# Both files are looked up relative to the current working directory.
pos.words <- scan(
  file.path('opinion-lexicon-English', 'positive-words.txt'),
  what = 'character',
  comment.char = ';'
)
neg.words <- scan(
  file.path('opinion-lexicon-English', 'negative-words.txt'),
  what = 'character',
  comment.char = ';'
)
# Score each sentence by counting lexicon hits.
#
# Arguments:
#   sentences  vector (or list) of text snippets, one score is produced per
#              element.
#   pos.words  character vector of positive lexicon words.
#   neg.words  character vector of negative lexicon words.
#   .progress  name of a plyr progress bar. With the default 'none' the
#              scoring runs on base R only; any other value is forwarded to
#              plyr::laply(), which then must be installed.
#
# Returns:
#   A data.frame with columns `score` (positive hits minus negative hits)
#   and `text` (the input sentences, unchanged).
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  # Score a single sentence: normalise the text, tokenise, count matches.
  score_one <- function(sentence) {
    cleaned <- gsub('[[:punct:]]', '', sentence)
    # Control characters include newlines and tabs, which separate words;
    # turn them into a space so neighbouring words are not glued together.
    cleaned <- gsub('[[:cntrl:]]', ' ', cleaned)
    cleaned <- gsub('\\d+', '', cleaned)
    cleaned <- tolower(cleaned)

    # strsplit() returns a list; flatten it to a plain character vector.
    words <- unlist(strsplit(cleaned, '\\s+'))

    # %in% never yields NA, and sum() counts TRUE as 1.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }

  if (identical(.progress, 'none')) {
    # USE.NAMES = FALSE keeps the sentences from becoming row names of the
    # result (duplicated sentences would otherwise break data.frame()).
    scores <- vapply(sentences, score_one, integer(1), USE.NAMES = FALSE)
  } else {
    if (!requireNamespace('plyr', quietly = TRUE)) {
      stop("package 'plyr' is required when .progress is not 'none'",
           call. = FALSE)
    }
    scores <- plyr::laply(sentences, score_one, .progress = .progress)
  }

  data.frame(score = scores, text = sentences)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment