Created
November 7, 2017 00:09
-
-
Save rer145/0f41f4d8f5b451f02148da44c563eb05 to your computer and use it in GitHub Desktop.
How to match word sentiments to words with R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr)
library(tidytext)
library(gutenbergr)
library(ggplot2)

# Sentiment arc of a novel ----
# Downloads a Project Gutenberg text, splits it into words, scores each word
# against the bing and AFINN lexicons, sums the scores over fixed-size chunks
# of lines, and plots both series on one chart.

# Get our data: Project Gutenberg book 345, the text of Dracula
dracula <- gutenberg_download(345)

# Fail early with a clear message if nothing came back from the download
if (is.null(dracula) || nrow(dracula) == 0L) {
  stop("gutenberg_download(345) returned no text", call. = FALSE)
}

# Remove the gutenberg_id field since we don't need it
dracula$gutenberg_id <- NULL

# Give each row a line number. The count is taken from the data itself, so
# the script keeps working if the length of the hosted text changes.
dracula$line <- seq_len(nrow(dracula))

# Split each line of text into words (one row per word; `line` is kept)
words <- dracula %>%
  unnest_tokens(word, text)

# Typically, we want to analyze the sentiment in chunks of 80 lines.
# Integer division (%/%) maps each line number to its chunk index.
words$grouping <- words$line %/% 80

# The tidytext package has different sentiments available; here we are using
# bing and afinn.
bing <- get_sentiments("bing")
afinn <- get_sentiments("afinn")

# Recent tidytext/textdata releases name the AFINN column `value`, while older
# ones call it `score`. Normalise to `score` so the code below runs on either.
names(afinn)[names(afinn) == "value"] <- "score"

# Now create two different data frames that join the sentiment with the words.
# The key is spelled out so the join does not depend on column-name guessing.
words_afinn <- inner_join(words, afinn, by = "word")
words_bing <- inner_join(words, bing, by = "word")

# With the bing sentiment, we assign -1 to negative words and +1 to the rest.
# afinn already has a score included for the severity of the word.
# %in% never yields NA, so every row gets a definite score.
words_bing$score <- ifelse(words_bing$sentiment %in% "negative", -1, 1)

# Summarize each grouping of text to see how positive or negative it is
sent_afinn <- words_afinn %>%
  group_by(grouping) %>%
  summarize(value = sum(score))

sent_bing <- words_bing %>%
  group_by(grouping) %>%
  summarize(value = sum(score))

# Optionally, plot each data frame to compare the scoring between them
# (orange = AFINN, black = bing)
ggplot() +
  geom_line(data = sent_afinn, aes(x = grouping, y = value), color = "orange") +
  geom_line(data = sent_bing, aes(x = grouping, y = value), color = "black")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment