# bradlindblad/emotions.R

## emotions.R

library(dplyr)
library(ggplot2)
library(tidytext)
library(tm)
library(wordcloud)


# input: a dataframe called raw, in this case the text col is biggest_concern, and X is the ID col


# Build a one-word-per-row table (columns: ID, word) from the survey text.
# Input: a data frame `raw` whose free-text column is `biggest_concern` and
# whose respondent ID column is `X`.
mydata <- dplyr::select(raw, biggest_concern, X)

# Split each response on whitespace so that every word becomes its own record.
# - as.character() keeps scan(text = ) working if the text column is a factor.
# - quote = "" stops scan() from treating a token that starts with ' or " as
#   a quoted string, which would glue several words into a single record.
# - quiet = TRUE suppresses the "Read N items" line printed per respondent.
mydata <- stack(tapply(
  as.character(mydata$biggest_concern),
  mydata$X,
  function(x) scan(text = x, what = "", quote = "", quiet = TRUE)
)) %>%
  dplyr::select(ID = ind, word = values)

# Normalise the tokens.
# Lower-case first so capitalised stop words ("The") match the lower-case
# `stop_words` lexicon; the stop-word join runs before punctuation is touched
# so contractions such as "don't" can still match it.
# Punctuation is then replaced by a space (base gsub(); stringr is not attached
# by this script) and the padding trimmed, so "cost," becomes "cost" rather
# than "cost ". Tokens that consisted only of punctuation are dropped.
mydata <- mydata %>%
  dplyr::mutate(word = tolower(word)) %>%
  dplyr::anti_join(stop_words, by = "word") %>%
  dplyr::mutate(word = trimws(gsub("[[:punct:]]", " ", word))) %>%
  dplyr::filter(word != "")


# Share of each NRC emotion category among all emotion-word matches.
# Steps: re-tokenise `word`, drop stop words and tokens containing a digit,
# keep only tokens present in the NRC lexicon, discard the two polarity
# categories ("negative", "positive"), then count the matches per remaining
# category and express each count as a rounded percentage of the total.
emotions <- mydata %>%
  unnest_tokens(word, word) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!grepl("[0-9]", word)) %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(!sentiment %in% c("negative", "positive")) %>%
  group_by(sentiment) %>%
  summarise(freq = n()) %>%
  transmute(sentiment, percent = round(freq / sum(freq) * 100)) %>%
  ungroup()


# Per-emotion summary used by the plot below.
# `emotions` is built with one row per sentiment, so overall_mean is simply
# that row's percent and sd() of a single value is NA.
overall_mean_sd <- emotions %>%
  group_by(sentiment) %>%
  summarize(overall_mean = mean(percent), sd = sd(percent))

### draw a bar graph with error bars
# NOTE(review): `percent` is a share of emotion-word matches, not of
# responses; confirm that the y-axis label says what is intended.
ggplot(
  overall_mean_sd,
  aes(x = reorder(sentiment, -overall_mean), y = overall_mean)
) +
  geom_col(fill = "darkgreen", alpha = 0.7) +
  # na.rm = TRUE: with a single row per sentiment every sd is NA, so the bars
  # are dropped silently instead of raising a "removed rows" warning. They
  # will appear as soon as the summary has a real spread to show.
  geom_errorbar(
    aes(ymin = overall_mean - sd, ymax = overall_mean + sd),
    width = 0.2,
    position = position_dodge(.9),
    na.rm = TRUE
  ) +
  xlab("Emotion") +
  ylab("Emotion expressed in % of responses") +
  # ggtitle("Emotion words expressed in Mr. Buffett's \n annual shareholder letters (1977 – 2016)") +
  coord_flip() +
  # theme_minimal() is a complete theme and resets any theme() settings added
  # before it, so it has to come first for the axis-text tweak to apply.
  # NOTE(review): with coord_flip() this rotation presumably lands on the
  # horizontal (percentage) axis; drop it if rotated tick labels are unwanted.
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## polarity_wordcloud.R


library(dplyr)
library(tidytext)
library(tm)
library(wordcloud)


# input: a dataframe called raw, in this case the text col is biggest_concern, and X is the ID col


# Build a one-word-per-row table (columns: ID, word) from the survey text.
# Input: a data frame `raw` whose free-text column is `biggest_concern` and
# whose respondent ID column is `X`.
mydata <- dplyr::select(raw, biggest_concern, X)

# Split each response on whitespace so that every word becomes its own record.
# - as.character() keeps scan(text = ) working if the text column is a factor.
# - quote = "" stops scan() from treating a token that starts with ' or " as
#   a quoted string, which would glue several words into a single record.
# - quiet = TRUE suppresses the "Read N items" line printed per respondent.
mydata <- stack(tapply(
  as.character(mydata$biggest_concern),
  mydata$X,
  function(x) scan(text = x, what = "", quote = "", quiet = TRUE)
)) %>%
  dplyr::select(ID = ind, word = values)

# Normalise the tokens.
# Lower-case first so capitalised stop words ("The") match the lower-case
# `stop_words` lexicon; the stop-word join runs before punctuation is touched
# so contractions such as "don't" can still match it.
# Punctuation is then replaced by a space (base gsub(); stringr is not attached
# by this script) and the padding trimmed, so "cost," becomes "cost" rather
# than "cost ". Tokens that consisted only of punctuation are dropped.
mydata <- mydata %>%
  dplyr::mutate(word = tolower(word)) %>%
  dplyr::anti_join(stop_words, by = "word") %>%
  dplyr::mutate(word = trimws(gsub("[[:punct:]]", " ", word))) %>%
  dplyr::filter(word != "")


# Polarity word cloud: words found in the Bing lexicon, split by sentiment.
# The words are trimmed first because the punctuation clean-up above replaces
# punctuation with a space, and a padded token such as "concern " would never
# match the lexicon entry "concern". The join key is spelled out so the join
# cannot silently pick up additional shared columns.
# acast() turns the counts into a word x sentiment matrix (0 where a word has
# no count for a sentiment), which is the input comparison.cloud() expects.
# NOTE(review): the colours are matched to the matrix columns in order,
# presumably negative = red and positive = darkgreen; verify on real output.
mydata %>%
  dplyr::mutate(word = trimws(word)) %>%
  dplyr::inner_join(get_sentiments("bing"), by = "word") %>%
  dplyr::count(word, sentiment, sort = TRUE) %>%
  reshape2::acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("red", "darkgreen"),
    max.words = 40
  )


## wordcloud.R

library(dplyr)
library(tidytext)
library(tm)
library(wordcloud)


# input: a dataframe called raw, in this case the text col is biggest_concern, and X is the ID col


# Build a one-word-per-row table (columns: ID, word) from the survey text.
# Input: a data frame `raw` whose free-text column is `biggest_concern` and
# whose respondent ID column is `X`.
mydata <- dplyr::select(raw, biggest_concern, X)

# Split each response on whitespace so that every word becomes its own record.
# - as.character() keeps scan(text = ) working if the text column is a factor.
# - quote = "" stops scan() from treating a token that starts with ' or " as
#   a quoted string, which would glue several words into a single record.
# - quiet = TRUE suppresses the "Read N items" line printed per respondent.
mydata <- stack(tapply(
  as.character(mydata$biggest_concern),
  mydata$X,
  function(x) scan(text = x, what = "", quote = "", quiet = TRUE)
)) %>%
  dplyr::select(ID = ind, word = values)

# Normalise the tokens.
# Lower-case first so capitalised stop words ("The") match the lower-case
# `stop_words` lexicon; the stop-word join runs before punctuation is touched
# so contractions such as "don't" can still match it.
# Punctuation is then replaced by a space (base gsub(); stringr is not attached
# by this script) and the padding trimmed, so "cost," becomes "cost" rather
# than "cost ". Tokens that consisted only of punctuation are dropped.
mydata <- mydata %>%
  dplyr::mutate(word = tolower(word)) %>%
  dplyr::anti_join(stop_words, by = "word") %>%
  dplyr::mutate(word = trimws(gsub("[[:punct:]]", " ", word))) %>%
  dplyr::filter(word != "")


# Frequency word cloud of the cleaned tokens.
# Each token becomes one document of a tm corpus; the corpus is then cleaned
# in a single pipeline: numbers and punctuation removed, whitespace collapsed,
# text lower-cased and English stop words removed.
text <- mydata$word
docs <- Corpus(VectorSource(text)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Term totals: sum every term's row of the term-document matrix and order
# the terms from most to least frequent.
dtm <- TermDocumentMatrix(docs)
matrix <- as.matrix(dtm)
words <- sort(rowSums(matrix), decreasing = TRUE)
df <- data.frame(word = names(words), freq = words)

# Fixed seed so the cloud layout is reproducible between runs.
set.seed(1234)
wordcloud(
  words = df$word,
  freq = df$freq,
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

	library(dplyr)
	library(ggplot2)
	library(tidytext)
	library(tm)
	library(wordcloud)


	# input: a dataframe called raw, in this case the text col is biggest_concern, and X is the ID col


	# Build a one-word-per-row table (columns: ID, word) from the survey text.
	# Input: a data frame `raw` whose free-text column is `biggest_concern` and
	# whose respondent ID column is `X`.
	mydata <- dplyr::select(raw, biggest_concern, X)

	# Split each response on whitespace so that every word becomes its own record.
	# - as.character() keeps scan(text = ) working if the text column is a factor.
	# - quote = "" stops scan() from treating a token that starts with ' or " as
	#   a quoted string, which would glue several words into a single record.
	# - quiet = TRUE suppresses the "Read N items" line printed per respondent.
	mydata <- stack(tapply(
	  as.character(mydata$biggest_concern),
	  mydata$X,
	  function(x) scan(text = x, what = "", quote = "", quiet = TRUE)
	)) %>%
	  dplyr::select(ID = ind, word = values)

	# Normalise the tokens.
	# Lower-case first so capitalised stop words ("The") match the lower-case
	# `stop_words` lexicon; the stop-word join runs before punctuation is touched
	# so contractions such as "don't" can still match it.
	# Punctuation is then replaced by a space (base gsub(); stringr is not attached
	# by this script) and the padding trimmed, so "cost," becomes "cost" rather
	# than "cost ". Tokens that consisted only of punctuation are dropped.
	mydata <- mydata %>%
	  dplyr::mutate(word = tolower(word)) %>%
	  dplyr::anti_join(stop_words, by = "word") %>%
	  dplyr::mutate(word = trimws(gsub("[[:punct:]]", " ", word))) %>%
	  dplyr::filter(word != "")



	# Share of each NRC emotion category among all emotion-word matches.
	# Steps: re-tokenise `word`, drop stop words and tokens containing a digit,
	# attach the NRC sentiment of every token, discard the two polarity
	# categories, then count the matches per remaining category and express each
	# count as a rounded percentage of the total.
	emotions <- mydata %>%
	  unnest_tokens(word, word) %>%
	  anti_join(stop_words, by = "word") %>%
	  filter(!grepl("[0-9]", word)) %>%
	  left_join(get_sentiments("nrc"), by = "word") %>%
	  # Plain `|` here: the previous `\|` was a markdown escape and does not parse
	  # as R. Tokens without an NRC match have an NA sentiment after the left
	  # join; the condition is NA for them, so filter() drops those rows as well.
	  filter(!(sentiment == "negative" | sentiment == "positive")) %>%
	  group_by(sentiment) %>%
	  summarize(freq = n()) %>%
	  mutate(percent = round(freq / sum(freq) * 100)) %>%
	  select(-freq) %>%
	  ungroup()


	# Per-emotion summary used by the plot below.
	# `emotions` is built with one row per sentiment, so overall_mean is simply
	# that row's percent and sd() of a single value is NA.
	overall_mean_sd <- emotions %>%
	  group_by(sentiment) %>%
	  summarize(overall_mean = mean(percent), sd = sd(percent))

	### draw a bar graph with error bars
	# NOTE(review): `percent` is a share of emotion-word matches, not of
	# responses; confirm that the y-axis label says what is intended.
	ggplot(
	  overall_mean_sd,
	  aes(x = reorder(sentiment, -overall_mean), y = overall_mean)
	) +
	  geom_col(fill = "darkgreen", alpha = 0.7) +
	  # na.rm = TRUE: with a single row per sentiment every sd is NA, so the bars
	  # are dropped silently instead of raising a "removed rows" warning. They
	  # will appear as soon as the summary has a real spread to show.
	  geom_errorbar(
	    aes(ymin = overall_mean - sd, ymax = overall_mean + sd),
	    width = 0.2,
	    position = position_dodge(.9),
	    na.rm = TRUE
	  ) +
	  xlab("Emotion") +
	  ylab("Emotion expressed in % of responses") +
	  # ggtitle("Emotion words expressed in Mr. Buffett's \n annual shareholder letters (1977 – 2016)") +
	  coord_flip() +
	  # theme_minimal() is a complete theme and resets any theme() settings added
	  # before it, so it has to come first for the axis-text tweak to apply.
	  # NOTE(review): with coord_flip() this rotation presumably lands on the
	  # horizontal (percentage) axis; drop it if rotated tick labels are unwanted.
	  theme_minimal() +
	  theme(axis.text.x = element_text(angle = 45, hjust = 1))