Skip to content

Instantly share code, notes, and snippets.

@bradlindblad
Last active November 20, 2020 18:37
Show Gist options
  • Save bradlindblad/313fbee70dd3dbae07a7c9879428d818 to your computer and use it in GitHub Desktop.
Save bradlindblad/313fbee70dd3dbae07a7c9879428d818 to your computer and use it in GitHub Desktop.
R NLP
library(dplyr)
library(ggplot2)
library(tidytext)
library(tm)
library(wordcloud)
# Emotion bar chart ----
# Input: a data frame called `raw`; the free-text column is `biggest_concern`
# and `X` is the respondent ID column.

# One row per (respondent, word). unnest_tokens() lower-cases the text and
# strips punctuation while tokenising, so stop words written as "The" or
# "and," are still matched by the anti_join() below, and apostrophes or
# quotes inside a response cannot break the split.
mydata <- raw %>%
  dplyr::select(ID = X, biggest_concern) %>%
  dplyr::mutate(biggest_concern = as.character(biggest_concern)) %>%
  unnest_tokens(word, biggest_concern) %>%
  dplyr::filter(!is.na(word)) %>%
  anti_join(stop_words, by = "word")

# Share of each NRC emotion within every response. Tokens containing digits
# are dropped, and the two polarity labels ("negative"/"positive") are removed
# so only the emotion categories remain. A word can carry several emotions,
# so the join intentionally produces more than one row per word.
emotions <- mydata %>%
  dplyr::filter(!grepl("[0-9]", word)) %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  dplyr::filter(!sentiment %in% c("negative", "positive")) %>%
  dplyr::count(ID, sentiment, name = "freq") %>%
  group_by(ID) %>%
  mutate(percent = round(freq / sum(freq) * 100)) %>%
  ungroup() %>%
  dplyr::select(-freq)

# Mean and spread of those per-response percentages for each emotion.
# Summarising across responses (rather than over a single pooled row per
# emotion) is what makes sd() defined; it is still NA for an emotion that
# occurs in only one response, and that emotion then gets no error bar.
overall_mean_sd <- emotions %>%
  group_by(sentiment) %>%
  summarize(overall_mean = mean(percent), sd = sd(percent))

### draw a bar graph with error bars
ggplot(
  overall_mean_sd,
  aes(x = reorder(sentiment, -overall_mean), y = overall_mean)
) +
  geom_col(fill = "darkgreen", alpha = 0.7) +
  geom_errorbar(
    aes(ymin = overall_mean - sd, ymax = overall_mean + sd),
    width = 0.2,
    position = position_dodge(.9)
  ) +
  xlab("Emotion") +
  ylab("Emotion expressed in % of responses") +
  # ggtitle("Emotion words expressed in Mr. Buffett's \n annual shareholder letters (1977 – 2016)") +
  coord_flip() +
  # theme_minimal() is a complete theme: it must come before theme(), or it
  # would reset the axis-text adjustment made there.
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(dplyr)
library(tidytext)
library(tm)
library(wordcloud)
# Positive vs. negative comparison cloud ----
# Input: a data frame called `raw`; the free-text column is `biggest_concern`
# and `X` is the respondent ID column.

# One row per (respondent, word). unnest_tokens() lower-cases the text and
# strips punctuation while tokenising, so tokens come out clean ("bad", not
# "bad." or "bad ") and can match both the stop-word list and the sentiment
# lexicon. Apostrophes or quotes inside a response cannot break the split.
mydata <- raw %>%
  dplyr::select(ID = X, biggest_concern) %>%
  dplyr::mutate(biggest_concern = as.character(biggest_concern)) %>%
  unnest_tokens(word, biggest_concern) %>%
  dplyr::filter(!is.na(word)) %>%
  anti_join(stop_words, by = "word")

# Count every word that appears in the bing lexicon, spread the counts into a
# word x sentiment matrix (0 where a word never has that polarity) and draw
# the cloud. acast() orders the columns alphabetically (negative, positive),
# so "red" is used for negative words and "darkgreen" for positive ones.
mydata %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  dplyr::count(word, sentiment, sort = TRUE) %>%
  reshape2::acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("red", "darkgreen"),
    max.words = 40
  )
library(dplyr)
library(tidytext)
library(tm)
library(wordcloud)
# Word-frequency cloud ----
# Input: a data frame called `raw`; the free-text column is `biggest_concern`
# and `X` is the respondent ID column.

# One row per (respondent, word). unnest_tokens() lower-cases the text and
# strips punctuation while tokenising, so stop words written as "The" or
# "and," are still matched by the anti_join() below, and apostrophes or
# quotes inside a response cannot break the split.
mydata <- raw %>%
  dplyr::select(ID = X, biggest_concern) %>%
  dplyr::mutate(biggest_concern = as.character(biggest_concern)) %>%
  unnest_tokens(word, biggest_concern) %>%
  dplyr::filter(!is.na(word)) %>%
  anti_join(stop_words, by = "word")

# Re-assemble the surviving words into one document per respondent. Total
# term counts are the same as with one document per word, but the dense
# matrix built below is terms x respondents instead of terms x tokens.
text <- unname(
  vapply(split(mydata$word, mydata$ID), paste, character(1), collapse = " ")
)

# tm clean-up. No tolower step is needed: the tokens are already lower-case.
docs <- Corpus(VectorSource(text))
docs <- docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
docs <- tm_map(docs, removeWords, stopwords("english"))

# Overall frequency of each term, most frequent first.
dtm <- TermDocumentMatrix(docs)
term_matrix <- as.matrix(dtm)
word_freq <- sort(rowSums(term_matrix), decreasing = TRUE)
df <- data.frame(word = names(word_freq), freq = word_freq)

set.seed(1234) # for reproducibility
wordcloud(
  words = df$word,
  freq = df$freq,
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment