-
-
Save rtcastellano/52550e34f912328e376436fefed6c62c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
```{r wordcloud}
library(dplyr) | |
library(ggplot2) | |
library(tm) | |
library(RColorBrewer) | |
library(wordcloud) | |
# Load the Amazon Fine Food Reviews data set.
# Source: https://www.kaggle.com/snap/amazon-fine-food-reviews
# Text columns are kept as character vectors rather than factors.
reviews <- read.csv(file = "Reviews.csv", stringsAsFactors = FALSE)
#############################Word clouds#################### | |
# Build a word cloud from one text column of a data frame and save it as a
# PNG file. This code is adapted from:
# http://www.r-bloggers.com/word-cloud-in-r/
#
# Arguments:
#   data         Data frame that holds the text.
#   column       Name (or index) of the column of `data` containing the text.
#   filename     Path of the PNG file to write (1280 x 800 pixels).
#   colorscheme  Name of an RColorBrewer palette; 9 colours are requested and
#                the first two are dropped before plotting.
#   extraRemove  Optional character vector of additional words that should not
#                appear in the cloud (common English stopwords are always
#                removed). Matched case-insensitively.
#
# Returns the value of dev.off() for the PNG device, as the original did.
makewordcloud <- function(data, column, filename, colorscheme = "BuGn",
                          extraRemove = NULL) {
  text <- data[[column]]
  if (is.null(text)) {
    stop("Column '", column, "' not found in `data`.", call. = FALSE)
  }

  # VectorSource accepts any character vector. DataframeSource (used
  # previously) requires `doc_id` and `text` columns in tm >= 0.7 and fails
  # on a one-column data frame.
  data.corpus <- Corpus(VectorSource(as.character(text)))

  # The text is lower-cased, so the extra words must be lower-cased as well.
  unwanted <- c(stopwords("english"), tolower(extraRemove))
  drop_words <- content_transformer(function(x) removeWords(x, unwanted))

  data.corpus <- tm_map(data.corpus, content_transformer(tolower))
  # First pass while apostrophes are still present, so that contractions in
  # the stopword list (e.g. "don't") actually match.
  data.corpus <- tm_map(data.corpus, drop_words)
  data.corpus <- tm_map(data.corpus, content_transformer(removePunctuation))
  # Second pass keeps the original behaviour for words that only match once
  # punctuation has been stripped.
  data.corpus <- tm_map(data.corpus, drop_words)

  # Term frequencies across all documents, most frequent first.
  tdm <- TermDocumentMatrix(data.corpus)
  m <- as.matrix(tdm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v, stringsAsFactors = FALSE)
  if (nrow(d) == 0) {
    # Fail before opening the device so no empty image file is left behind.
    stop("No terms left to plot after cleaning the text.", call. = FALSE)
  }

  pal <- brewer.pal(9, colorscheme)
  pal <- pal[-(1:2)]

  png(filename, width = 1280, height = 800)
  device <- dev.cur()
  # Make sure the PNG device is closed even if wordcloud() fails; the check
  # avoids closing some other device after the normal dev.off() below.
  on.exit(if (device %in% dev.list()) dev.off(device), add = TRUE)

  wordcloud(d$word, d$freq, scale = c(8, .3), min.freq = 2, max.words = 100,
            random.order = TRUE, rot.per = .15, colors = pal,
            vfont = c("sans serif", "plain"))
  dev.off(device)
}
# Split the reviews by score: above 3 counts as positive, below 3 as negative
# (reviews with a score of exactly 3 are in neither group).
positive <- filter(reviews, Score > 3)
negative <- filter(reviews, Score < 3)

# Sample at most 35000 rows from the positive and negative reviews. The
# wordcloud function cannot handle the entire positive and negative data
# frames. The size is capped at the number of available rows because sampling
# without replacement fails when asked for more rows than exist.
sample_size <- 35000
positive_rows <- sample.int(nrow(positive), min(sample_size, nrow(positive)))
negative_rows <- sample.int(nrow(negative), min(sample_size, nrow(negative)))
positivesample <- positive[positive_rows, ]
negativesample <- negative[negative_rows, ]

# Make the word clouds, one PNG file per sentiment.
makewordcloud(data = positivesample, column = "Text",
              filename = "positiveSampleText.png", colorscheme = "Greens")
makewordcloud(data = negativesample, column = "Text",
              filename = "negativeSampleText.png", colorscheme = "OrRd")
```
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment