## trinker/quanteda_wordcloud.R

## quanteda_wordcloud.R
## Load dependencies
library(quanteda)
library(sentimentr)
library(tidyverse)
library(lexicon)
## Debate transcripts bundled with the sentimentr package
dat <- presidential_debates_2012
dat

## Corpus whose document text comes from the `dialogue` column
corp <- corpus(dat, text_field = "dialogue")

## Terms to drop before plotting: two word lists (sw_fry_100, function_words)
## plus a handful of debate-specific words
stopwords <- c(
    sw_fry_100,
    function_words,
    "obama", "because", "romney", "going", "our", "president"
)

# Basic word cloud for Romney
# NOTE(review): passing `remove`/`stem` to dfm() on a corpus and calling
# textplot_wordcloud() with only library(quanteda) attached appear to rely on
# the pre-3.0 quanteda API -- confirm against the installed version.
dfmat1 <- dfm(
    corpus_subset(corp, person == "ROMNEY"),  ## keep only documents where person is ROMNEY
    remove = stopwords,  ## drop overly common words
    remove_numbers = TRUE,
    remove_punct = TRUE,
    remove_url = TRUE,
    remove_symbols = TRUE,
    stem = FALSE
) %>%
    dfm_trim(min_termfreq = 8)  ## keep terms that occur at least 8 times

par(xpd = FALSE)  ## clip drawing to the plot region
textplot_wordcloud(dfmat1, color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
## Right-aligned (adj = 1) orange title above the cloud
title("Romney", adj = 1, line = 1, font = 2, col.main = "orange")

# Basic word cloud for Obama
# NOTE(review): passing `remove`/`stem` to dfm() on a corpus and calling
# textplot_wordcloud() with only library(quanteda) attached appear to rely on
# the pre-3.0 quanteda API -- confirm against the installed version.
dfmat2 <- dfm(
    corpus_subset(corp, person == "OBAMA"),  ## keep only documents where person is OBAMA
    remove = stopwords,  ## drop overly common words
    remove_numbers = TRUE,
    remove_punct = TRUE,
    remove_url = TRUE,
    remove_symbols = TRUE,
    stem = FALSE
) %>%
    dfm_trim(min_termfreq = 8)  ## keep terms that occur at least 8 times

dev.new()  ## open a separate graphics device for this cloud
par(xpd = FALSE)  ## clip drawing to the plot region
textplot_wordcloud(dfmat2, color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
## Right-aligned (adj = 1) orange title above the cloud
title("Obama", adj = 1, line = 1, font = 2, col.main = "orange")


## A slightly more involved version: one word cloud per person/time
## combination, with stemming applied along the way.

## Keep only the rows spoken by the two candidates
subset_dat <- filter(dat, person %in% c("ROMNEY", "OBAMA"))

## Lookup table with one row per distinct person/time pair, a plot title for
## each pair, and every column converted to character
subs <- distinct(select(subset_dat, person, time)) %>%
    mutate(
        title = paste(time, person, sep = ": "),
        across(everything(), as.character)
    )

## Corpus restricted to the two candidates, text taken from `dialogue`
corp2 <- corpus(subset_dat, text_field = "dialogue")

## Draw one stemmed word cloud for every person/time pair listed in `subs`
for (i in seq_len(nrow(subs))) {

    dfmat <- dfm(
        ## documents belonging to the i-th person/time pair
        corpus_subset(corp2, person == subs[["person"]][i] & time == subs[["time"]][i]),
        remove = stopwords,  ## drop overly common words
        remove_numbers = TRUE,
        remove_punct = TRUE,
        remove_url = TRUE,
        remove_symbols = TRUE,
        stem = TRUE
    ) %>%
        dfm_trim(min_termfreq = 5)  ## keep terms that occur at least 5 times

    dev.new()  ## open a separate graphics device for each cloud
    par(xpd = FALSE)  ## clip drawing to the plot region
    textplot_wordcloud(dfmat, color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
    title(subs[["title"]][i], adj = 1, line = 1, font = 2, col.main = "orange")
}
	## Load dependencies
	library(quanteda)
	library(sentimentr)
	library(tidyverse)
	library(lexicon)
	## Debate transcripts bundled with the sentimentr package
	dat <- presidential_debates_2012
	dat

	## Corpus whose document text comes from the `dialogue` column
	corp <- corpus(dat, text_field = "dialogue")

	## Terms to drop before plotting: two word lists (sw_fry_100, function_words)
	## plus a handful of debate-specific words
	stopwords <- c(
	    sw_fry_100,
	    function_words,
	    "obama", "because", "romney", "going", "our", "president"
	)

	# Basic word cloud for Romney
	# NOTE(review): passing `remove`/`stem` to dfm() on a corpus and calling
	# textplot_wordcloud() with only library(quanteda) attached appear to rely on
	# the pre-3.0 quanteda API -- confirm against the installed version.
	dfmat1 <- dfm(
	    corpus_subset(corp, person == "ROMNEY"),  ## keep only documents where person is ROMNEY
	    remove = stopwords,  ## drop overly common words
	    remove_numbers = TRUE,
	    remove_punct = TRUE,
	    remove_url = TRUE,
	    remove_symbols = TRUE,
	    stem = FALSE
	) %>%
	    dfm_trim(min_termfreq = 8)  ## keep terms that occur at least 8 times

	par(xpd = FALSE)  ## clip drawing to the plot region
	textplot_wordcloud(dfmat1, color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
	## Right-aligned (adj = 1) orange title above the cloud
	title("Romney", adj = 1, line = 1, font = 2, col.main = "orange")

	# Basic word cloud for Obama
	# NOTE(review): passing `remove`/`stem` to dfm() on a corpus and calling
	# textplot_wordcloud() with only library(quanteda) attached appear to rely on
	# the pre-3.0 quanteda API -- confirm against the installed version.
	dfmat2 <- dfm(
	    corpus_subset(corp, person == "OBAMA"),  ## keep only documents where person is OBAMA
	    remove = stopwords,  ## drop overly common words
	    remove_numbers = TRUE,
	    remove_punct = TRUE,
	    remove_url = TRUE,
	    remove_symbols = TRUE,
	    stem = FALSE
	) %>%
	    dfm_trim(min_termfreq = 8)  ## keep terms that occur at least 8 times

	dev.new()  ## open a separate graphics device for this cloud
	par(xpd = FALSE)  ## clip drawing to the plot region
	textplot_wordcloud(dfmat2, color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
	## Right-aligned (adj = 1) orange title above the cloud
	title("Obama", adj = 1, line = 1, font = 2, col.main = "orange")


	## A slightly more involved version: one word cloud per person/time
	## combination, with stemming applied along the way.

	## Keep only the rows spoken by the two candidates
	subset_dat <- filter(dat, person %in% c("ROMNEY", "OBAMA"))

	## Lookup table with one row per distinct person/time pair, a plot title for
	## each pair, and every column converted to character
	subs <- distinct(select(subset_dat, person, time)) %>%
	    mutate(
	        title = paste(time, person, sep = ": "),
	        across(everything(), as.character)
	    )

	## Corpus restricted to the two candidates, text taken from `dialogue`
	corp2 <- corpus(subset_dat, text_field = "dialogue")

	## Draw one stemmed word cloud for every person/time pair listed in `subs`
	for (i in seq_len(nrow(subs))) {

	    dfmat <- dfm(
	        ## documents belonging to the i-th person/time pair
	        corpus_subset(corp2, person == subs[["person"]][i] & time == subs[["time"]][i]),
	        remove = stopwords,  ## drop overly common words
	        remove_numbers = TRUE,
	        remove_punct = TRUE,
	        remove_url = TRUE,
	        remove_symbols = TRUE,
	        stem = TRUE
	    ) %>%
	        dfm_trim(min_termfreq = 5)  ## keep terms that occur at least 5 times

	    dev.new()  ## open a separate graphics device for each cloud
	    par(xpd = FALSE)  ## clip drawing to the plot region
	    textplot_wordcloud(dfmat, color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
	    title(subs[["title"]][i], adj = 1, line = 1, font = 2, col.main = "orange")
	}