## jakeybob/cloud.R

## cloud.R
library(tidyverse)
library(ggwordcloud)
library(tidytext)

# https://github.com/lepennec/ggwordcloud
# https://cran.r-project.org/web/packages/tidytext/vignettes/tidytext.html

# Download the source text and flatten it onto a single line: line breaks
# (CRLF, LF or CR) become spaces, then str_squish() tidies the whitespace.
# NOTE(review): presumably the download also carries Project Gutenberg
# header/licence text, which would be counted with the rest -- confirm.
text <- "https://www.gutenberg.org/cache/epub/84/pg84.txt" |>
  read_file() |>
  str_replace_all("\r\n|\n|\r", " ") |>
  str_squish()

# Tokenise the text into a tibble with one row per word (column `word`).
df <- unnest_tokens(tibble(text = text), output = word, input = text)

# Build one English stopword table from every lexicon listed below plus a few
# hand-written extras, keeping a single row per word. When a word occurs in
# several lexicons, the lexicon kept is the first one in the combined table.
stopwords <- c("snowball", "smart", "stopwords-iso", "marimo", "nltk") |>
  map(\(src) get_stopwords("en", src)) |>
  bind_rows(
    tibble(word = c("my", "custom", "stopwords"), lexicon = "custom")
  ) |>
  group_by(word) |>
  summarise(lexicon = first(lexicon), .groups = "drop")

# Drop every stopword, then tally the remaining words, most frequent first.
# The join key is spelled out so the match is always on `word` alone and dplyr
# does not have to guess (and announce) the common columns.
df_cleaned_word_freqs <- df |>
  anti_join(stopwords, by = "word") |>
  count(word, sort = TRUE) # sort = TRUE orders by descending n

# Plot settings: how many of the top-ranked words to draw, and the largest
# text size handed to the size scale.
n <- 30
max_size <- 15 # size limit of plot, may need tweaking

# Word cloud of the `n` most frequent words. Inside aes(), `n` refers to the
# count column produced by count(), not to the setting above; both text size
# and colour are driven by that count.
ggplot(
  data = slice_head(df_cleaned_word_freqs, n = n),
  mapping = aes(label = word, size = n, colour = n)
) +
  geom_text_wordcloud() +
  scale_size_area(max_size = max_size) +
  scale_colour_viridis_c(option = "magma", direction = -1, end = 0.9) +
  theme_minimal()
	library(tidyverse)
	library(ggwordcloud)
	library(tidytext)

	# https://github.com/lepennec/ggwordcloud
	# https://cran.r-project.org/web/packages/tidytext/vignettes/tidytext.html

	# NOTE(review): this tab-indented section repeats the word-cloud pipeline that
	# appears earlier in the file -- confirm whether the duplicate is intentional.

	# get some text, remove newline/return chars
	text <- read_file("https://www.gutenberg.org/cache/epub/84/pg84.txt") |>
	  str_replace_all(pattern = "\r\n|\n|\r", replacement = " ") |>
	  str_squish()

	# put in dataframe, one row per word
	df <- tibble(text = text) |>
	  unnest_tokens(output = word, input = text)

	# just combine all the en language stopword dictionaries, one row per word
	# (the lexicon kept for a word is the first one it appears under)
	stopwords <- get_stopwords("en", "snowball") |>
	  bind_rows(get_stopwords("en", "smart")) |>
	  bind_rows(get_stopwords("en", "stopwords-iso")) |>
	  bind_rows(get_stopwords("en", "marimo")) |>
	  bind_rows(get_stopwords("en", "nltk")) |>
	  bind_rows(tibble(word = c("my", "custom", "stopwords"), lexicon = "custom")) |>
	  group_by(word) |>
	  summarise(lexicon = first(lexicon), .groups = "drop")

	# remove stopwords and count remaining word freqs, most frequent first;
	# the join key is explicit so dplyr does not have to guess it
	df_cleaned_word_freqs <- df |>
	  anti_join(stopwords, by = "word") |>
	  count(word, sort = TRUE)

	# plot n most common words in text
	n <- 30
	max_size <- 15 # size limit of plot, may need tweaking

	# inside aes(), `n` is the count column from count(), not the setting above
	df_cleaned_word_freqs |>
	  slice_head(n = n) |>
	  ggplot(aes(label = word, size = n, colour = n)) +
	  geom_text_wordcloud() +
	  scale_size_area(max_size = max_size) +
	  scale_colour_viridis_c(option = "magma", direction = -1, end = 0.9) +
	  theme_minimal()