Skip to content

Instantly share code, notes, and snippets.

@jakeybob
Last active July 15, 2024 11:30
Show Gist options
  • Save jakeybob/3e7836ed7e3aea1dcb66db0f3fd7a62b to your computer and use it in GitHub Desktop.
Save jakeybob/3e7836ed7e3aea1dcb66db0f3fd7a62b to your computer and use it in GitHub Desktop.
Wordcloud test
library(tidyverse)
library(ggwordcloud)
library(tidytext)
# https://github.com/lepennec/ggwordcloud
# https://cran.r-project.org/web/packages/tidytext/vignettes/tidytext.html
# Download the source text (Project Gutenberg ebook #84) as a single string.
# str_squish() collapses every run of whitespace -- including \r\n, \n and
# \r -- to one space and trims both ends, so no separate newline-stripping
# pass is needed.
# Typographic apostrophes (U+2019) are then mapped to ASCII "'", because
# stopword lexicons typically spell contractions with the ASCII form
# ("don't"); this step is a no-op if the text contains none.
text <- read_file("https://www.gutenberg.org/cache/epub/84/pg84.txt") |>
  str_squish() |>
  str_replace_all(pattern = "\u2019", replacement = "'")
# Tokenise the text: wrap the string in a one-row tibble, then let
# unnest_tokens() expand it into a `word` column with one row per word.
df <- unnest_tokens(tibble(text = text), output = word, input = text)
# Build one stopword table from all the English lexicons listed below plus a
# few custom entries. A word that occurs in several lexicons is kept once,
# labelled with the first lexicon (in the order listed) that supplied it.
stopwords <- bind_rows(
  get_stopwords("en", "snowball"),
  get_stopwords("en", "smart"),
  get_stopwords("en", "stopwords-iso"),
  get_stopwords("en", "marimo"),
  get_stopwords("en", "nltk"),
  tibble(word = c("my", "custom", "stopwords"), lexicon = "custom")
) |>
  group_by(word) |>
  summarise(lexicon = first(lexicon), .groups = "drop")
# Drop stopwords, then tally the remaining words, most frequent first.
# The join key is given explicitly so the result does not depend on whichever
# column names the two tables happen to share, and so no "Joining with ..."
# message is printed. count(sort = TRUE) orders by descending `n`.
df_cleaned_word_freqs <- df |>
  anti_join(stopwords, by = "word") |>
  count(word, sort = TRUE)
# Plot the `n` most common words in the text as a word cloud, with both text
# size and colour driven by word frequency.
n <- 30
max_size <- 15 # largest text size for scale_size_area(); may need tweaking
df_cleaned_word_freqs |>
  # here `n` is the variable defined above (how many rows to keep) ...
  slice_head(n = n) |>
  # ... whereas inside aes() `n` resolves to the `n` column of the data
  ggplot(aes(label = word, size = n, colour = n)) +
  # word placement is randomised; a fixed seed makes the layout reproducible
  geom_text_wordcloud(seed = 42) +
  scale_size_area(max_size = max_size) +
  scale_colour_viridis_c(option = "magma", direction = -1, end = 0.9) +
  theme_minimal()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment