@caitlinhudon · Created April 17, 2024
text_analysis.R
library(tidytext)
library(dplyr)
library(tokenizers)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)  # for brewer.pal(), used in the word clouds
library(SnowballC)     # for wordStem()
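# All of the packages above are on CRAN; if any are missing, a one-time
# install (an assumption about your local setup) would be:
# install.packages(c("tidytext", "dplyr", "tokenizers", "ggplot2",
#                    "wordcloud", "RColorBrewer", "SnowballC"))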
# Function to stem generic text input, generate n-grams, and plot results
generate_ngrams_and_plots <- function(text_input) {
  # Convert the character vector of input text to a tibble, keeping a
  # document id so n-grams never span document boundaries
  text_input_df <- tibble(doc = seq_along(text_input), text = text_input)

  # Tokenize into words, stem each word, then re-join the stems per document;
  # building n-grams directly from one-word rows would yield NA for n > 1
  stemmed_text <- text_input_df %>%
    unnest_tokens(word, text) %>%
    mutate(word = SnowballC::wordStem(word)) %>%
    group_by(doc) %>%
    summarise(text = paste(word, collapse = " "), .groups = "drop")
  # Generate n-grams for n = 1..4 from the stemmed text and stack them into
  # a single data frame with a shared "ngram" column
  ngrams_df <- bind_rows(lapply(1:4, function(k) {
    stemmed_text %>%
      unnest_tokens(output = ngram, input = text, token = "ngrams", n = k) %>%
      filter(!is.na(ngram)) %>%
      mutate(n = k)
  }))
  # Count the frequency of each n-gram within each n
  ngrams_freq <- ngrams_df %>%
    count(n, ngram, name = "freq", sort = TRUE)
  # Plot the ten most frequent n-grams for each n; print() is needed for
  # the plot to render from inside a function
  p <- ngrams_freq %>%
    group_by(n) %>%
    slice_max(freq, n = 10, with_ties = FALSE) %>%
    ungroup() %>%
    ggplot(aes(x = reorder(ngram, freq), y = freq)) +
    geom_col() +
    facet_wrap(~n, scales = "free") +
    xlab("N-gram") +
    ylab("Frequency") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(p)
  # Generate a word cloud of the most frequent n-grams for each n; the loop
  # variable must differ from the column name, since subset(ngrams_freq,
  # n == n) would keep every row
  lapply(unique(ngrams_freq$n), function(k) {
    subset_ngrams <- subset(ngrams_freq, n == k)
    wordcloud(words = subset_ngrams$ngram, freq = subset_ngrams$freq,
              min.freq = 1, max.words = 100, random.order = FALSE,
              rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
  })
}
# Example usage: the function expects raw text, not file names
text_input <- c("The quick brown fox jumps over the lazy dog.",
                "The lazy dog is slower than the quick brown fox.")
generate_ngrams_and_plots(text_input)
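# The original example passed file names, but the function takes raw text,
# so a file-based corpus must be read in first. A minimal sketch, assuming
# plain-text files at these hypothetical paths (a PDF would need a dedicated
# reader such as pdftools::pdf_text()):
files <- c("example_document.txt", "sample_file.txt")
text_from_files <- vapply(
  files,
  function(f) paste(readLines(f, warn = FALSE), collapse = " "),
  character(1)
)
generate_ngrams_and_plots(text_from_files)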