@caitlinhudon · Created April 17, 2024
text_analysis.R
library(tidytext)
library(dplyr)
library(tokenizers)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)  # for brewer.pal(), used in the word clouds
library(SnowballC)     # for wordStem()
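# All of the packages above are on CRAN; if any are missing, a one-time
# install (an assumption about your local setup) would be:
# install.packages(c("tidytext", "dplyr", "tokenizers", "ggplot2",
#                    "wordcloud", "RColorBrewer", "SnowballC"))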
# Function to stem generic text input, generate n-grams, and plot results
generate_ngrams_and_plots <- function(text_input) {
  # Convert the character vector of input text to a tibble, keeping a
  # document id so n-grams never span document boundaries
  text_input_df <- tibble(doc = seq_along(text_input), text = text_input)

  # Tokenize into words, stem each word, then re-join the stems per document;
  # building n-grams directly from one-word rows would yield NA for n > 1
  stemmed_text <- text_input_df %>%
    unnest_tokens(word, text) %>%
    mutate(word = SnowballC::wordStem(word)) %>%
    group_by(doc) %>%
    summarise(text = paste(word, collapse = " "), .groups = "drop")
  # Generate n-grams for n = 1..4 from the stemmed text and stack them into
  # a single data frame with a shared "ngram" column
  ngrams_df <- bind_rows(lapply(1:4, function(k) {
    stemmed_text %>%
      unnest_tokens(output = ngram, input = text, token = "ngrams", n = k) %>%
      filter(!is.na(ngram)) %>%
      mutate(n = k)
  }))
  # Count the frequency of each n-gram within each n
  ngrams_freq <- ngrams_df %>%
    count(n, ngram, name = "freq", sort = TRUE)
  # Plot the ten most frequent n-grams for each n; print() is needed for
  # the plot to render from inside a function
  p <- ngrams_freq %>%
    group_by(n) %>%
    slice_max(freq, n = 10, with_ties = FALSE) %>%
    ungroup() %>%
    ggplot(aes(x = reorder(ngram, freq), y = freq)) +
    geom_col() +
    facet_wrap(~n, scales = "free") +
    xlab("N-gram") +
    ylab("Frequency") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(p)
  # Generate a word cloud of the most frequent n-grams for each n; the loop
  # variable must differ from the column name, since subset(ngrams_freq,
  # n == n) would keep every row
  lapply(unique(ngrams_freq$n), function(k) {
    subset_ngrams <- subset(ngrams_freq, n == k)
    wordcloud(words = subset_ngrams$ngram, freq = subset_ngrams$freq,
              min.freq = 1, max.words = 100, random.order = FALSE,
              rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
  })
}
# Example usage: the function expects raw text, not file names
text_input <- c("The quick brown fox jumps over the lazy dog.",
                "The lazy dog is slower than the quick brown fox.")
generate_ngrams_and_plots(text_input)
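# The original example passed file names, but the function takes raw text,
# so a file-based corpus must be read in first. A minimal sketch, assuming
# plain-text files at these hypothetical paths (a PDF would need a dedicated
# reader such as pdftools::pdf_text()):
files <- c("example_document.txt", "sample_file.txt")
text_from_files <- vapply(
  files,
  function(f) paste(readLines(f, warn = FALSE), collapse = " "),
  character(1)
)
generate_ngrams_and_plots(text_from_files)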