# caitlinhudon/text_analysis.py

## text_analysis.py
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter

def generate_ngrams_and_plots(text_input):
    """Plot the most frequent stemmed n-grams (n = 1 to 4) found in ``text_input``.

    Every document is split with NLTK's ``word_tokenize`` and each token is
    reduced with the English Snowball stemmer. N-grams are built one document
    at a time, so no n-gram spans two documents. For each n the function shows
    a bar chart of the ten most frequent n-grams, then (after all bar charts)
    a word cloud weighted by every n-gram's frequency.

    Args:
        text_input: A list-like of strings, one per document. A single string
            is treated as a one-document collection.

    Returns:
        None. All figures are displayed through ``plt.show()``.

    Note:
        An n for which no document has at least n tokens produces no n-grams;
        its bar chart and word cloud are skipped instead of raising.
    """
    # A bare string would otherwise be rejected by the DataFrame constructor
    # (a dict of scalars needs an index), so wrap it as a single document.
    if isinstance(text_input, str):
        text_input = [text_input]

    text_input_df = pd.DataFrame({'text': text_input})

    # Tokenize and stem the text
    stemmer = SnowballStemmer("english")
    text_input_df['tokens'] = text_input_df['text'].apply(word_tokenize)
    text_input_df['stemmed'] = text_input_df['tokens'].apply(
        lambda tokens: [stemmer.stem(token) for token in tokens]
    )

    # One Counter per n, keyed by the space-joined n-gram. Counting per n
    # avoids re-filtering a single mixed Counter for every plot below.
    ngram_counts = {
        n: Counter(
            ' '.join(gram)
            for stems in text_input_df['stemmed']
            for gram in ngrams(stems, n)
        )
        for n in range(1, 5)
    }

    # Plot the most frequent n-grams for each n
    for n, counts in ngram_counts.items():
        top_ngrams = counts.most_common(10)
        if not top_ngrams:
            # zip(*[]) yields nothing, so unpacking into two names would fail.
            continue
        labels, values = zip(*top_ngrams)

        plt.figure(figsize=(10, 6))
        plt.bar(labels, values)
        plt.title(f'Most frequent {n}-grams')
        plt.xticks(rotation=45, ha="right")
        plt.xlabel('N-gram')
        plt.ylabel('Frequency')
        plt.show()

    # Generate word clouds from the full frequency table for each n
    for n, counts in ngram_counts.items():
        if not counts:
            # WordCloud refuses an empty frequency mapping.
            continue
        wordcloud = WordCloud(
            width=800, height=400, background_color='white'
        ).generate_from_frequencies(counts)

        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title(f'Word Cloud for {n}-grams')
        plt.show()

# Example usage: three short documents passed to the plotting function.
text_input = [
    "This is a sample text.",
    "Another example text.",
    "Yet another piece of text.",
]
generate_ngrams_and_plots(text_input)
	import pandas as pd
	import matplotlib.pyplot as plt
	from wordcloud import WordCloud
	from sklearn.feature_extraction.text import CountVectorizer
	from nltk.stem.snowball import SnowballStemmer
	from nltk.tokenize import word_tokenize
	from nltk.util import ngrams
	from collections import Counter

	def generate_ngrams_and_plots(text_input):
		"""Plot the most frequent stemmed n-grams (n = 1 to 4) found in ``text_input``.

		Every document is split with NLTK's ``word_tokenize`` and each token is
		reduced with the English Snowball stemmer. N-grams are built one document
		at a time, so no n-gram spans two documents. For each n the function shows
		a bar chart of the ten most frequent n-grams, then (after all bar charts)
		a word cloud weighted by every n-gram's frequency.

		Args:
			text_input: A list-like of strings, one per document. A single string
				is treated as a one-document collection.

		Returns:
			None. All figures are displayed through ``plt.show()``.

		Note:
			An n for which no document has at least n tokens produces no n-grams;
			its bar chart and word cloud are skipped instead of raising.
		"""
		# A bare string would otherwise be rejected by the DataFrame constructor
		# (a dict of scalars needs an index), so wrap it as a single document.
		if isinstance(text_input, str):
			text_input = [text_input]

		text_input_df = pd.DataFrame({'text': text_input})

		# Tokenize and stem the text
		stemmer = SnowballStemmer("english")
		text_input_df['tokens'] = text_input_df['text'].apply(word_tokenize)
		text_input_df['stemmed'] = text_input_df['tokens'].apply(
			lambda tokens: [stemmer.stem(token) for token in tokens]
		)

		# One Counter per n, keyed by the space-joined n-gram. Counting per n
		# avoids re-filtering a single mixed Counter for every plot below.
		ngram_counts = {
			n: Counter(
				' '.join(gram)
				for stems in text_input_df['stemmed']
				for gram in ngrams(stems, n)
			)
			for n in range(1, 5)
		}

		# Plot the most frequent n-grams for each n
		for n, counts in ngram_counts.items():
			top_ngrams = counts.most_common(10)
			if not top_ngrams:
				# zip(*[]) yields nothing, so unpacking into two names would fail.
				continue
			labels, values = zip(*top_ngrams)

			plt.figure(figsize=(10, 6))
			plt.bar(labels, values)
			plt.title(f'Most frequent {n}-grams')
			plt.xticks(rotation=45, ha="right")
			plt.xlabel('N-gram')
			plt.ylabel('Frequency')
			plt.show()

		# Generate word clouds from the full frequency table for each n
		for n, counts in ngram_counts.items():
			if not counts:
				# WordCloud refuses an empty frequency mapping.
				continue
			wordcloud = WordCloud(
				width=800, height=400, background_color='white'
			).generate_from_frequencies(counts)

			plt.figure(figsize=(10, 5))
			plt.imshow(wordcloud, interpolation='bilinear')
			plt.axis("off")
			plt.title(f'Word Cloud for {n}-grams')
			plt.show()

	# Example usage: three short documents passed to the plotting function.
	text_input = [
		"This is a sample text.",
		"Another example text.",
		"Yet another piece of text.",
	]
	generate_ngrams_and_plots(text_input)