Skip to content

Instantly share code, notes, and snippets.

@caitlinhudon
Created April 17, 2024 01:52
Show Gist options
  • Save caitlinhudon/b3a9afe580eb2432a5c9bd09412bab18 to your computer and use it in GitHub Desktop.
Save caitlinhudon/b3a9afe580eb2432a5c9bd09412bab18 to your computer and use it in GitHub Desktop.
text_analysis.py
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter
def generate_ngrams_and_plots(text_input):
    """Plot the most frequent stemmed n-grams (n = 1..4) found in the texts.

    Each text is tokenized with ``word_tokenize`` and every token is stemmed
    with the English Snowball stemmer. For each n-gram size from 1 to 4 the
    function then shows two matplotlib figures: a bar chart of the ten most
    frequent n-grams and a word cloud built from all n-gram frequencies.

    Args:
        text_input: Collection of strings to analyse; it is used as the
            ``text`` column of a pandas DataFrame.

    Returns:
        None. The results are displayed through ``plt.show()``.

    Note:
        An n-gram size for which no n-grams exist (for example 4-grams when
        every text has fewer than four tokens, or when ``text_input`` is
        empty) is skipped instead of raising ``ValueError``.
    """
    # Convert text input to a DataFrame
    text_input_df = pd.DataFrame({'text': text_input})

    # Tokenize and stem the text
    stemmer = SnowballStemmer("english")
    text_input_df['tokens'] = text_input_df['text'].apply(word_tokenize)
    text_input_df['stemmed'] = text_input_df['tokens'].apply(
        lambda tokens: [stemmer.stem(token) for token in tokens]
    )

    # Count the n-grams separately for each size so that the plotting code
    # below does not have to re-filter one mixed counter on every pass.
    ngram_sizes = range(1, 5)
    freq_by_size = {}
    for n in ngram_sizes:
        column = f'ngram_{n}'
        # ``n=n`` binds the current size to the lambda explicitly.
        text_input_df[column] = text_input_df['stemmed'].apply(
            lambda stems, n=n: list(ngrams(stems, n))
        )
        freq_by_size[n] = Counter(
            ' '.join(gram)
            for doc_grams in text_input_df[column]
            for gram in doc_grams
        )

    # Plot the most frequent n-grams for each n
    for n in ngram_sizes:
        # most_common keeps first-seen order among equal counts, matching a
        # stable descending sort over the counter's keys.
        top_ngrams = freq_by_size[n].most_common(10)
        if not top_ngrams:
            # Nothing to unpack or plot for this size.
            continue
        labels, values = zip(*top_ngrams)
        plt.figure(figsize=(10, 6))
        plt.bar(labels, values)
        plt.title(f'Most frequent {n}-grams')
        plt.xticks(rotation=45, ha="right")
        plt.xlabel('N-gram')
        plt.ylabel('Frequency')
        plt.show()

    # Generate word clouds for the most frequent terms for each n
    for n in ngram_sizes:
        frequencies = dict(freq_by_size[n])
        if not frequencies:
            # WordCloud cannot be generated from an empty frequency table.
            continue
        wordcloud = WordCloud(
            width=800, height=400, background_color='white'
        ).generate_from_frequencies(frequencies)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title(f'Word Cloud for {n}-grams')
        plt.show()
# Example usage: analyse three short sample sentences
text_input = [
    "This is a sample text.",
    "Another example text.",
    "Yet another piece of text.",
]
generate_ngrams_and_plots(text_input)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment