Skip to content

Instantly share code, notes, and snippets.

@MaxGSEO
Created October 22, 2023 23:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MaxGSEO/011e5c6106ffbcda21cb88db33fafbc4 to your computer and use it in GitHub Desktop.
Python script that focuses solely on TF-IDF and n-gram extraction. It reads text from 10 files, performs the TF-IDF analysis, and then exports the results to an Excel sheet, including the n-gram, the files in which it appears, its count, and its frequency.
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import os
# Read the corpus: one dict entry per .txt file found in ./texts/,
# keyed by filename, value = full file contents.
texts = {}
for filename in os.listdir('./texts/'):  # Assuming the files are in a folder called 'texts'
    if filename.endswith('.txt'):
        # BUG FIX: the original opened the literal path './texts/(unknown)'
        # (a mangled f-string) instead of the file being iterated.
        # Explicit encoding avoids platform-dependent decoding of the corpus.
        with open(os.path.join('./texts', filename), 'r', encoding='utf-8') as f:
            texts[filename] = f.read()
# Generate TF-IDF features over 1- to 5-grams of the whole corpus.
# stop_words=None on purpose: stopwords are filtered below on whole n-grams,
# so interior stopwords ("state of the art") are still allowed.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 5), stop_words=None)
tfidf_matrix = tfidf_vectorizer.fit_transform(texts.values())
feature_names = tfidf_vectorizer.get_feature_names_out()

# Keep an n-gram only if it neither starts nor ends with an English or
# Italian stopword, and contains no repeated word. The original split each
# feature four times and made two full passes; here: one split, one pass.
all_stopwords = set(stopwords.words('english')) | set(stopwords.words('italian'))
filtered_feature_names = []
for feature in feature_names:
    words = feature.split()
    if words[0] in all_stopwords or words[-1] in all_stopwords:
        continue
    if len(set(words)) != len(words):  # drop n-grams with a duplicated word
        continue
    filtered_feature_names.append(feature)
# Count occurrences of each surviving n-gram per document, and record which
# files each n-gram appears in.
# NOTE(review): text.count() is a raw substring count, so "art" also matches
# inside "artist"; switch to a word-boundary regex if exact-token counts are
# required — left as-is to preserve the original behavior.
ngram_counts = Counter()
ngram_file_mapping = {}
for filename, text in texts.items():
    for feature in filtered_feature_names:
        count = text.count(feature)
        if count > 0:
            ngram_counts[feature] += count
            # setdefault replaces the original if/else membership check
            ngram_file_mapping.setdefault(feature, []).append(filename)

# Relative frequency of each n-gram over all counted n-grams.
# Guarded so an empty corpus cannot raise ZeroDivisionError.
total_ngrams = sum(ngram_counts.values())
if total_ngrams:
    ngram_frequency = {ngram: count / total_ngrams for ngram, count in ngram_counts.items()}
else:
    ngram_frequency = {}
# Assemble one report row per n-gram (count, relative frequency, and the
# comma-joined list of files it occurred in), then write the Excel workbook.
rows = []
for ngram, count in ngram_counts.items():
    rows.append({
        'N-gram': ngram,
        'Count': count,
        'Frequency': ngram_frequency[ngram],
        'Files': ', '.join(ngram_file_mapping[ngram]),
    })
df = pd.DataFrame(rows, columns=['N-gram', 'Count', 'Frequency', 'Files'])
# Save to Excel
df.to_excel('./ngram_analysis.xlsx', index=False)
@MaxGSEO
Copy link
Author

MaxGSEO commented Oct 23, 2023

pip install scikit-learn
pip install pandas
pip install nltk

import nltk
nltk.download('stopwords')

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment