-
-
Save MaxGSEO/011e5c6106ffbcda21cb88db33fafbc4 to your computer and use it in GitHub Desktop.
Python script that focuses solely on TF-IDF and n-gram extraction. It reads every .txt file from a folder, performs the TF-IDF analysis, and then exports the results to an Excel sheet, including each n-gram, the files in which it appears, its count, and its frequency.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import os

# Read text from every .txt file in the 'texts' folder, keyed by filename.
texts = {}
for filename in os.listdir('./texts/'):
    if filename.endswith('.txt'):
        # BUG FIX: the original opened a literal, non-interpolated path
        # ('./texts/(unknown)') for every file; build the real path instead.
        with open(os.path.join('./texts', filename), 'r', encoding='utf-8') as f:
            texts[filename] = f.read()

# Generate TF-IDF features over 1- to 5-grams. Built-in stopword removal is
# disabled on purpose: stopwords are filtered manually below, so only n-grams
# that START or END with a stopword are dropped (interior stopwords are kept).
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 5), stop_words=None)
tfidf_matrix = tfidf_vectorizer.fit_transform(texts.values())
feature_names = tfidf_vectorizer.get_feature_names_out()

# Filter out n-grams whose first or last token is an English or Italian stopword.
english_stopwords = set(stopwords.words('english'))
italian_stopwords = set(stopwords.words('italian'))
edge_stopwords = english_stopwords | italian_stopwords


def _has_clean_edges(feature):
    """Return True when neither the first nor the last token is a stopword."""
    tokens = feature.split()
    return tokens[0] not in edge_stopwords and tokens[-1] not in edge_stopwords


filtered_feature_names = [f for f in feature_names if _has_clean_edges(f)]

# Filter out n-grams that contain the same word twice.
filtered_feature_names = [
    feature for feature in filtered_feature_names
    if len(set(feature.split())) == len(feature.split())
]

# Count n-gram occurrences and record which files each n-gram appears in.
# BUG FIX: TfidfVectorizer lowercases its features by default, so the raw
# text is lowercased before counting; otherwise capitalized occurrences
# (e.g. at sentence starts) would never match.
ngram_counts = Counter()
ngram_file_mapping = {}
for filename, text in texts.items():
    lowered_text = text.lower()
    for feature in filtered_feature_names:
        count = lowered_text.count(feature)
        if count > 0:
            ngram_counts[feature] += count
            ngram_file_mapping.setdefault(feature, []).append(filename)

# Relative frequency of each n-gram among all counted n-grams.
# Guard against an empty corpus to avoid ZeroDivisionError.
total_ngrams = sum(ngram_counts.values())
ngram_frequency = {
    ngram: count / total_ngrams for ngram, count in ngram_counts.items()
} if total_ngrams else {}

# Assemble one row per n-gram and export the report to Excel.
df = pd.DataFrame({
    'N-gram': list(ngram_counts.keys()),
    'Count': list(ngram_counts.values()),
    'Frequency': [ngram_frequency[ngram] for ngram in ngram_counts],
    'Files': [', '.join(ngram_file_mapping[ngram]) for ngram in ngram_counts],
})
df.to_excel('./ngram_analysis.xlsx', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
pip install scikit-learn
pip install pandas
pip install nltk
import nltk
nltk.download('stopwords')