Created
November 7, 2023 06:32
-
-
Save shuntksh/d74b817371772129a952c8558f44d21b to your computer and use it in GitHub Desktop.
TF-IDF and BM25
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
from collections import Counter | |
import string | |
# Example collection of documents | |
documents = [ | |
'the quick brown fox jumps over the lazy dog', | |
'never jump over the lazy dog quickly', | |
'bright sun and the quick lazy dog', | |
'good dogs jump high' | |
] | |
# Preprocess documents (to lowercase and remove punctuation) | |
def preprocess(document): | |
return document.lower().translate(str.maketrans('', '', string.punctuation)).split() | |
# Function to calculate BM25 for a single document and a query | |
def bm25(document, query, documents, k1=1.5, b=0.75): | |
# Preprocess the query in the same way as the documents | |
query_terms = preprocess(query) | |
# Flatten the list of documents to count the total number of words | |
doc_lengths = [len(preprocess(doc)) for doc in documents] | |
avg_doc_length = sum(doc_lengths) / len(doc_lengths) | |
# Calculate IDF for each term in the query | |
N = len(documents) | |
idf_scores = {} | |
for term in query_terms: | |
n_qi = sum(term in preprocess(doc) for doc in documents) | |
idf_scores[term] = math.log((N - n_qi + 0.5) / (n_qi + 0.5) + 1) | |
# Calculate term frequency in the document | |
tf_scores = Counter(preprocess(document)) | |
# Calculate BM25 score for each term | |
bm25_score = 0 | |
for term in query_terms: | |
tf = tf_scores[term] | |
idf = idf_scores[term] | |
term_score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * len(preprocess(document)) / avg_doc_length)) | |
bm25_score += term_score | |
return bm25_score | |
# Example query | |
query = 'quick fox' | |
# Calculate BM25 for each document given a query | |
bm25_scores = [bm25(doc, query, documents) for doc in documents] | |
# Display the BM25 scores for the query | |
for i, score in enumerate(bm25_scores): | |
print(f"Document {i+1}'s BM25 Score for query '{query}': {score:.5f}") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
# Example documents | |
documents = { | |
1: 'the quick brown fox jumps over the lazy dog', | |
2: 'never jump over the lazy dog quickly', | |
3: 'bright sun and the quick lazy dog', | |
4: 'good dogs jump high' | |
} | |
# Function to preprocess documents | |
def preprocess(text): | |
text = text.lower() | |
text = text.translate(str.maketrans('', '', string.punctuation)) | |
return text.split() | |
# Create the inverted index | |
inverted_index = {} | |
# Fill the inverted index | |
for doc_id, content in documents.items(): | |
terms = preprocess(content) | |
for term in terms: | |
if term in inverted_index: | |
inverted_index[term].add(doc_id) | |
else: | |
inverted_index[term] = {doc_id} | |
# Display the inverted index | |
for term, doc_ids in inverted_index.items(): | |
print(f"{term}: {sorted(list(doc_ids))}") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
from collections import Counter | |
import string | |
# Example collection of documents | |
documents = [ | |
'the quick brown fox jumps over the lazy dog', | |
'never jump over the lazy dog quickly', | |
'bright sun and the quick lazy dog', | |
'good dogs jump high' | |
] | |
# Function to calculate Term Frequency (TF) | |
def computeTF(word_count_dict, document): | |
tf_dict = {} | |
document_length = len(document) | |
for word, count in word_count_dict.items(): | |
tf_dict[word] = count / float(document_length) | |
return tf_dict | |
# Function to calculate Inverse Document Frequency (IDF) | |
def computeIDF(doc_list): | |
idf_dict = {} | |
total_documents = len(doc_list) | |
# Count the number of documents that contain each word | |
word_document_counts = {} | |
for document in doc_list: | |
for word in document: | |
if word not in word_document_counts: | |
word_document_counts[word] = 1 | |
else: | |
if word_document_counts[word] != 1: # This ensures the word is counted only once per document | |
word_document_counts[word] += 1 | |
# Calculate IDF for each word | |
for word in word_document_counts: | |
idf_dict[word] = math.log(total_documents / float(word_document_counts[word])) | |
return idf_dict | |
# Function to calculate TF-IDF | |
def computeTFIDF(tf_dict, idf_dict): | |
tfidf_dict = {} | |
for word, tf in tf_dict.items(): | |
tfidf_dict[word] = tf * idf_dict.get(word, 0) | |
return tfidf_dict | |
# Preprocess documents (to lowercase and remove punctuation) | |
preprocessed_docs = [] | |
for doc in documents: | |
# Convert to lowercase and remove punctuation | |
doc = doc.lower().translate(str.maketrans('', '', string.punctuation)) | |
preprocessed_docs.append(doc.split(' ')) | |
# Count words in each document | |
word_count_dicts = [] | |
for doc in preprocessed_docs: | |
word_count_dicts.append(Counter(doc)) | |
# Calculate TF for each document | |
tf_dicts = [computeTF(word_count, doc) for word_count, doc in zip(word_count_dicts, preprocessed_docs)] | |
# Calculate IDF (based on all documents) | |
idf_dict = computeIDF(preprocessed_docs) | |
# Calculate TF-IDF for each document | |
tfidf_dicts = [computeTFIDF(tf, idf_dict) for tf in tf_dicts] | |
# Display the TF-IDF scores for each document | |
for i, tfidf in enumerate(tfidf_dicts): | |
print(f"Document {i+1}'s TF-IDF Scores:") | |
for word, score in tfidf.items(): | |
print(f"\t{word}: {score:.5f}") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment