@shuntksh
Created November 7, 2023 06:32
TF-IDF and BM25
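Three standalone Python snippets follow: a BM25 scorer, an inverted index, and a TF-IDF calculator. For reference, these are the standard formulations the code implements (notation added here: f(t, D) is the count of term t in document D, n(t) and df(t) are the number of documents containing t, |D| is the document length, avgdl the average document length, N the number of documents):

\mathrm{score}(D, Q) = \sum_{q_i \in Q} \mathrm{IDF}(q_i) \cdot \frac{f(q_i, D)\,(k_1 + 1)}{f(q_i, D) + k_1\left(1 - b + b\,\frac{|D|}{\mathrm{avgdl}}\right)}, \qquad \mathrm{IDF}(q_i) = \ln\!\left(\frac{N - n(q_i) + 0.5}{n(q_i) + 0.5} + 1\right)

\mathrm{tfidf}(t, D) = \frac{f(t, D)}{|D|} \cdot \ln\!\left(\frac{N}{\mathrm{df}(t)}\right)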
import math
from collections import Counter
import string

# Example collection of documents
documents = [
    'the quick brown fox jumps over the lazy dog',
    'never jump over the lazy dog quickly',
    'bright sun and the quick lazy dog',
    'good dogs jump high'
]

# Preprocess documents (to lowercase and remove punctuation)
def preprocess(document):
    return document.lower().translate(str.maketrans('', '', string.punctuation)).split()

# Function to calculate BM25 for a single document and a query
def bm25(document, query, documents, k1=1.5, b=0.75):
    # Preprocess the query in the same way as the documents
    query_terms = preprocess(query)

    # Compute each document's length and the average document length
    doc_lengths = [len(preprocess(doc)) for doc in documents]
    avg_doc_length = sum(doc_lengths) / len(doc_lengths)

    # Calculate IDF for each term in the query
    N = len(documents)
    idf_scores = {}
    for term in query_terms:
        n_qi = sum(term in preprocess(doc) for doc in documents)
        idf_scores[term] = math.log((N - n_qi + 0.5) / (n_qi + 0.5) + 1)

    # Calculate term frequency in the document (preprocess once and reuse)
    doc_terms = preprocess(document)
    tf_scores = Counter(doc_terms)

    # Calculate the BM25 score as the sum over the query terms
    bm25_score = 0
    for term in query_terms:
        tf = tf_scores[term]
        idf = idf_scores[term]
        term_score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * len(doc_terms) / avg_doc_length))
        bm25_score += term_score
    return bm25_score

# Example query
query = 'quick fox'

# Calculate BM25 for each document given a query
bm25_scores = [bm25(doc, query, documents) for doc in documents]

# Display the BM25 scores for the query
for i, score in enumerate(bm25_scores):
    print(f"Document {i+1}'s BM25 Score for query '{query}': {score:.5f}")
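A possible follow-up (not part of the original snippet): turn the per-document scores into a ranking. With the example query 'quick fox', document 1 should come out on top, since it is the only document containing 'fox'.

# Sketch: rank documents by descending BM25 score, reusing bm25_scores from above
ranked = sorted(enumerate(bm25_scores, start=1), key=lambda pair: pair[1], reverse=True)
for doc_id, score in ranked:
    print(f"Document {doc_id}: {score:.5f}")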
import string

# Example documents
documents = {
    1: 'the quick brown fox jumps over the lazy dog',
    2: 'never jump over the lazy dog quickly',
    3: 'bright sun and the quick lazy dog',
    4: 'good dogs jump high'
}

# Function to preprocess documents
def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text.split()

# Create the inverted index
inverted_index = {}

# Fill the inverted index
for doc_id, content in documents.items():
    terms = preprocess(content)
    for term in terms:
        if term in inverted_index:
            inverted_index[term].add(doc_id)
        else:
            inverted_index[term] = {doc_id}

# Display the inverted index
for term, doc_ids in inverted_index.items():
    print(f"{term}: {sorted(doc_ids)}")
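One natural use of the index, sketched here as a hypothetical extension (and_query is not part of the original snippet), is answering a conjunctive query by intersecting the posting sets of its terms:

# Sketch: documents containing every term of the query (AND semantics)
def and_query(query, index):
    terms = preprocess(query)
    postings = [index.get(term, set()) for term in terms]
    return sorted(set.intersection(*postings)) if postings else []

print(and_query('lazy dog', inverted_index))  # [1, 2, 3] for the example documents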
import math
from collections import Counter
import string

# Example collection of documents
documents = [
    'the quick brown fox jumps over the lazy dog',
    'never jump over the lazy dog quickly',
    'bright sun and the quick lazy dog',
    'good dogs jump high'
]

# Function to calculate Term Frequency (TF)
def computeTF(word_count_dict, document):
    tf_dict = {}
    document_length = len(document)
    for word, count in word_count_dict.items():
        tf_dict[word] = count / float(document_length)
    return tf_dict

# Function to calculate Inverse Document Frequency (IDF)
def computeIDF(doc_list):
    idf_dict = {}
    total_documents = len(doc_list)
    # Count the number of documents that contain each word;
    # iterating over set(document) counts a word at most once per document
    word_document_counts = {}
    for document in doc_list:
        for word in set(document):
            word_document_counts[word] = word_document_counts.get(word, 0) + 1
    # Calculate IDF for each word
    for word, doc_count in word_document_counts.items():
        idf_dict[word] = math.log(total_documents / float(doc_count))
    return idf_dict

# Function to calculate TF-IDF
def computeTFIDF(tf_dict, idf_dict):
    tfidf_dict = {}
    for word, tf in tf_dict.items():
        tfidf_dict[word] = tf * idf_dict.get(word, 0)
    return tfidf_dict

# Preprocess documents (to lowercase and remove punctuation)
preprocessed_docs = []
for doc in documents:
    # Convert to lowercase and remove punctuation
    doc = doc.lower().translate(str.maketrans('', '', string.punctuation))
    preprocessed_docs.append(doc.split())

# Count words in each document
word_count_dicts = [Counter(doc) for doc in preprocessed_docs]

# Calculate TF for each document
tf_dicts = [computeTF(word_count, doc) for word_count, doc in zip(word_count_dicts, preprocessed_docs)]

# Calculate IDF (based on all documents)
idf_dict = computeIDF(preprocessed_docs)

# Calculate TF-IDF for each document
tfidf_dicts = [computeTFIDF(tf, idf_dict) for tf in tf_dicts]

# Display the TF-IDF scores for each document
for i, tfidf in enumerate(tfidf_dicts):
    print(f"Document {i+1}'s TF-IDF Scores:")
    for word, score in tfidf.items():
        print(f"\t{word}: {score:.5f}")
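A hedged extension (cosine_similarity is not part of the original snippet): the per-document TF-IDF dictionaries can be compared with cosine similarity to rank documents against a query treated as a tiny document.

# Sketch: rank documents against a query using cosine similarity of TF-IDF vectors
def cosine_similarity(vec_a, vec_b):
    dot = sum(vec_a[w] * vec_b[w] for w in set(vec_a) & set(vec_b))
    norm_a = math.sqrt(sum(v * v for v in vec_a.values()))
    norm_b = math.sqrt(sum(v * v for v in vec_b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

query_terms = 'quick fox'.split()
query_tfidf = computeTFIDF(computeTF(Counter(query_terms), query_terms), idf_dict)
for i, doc_tfidf in enumerate(tfidf_dicts):
    print(f"Document {i+1} cosine similarity to 'quick fox': {cosine_similarity(query_tfidf, doc_tfidf):.5f}")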