shuntksh/bm25.py

## bm25.py
import math
from collections import Counter
import string

# Example collection of documents
documents = [
    'the quick brown fox jumps over the lazy dog',
    'never jump over the lazy dog quickly',
    'bright sun and the quick lazy dog',
    'good dogs jump high'
]

# Preprocess documents (to lowercase and remove punctuation)
def preprocess(document):
    return document.lower().translate(str.maketrans('', '', string.punctuation)).split()

# Function to calculate BM25 for a single document and a query
def bm25(document, query, documents, k1=1.5, b=0.75):
    # Preprocess the query in the same way as the documents
    query_terms = preprocess(query)
    # Flatten the list of documents to count the total number of words
    doc_lengths = [len(preprocess(doc)) for doc in documents]
    avg_doc_length = sum(doc_lengths) / len(doc_lengths)

    # Calculate IDF for each term in the query
    N = len(documents)
    idf_scores = {}
    for term in query_terms:
        n_qi = sum(term in preprocess(doc) for doc in documents)
        idf_scores[term] = math.log((N - n_qi + 0.5) / (n_qi + 0.5) + 1)

    # Calculate term frequency in the document
    tf_scores = Counter(preprocess(document))
    # Calculate BM25 score for each term
    bm25_score = 0
    for term in query_terms:
        tf = tf_scores[term]
        idf = idf_scores[term]
        term_score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * len(preprocess(document)) / avg_doc_length))
        bm25_score += term_score

    return bm25_score

# Example query
query = 'quick fox'

# Calculate BM25 for each document given a query
bm25_scores = [bm25(doc, query, documents) for doc in documents]

# Display the BM25 scores for the query
for i, score in enumerate(bm25_scores):
    print(f"Document {i+1}'s BM25 Score for query '{query}': {score:.5f}")

## inverted-index.py
import string

# Example documents
documents = {
    1: 'the quick brown fox jumps over the lazy dog',
    2: 'never jump over the lazy dog quickly',
    3: 'bright sun and the quick lazy dog',
    4: 'good dogs jump high'
}

# Function to preprocess documents
def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text.split()

# Create the inverted index
inverted_index = {}

# Fill the inverted index
for doc_id, content in documents.items():
    terms = preprocess(content)
    for term in terms:
        if term in inverted_index:
            inverted_index[term].add(doc_id)
        else:
            inverted_index[term] = {doc_id}

# Display the inverted index
for term, doc_ids in inverted_index.items():
    print(f"{term}: {sorted(list(doc_ids))}")

## tf-idf.py
import math
from collections import Counter
import string

# Example collection of documents
documents = [
    'the quick brown fox jumps over the lazy dog',
    'never jump over the lazy dog quickly',
    'bright sun and the quick lazy dog',
    'good dogs jump high'
]

# Function to calculate Term Frequency (TF)
def computeTF(word_count_dict, document):
    tf_dict = {}
    document_length = len(document)
    for word, count in word_count_dict.items():
        tf_dict[word] = count / float(document_length)
    return tf_dict

# Function to calculate Inverse Document Frequency (IDF)
def computeIDF(doc_list):
    idf_dict = {}
    total_documents = len(doc_list)

    # Count the number of documents that contain each word
    word_document_counts = {}
    for document in doc_list:
        for word in document:
            if word not in word_document_counts:
                word_document_counts[word] = 1
            else:
                if word_document_counts[word] != 1: # This ensures the word is counted only once per document
                    word_document_counts[word] += 1

    # Calculate IDF for each word
    for word in word_document_counts:
        idf_dict[word] = math.log(total_documents / float(word_document_counts[word]))

    return idf_dict

# Function to calculate TF-IDF
def computeTFIDF(tf_dict, idf_dict):
    tfidf_dict = {}
    for word, tf in tf_dict.items():
        tfidf_dict[word] = tf * idf_dict.get(word, 0)
    return tfidf_dict

# Preprocess documents (to lowercase and remove punctuation)
preprocessed_docs = []
for doc in documents:
    # Convert to lowercase and remove punctuation
    doc = doc.lower().translate(str.maketrans('', '', string.punctuation))
    preprocessed_docs.append(doc.split(' '))

# Count words in each document
word_count_dicts = []
for doc in preprocessed_docs:
    word_count_dicts.append(Counter(doc))

# Calculate TF for each document
tf_dicts = [computeTF(word_count, doc) for word_count, doc in zip(word_count_dicts, preprocessed_docs)]

# Calculate IDF (based on all documents)
idf_dict = computeIDF(preprocessed_docs)

# Calculate TF-IDF for each document
tfidf_dicts = [computeTFIDF(tf, idf_dict) for tf in tf_dicts]

# Display the TF-IDF scores for each document
for i, tfidf in enumerate(tfidf_dicts):
    print(f"Document {i+1}'s TF-IDF Scores:")
    for word, score in tfidf.items():
        print(f"\t{word}: {score:.5f}")
	import math
	from collections import Counter
	import string

	# Example collection of documents
	documents = [
	'the quick brown fox jumps over the lazy dog',
	'never jump over the lazy dog quickly',
	'bright sun and the quick lazy dog',
	'good dogs jump high'
	]

	# Preprocess documents (to lowercase and remove punctuation)
	def preprocess(document):
	return document.lower().translate(str.maketrans('', '', string.punctuation)).split()

	# Function to calculate BM25 for a single document and a query
	def bm25(document, query, documents, k1=1.5, b=0.75):
	# Preprocess the query in the same way as the documents
	query_terms = preprocess(query)
	# Flatten the list of documents to count the total number of words
	doc_lengths = [len(preprocess(doc)) for doc in documents]
	avg_doc_length = sum(doc_lengths) / len(doc_lengths)

	# Calculate IDF for each term in the query
	N = len(documents)
	idf_scores = {}
	for term in query_terms:
	n_qi = sum(term in preprocess(doc) for doc in documents)
	idf_scores[term] = math.log((N - n_qi + 0.5) / (n_qi + 0.5) + 1)

	# Calculate term frequency in the document
	tf_scores = Counter(preprocess(document))
	# Calculate BM25 score for each term
	bm25_score = 0
	for term in query_terms:
	tf = tf_scores[term]
	idf = idf_scores[term]
	term_score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * len(preprocess(document)) / avg_doc_length))
	bm25_score += term_score

	return bm25_score

	# Example query
	query = 'quick fox'

	# Calculate BM25 for each document given a query
	bm25_scores = [bm25(doc, query, documents) for doc in documents]

	# Display the BM25 scores for the query
	for i, score in enumerate(bm25_scores):
	print(f"Document {i+1}'s BM25 Score for query '{query}': {score:.5f}")
	import string

	# Example documents
	documents = {
	1: 'the quick brown fox jumps over the lazy dog',
	2: 'never jump over the lazy dog quickly',
	3: 'bright sun and the quick lazy dog',
	4: 'good dogs jump high'
	}

	# Function to preprocess documents
	def preprocess(text):
	text = text.lower()
	text = text.translate(str.maketrans('', '', string.punctuation))
	return text.split()

	# Create the inverted index
	inverted_index = {}

	# Fill the inverted index
	for doc_id, content in documents.items():
	terms = preprocess(content)
	for term in terms:
	if term in inverted_index:
	inverted_index[term].add(doc_id)
	else:
	inverted_index[term] = {doc_id}

	# Display the inverted index
	for term, doc_ids in inverted_index.items():
	print(f"{term}: {sorted(list(doc_ids))}")