Skip to content

Instantly share code, notes, and snippets.

@benob
Created March 20, 2022 11:33
Show Gist options
  • Save benob/69d48421f88f5dcc2b26a204d3251d8a to your computer and use it in GitHub Desktop.
Save benob/69d48421f88f5dcc2b26a204d3251d8a to your computer and use it in GitHub Desktop.
Simple implementation of BM25 Okapi index in Python
from collections import defaultdict
from math import log
stopwords = {'le', 'la', 'du'}
documents = ['le chat boit du lait', 'le chien aime le chat', 'la souris aime le chien']
index = defaultdict(list)
doc_length = []
# Create the index
for doc_id, text in enumerate(documents):
count = defaultdict(int)
words = [word for word in text.split() if word not in stopwords]
# Count the number of occurrences of each word in the document
for word in words:
count[word] += 1
for word in count:
index[word].append((doc_id, count[word]))
doc_length.append(len(words))
# Display the index
for word in index:
print(word, index[word])
# Formulas for the BM25 model
N = len(doc_length)
avg_doc_length = sum(doc_length) / N
k1 = 1.2
b = 0.75
def idf(word):
df = len(index[word])
return log(1 + (N - df + 0.5) / (df + 0.5))
def tf(word, count, doc_id):
return (count * (k1 + 1)) / (count + k1 * (1 - b + b * doc_length[doc_id] / avg_doc_length))
# Score documents for a simple query
query = 'chien et chat'
scores = defaultdict(float)
# Compute the score of documents from each word in the query
for word in query.split():
for doc_id, count in index[word]:
scores[doc_id] += idf(word) * tf(word, count, doc_id)
# Display the sorted results by decreasing relevance
for doc_id, score in sorted(scores.items(), key=lambda pair: -pair[1]):
print(score, doc_id, documents[doc_id])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment