benob/bm25.py

## bm25.py
from collections import defaultdict
from math import log

stopwords = {'le', 'la', 'du'}
documents = ['le chat boit du lait', 'le chien aime le chat', 'la souris aime le chien']

index = defaultdict(list)
doc_length = []

# Create the index
for doc_id, text in enumerate(documents):
  count = defaultdict(int)
  words = [word for word in text.split() if word not in stopwords]
  # Count the number of occurrences of each word in the document
  for word in words:
    count[word] += 1
  for word in count:
    index[word].append((doc_id, count[word]))
  doc_length.append(len(words))

# Display the index
for word in index:
  print(word, index[word])

# Formulas for the BM25 model
N = len(doc_length)
avg_doc_length = sum(doc_length) / N
k1 = 1.2
b = 0.75

def idf(word):
  df = len(index[word])
  return log(1 + (N - df + 0.5) / (df + 0.5))

def tf(word, count, doc_id):
  return (count * (k1 + 1)) / (count + k1 * (1 - b + b * doc_length[doc_id] / avg_doc_length))

# Score documents for a simple query
query = 'chien et chat'
scores = defaultdict(float)

# Compute the score of documents from each word in the query
for word in query.split():
  for doc_id, count in index[word]:
    scores[doc_id] += idf(word) * tf(word, count, doc_id)

# Display the sorted results by decreasing relevance
for doc_id, score in sorted(scores.items(), key=lambda pair: -pair[1]):
  print(score, doc_id, documents[doc_id])
	from collections import defaultdict
	from math import log

	stopwords = {'le', 'la', 'du'}
	documents = ['le chat boit du lait', 'le chien aime le chat', 'la souris aime le chien']

	index = defaultdict(list)
	doc_length = []

	# Create the index
	for doc_id, text in enumerate(documents):
	count = defaultdict(int)
	words = [word for word in text.split() if word not in stopwords]
	# Count the number of occurrences of each word in the document
	for word in words:
	count[word] += 1
	for word in count:
	index[word].append((doc_id, count[word]))
	doc_length.append(len(words))

	# Display the index
	for word in index:
	print(word, index[word])

	# Formulas for the BM25 model
	N = len(doc_length)
	avg_doc_length = sum(doc_length) / N
	k1 = 1.2
	b = 0.75

	def idf(word):
	df = len(index[word])
	return log(1 + (N - df + 0.5) / (df + 0.5))

	def tf(word, count, doc_id):
	return (count * (k1 + 1)) / (count + k1 * (1 - b + b * doc_length[doc_id] / avg_doc_length))

	# Score documents for a simple query
	query = 'chien et chat'
	scores = defaultdict(float)

	# Compute the score of documents from each word in the query
	for word in query.split():
	for doc_id, count in index[word]:
	scores[doc_id] += idf(word) * tf(word, count, doc_id)

	# Display the sorted results by decreasing relevance
	for doc_id, score in sorted(scores.items(), key=lambda pair: -pair[1]):
	print(score, doc_id, documents[doc_id])