Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Implementation of OKapi BM25 with sklearn's TfidfVectorizer
""" Implementation of OKapi BM25 with sklearn's TfidfVectorizer
Distributed as CC-0 (https://creativecommons.org/publicdomain/zero/1.0/)
"""
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
class BM25(object):
def __init__(self, b=0.75, k1=1.6):
self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
self.b = b
self.k1 = k1
def fit(self, X):
""" Fit IDF to documents X """
self.vectorizer.fit(X)
y = super(TfidfVectorizer, self.vectorizer).transform(X)
self.avdl = y.sum(1).mean()
def transform(self, q, X):
""" Calculate BM25 between query q and documents X """
b, k1, avdl = self.b, self.k1, self.avdl
# apply CountVectorizer
X = super(TfidfVectorizer, self.vectorizer).transform(X)
len_X = X.sum(1).A1
q, = super(TfidfVectorizer, self.vectorizer).transform([q])
assert sparse.isspmatrix_csr(q)
# convert to csc for better column slicing
X = X.tocsc()[:, q.indices]
denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
# idf(t) = log [ n / df(t) ] + 1 in sklearn, so it need to be coneverted
# to idf(t) = log [ n / df(t) ] with minus 1
idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)
return (numer / denom).sum(1).A1
#------------ End of library impl. Followings are the example -----------------
from sklearn.datasets import fetch_20newsgroups
texts = fetch_20newsgroups(subset='train').data
bm25 = BM25()
bm25.fit(texts[1:])
print(bm25.transform(texts[0], texts))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment