Skip to content

Instantly share code, notes, and snippets.

@chrisws

chrisws/bm25.bas

Last active May 29, 2020
Embed
What would you like to do?
BM 2.5
REM SmallBASIC
REM created: 21/09/2019
REM based on the article https://burakkanber.com/blog/machine-learning-full-text-search-in-javascript-relevance-scoring/
const stop_words = ["the", "at", "in", "on"]
func stemmer(text)
return text
end
func tokenize(text)
local words, w, out
split(squeeze(lower(text)), " ", words())
for w in words
if w in stop_words == 0 then
out << stemmer(w)
end if
next w
return out
end
func map_keys(m)
local i, out
for i in m
out << i
next i
return out
end
sub updateIdf(byref w)
local keys = map_keys(w.terms)
local i, term, num, denom
for i = 0 to len(keys) - 1
term = keys[i]
num = w.totalDocuments - w.terms[term].n + 0.5
denom = w.terms[term].n + 0.5
w.terms[term].idf = max(log10(num / denom), 0.01)
next i
end
sub addDocument(byref w, byref doc)
local docObj, i, tokens, terms, keys
if doc.id == 0 then throw "ID is a required property of documents."
if doc.body == 0 then throw "Body is a required property of documents."
#Raw tokenized list of words
let tokens = tokenize(doc.body)
# Will hold unique terms and their counts and frequencies
terms = {}
#docObj will eventually be added to the documents database docObj = {id: doc.id, tokens: tokens, body: doc.body}
# Count number of terms
docObj.termCount = len(tokens)
# Increment totalDocuments
w.totalDocuments++
# Readjust averageDocumentLength
w.totalDocumentTermLength += docObj.termCount
w.averageDocumentLength = this.totalDocumentTermLength / w.totalDocuments
# Calculate term frequency
# First get terms count
for i = 0 to len(tokens) - 1
term = tokens[i]
if (! term in terms) then
terms[term] = {
count: 0,
freq: 0
}
endif
terms[term].count++
next i
# Then re-loop to calculate term frequency.
# We'll also update inverse document frequencies here.
keys = map_keys(terms)
for i = 0 to len(keys) - 1
term = keys[i]
# Term Frequency for this document.
terms[term].freq = terms[term].count / docObj.termCount
#Inverse Document Frequency initialization
if (! term in w.terms[term]) then
w.terms[term] = {
n: 0,
idf: 0
}
end if
w.terms[term].n++
next i
# Calculate inverse document frequencies
# This is SLOWish so if you want to index a big batch of documents,
# comment this out and run it once at the end of your addDocuments run
# If you're only indexing a document or two at a time you can leave this in
# Add docObj to docs db
docObj.terms = terms
docObj.id = doc.id
w.documents[doc.id] = docObj
w.nDocs++
end
func find(w, query)
local j, id, i, queryTerm
local queryTerms = tokenize(query)
local results = []
# Look at each document in turn.
# There are better ways to do this with inverted indices.
local keys = map_keys(w.documents)
for j = 0 to min(len(keys), w.nDocs) - 1
id = keys[j]
# The relevance score for a document is the sum of a tf-idf-like
# calculation for each query term.
w.documents[id]._score = 0
# Calculate the score for each query term
#
for i = 0 to len(queryTerms) - 1
queryTerm = queryTerms[i]
# We've never seen this term before so IDF will be 0.
# Means we can skip the whole term, it adds nothing to the score
# and isn't in any document.
#
if ismap(w.terms[queryTerm]) && ismap(w.documents[id].terms[queryTerm]) then
# The term is in the document, let's go.
# The whole term is :
# IDF * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * docLength / avgDocLength))
# IDF is pre-calculated for the whole docset.
idf = w.terms[queryTerm].idf
# Numerator of the TF portion.
num = w.documents[id].terms[queryTerm].count * (w.k1 + 1)
# Denomerator of the TF portion.
denom = w.documents[id].terms[queryTerm].count &
+ (w.k1 * (1 - w.b + (w.b * w.documents[id].termCount &
/ max(1, w.averageDocumentLength))))
# Add this query term to the score
w.documents[id]._score += idf * num / denom
end if
next i
if w.documents[id]._score > 0 then
results << w.documents[id]
end if
next j
rem results.sort(function(a, b) { return b._score - a._score; })
#return results.slice(0, 10)
return results
end
w = {}
doc.body="the cat the cat in the hat sat on the mat"
doc.id = 1
addDocument(w, doc)
doc.body="text sat"
doc.id = 2
addDocument(w, doc)
updateIdf(w)
searchResult = find(w, "sat")
if (isarray(searchResult)) then
for item in searchResult
print item.id
next item
endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.