Skip to content

Instantly share code, notes, and snippets.

@chrisws
Last active May 29, 2020 08:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chrisws/3e33f0f785ebeb92b41ad7c5af893a12 to your computer and use it in GitHub Desktop.
Save chrisws/3e33f0f785ebeb92b41ad7c5af893a12 to your computer and use it in GitHub Desktop.
BM 2.5
REM SmallBASIC
REM created: 21/09/2019
REM based on the article https://burakkanber.com/blog/machine-learning-full-text-search-in-javascript-relevance-scoring/
const stop_words = ["the", "at", "in", "on"]
func stemmer(text)
return text
end
func tokenize(text)
local words, w, out
split(squeeze(lower(text)), " ", words())
for w in words
if w in stop_words == 0 then
out << stemmer(w)
end if
next w
return out
end
func map_keys(m)
local i, out
for i in m
out << i
next i
return out
end
sub updateIdf(byref w)
local keys = map_keys(w.terms)
local i, term, num, denom
for i = 0 to len(keys) - 1
term = keys[i]
num = w.totalDocuments - w.terms[term].n + 0.5
denom = w.terms[term].n + 0.5
w.terms[term].idf = max(log10(num / denom), 0.01)
next i
end
sub addDocument(byref w, byref doc)
local docObj, i, tokens, terms, keys
if doc.id == 0 then throw "ID is a required property of documents."
if doc.body == 0 then throw "Body is a required property of documents."
#Raw tokenized list of words
let tokens = tokenize(doc.body)
# Will hold unique terms and their counts and frequencies
terms = {}
#docObj will eventually be added to the documents database docObj = {id: doc.id, tokens: tokens, body: doc.body}
# Count number of terms
docObj.termCount = len(tokens)
# Increment totalDocuments
w.totalDocuments++
# Readjust averageDocumentLength
w.totalDocumentTermLength += docObj.termCount
w.averageDocumentLength = this.totalDocumentTermLength / w.totalDocuments
# Calculate term frequency
# First get terms count
for i = 0 to len(tokens) - 1
term = tokens[i]
if (! term in terms) then
terms[term] = {
count: 0,
freq: 0
}
endif
terms[term].count++
next i
# Then re-loop to calculate term frequency.
# We'll also update inverse document frequencies here.
keys = map_keys(terms)
for i = 0 to len(keys) - 1
term = keys[i]
# Term Frequency for this document.
terms[term].freq = terms[term].count / docObj.termCount
#Inverse Document Frequency initialization
if (! term in w.terms[term]) then
w.terms[term] = {
n: 0,
idf: 0
}
end if
w.terms[term].n++
next i
# Calculate inverse document frequencies
# This is SLOWish so if you want to index a big batch of documents,
# comment this out and run it once at the end of your addDocuments run
# If you're only indexing a document or two at a time you can leave this in
# Add docObj to docs db
docObj.terms = terms
docObj.id = doc.id
w.documents[doc.id] = docObj
w.nDocs++
end
func find(w, query)
local j, id, i, queryTerm
local queryTerms = tokenize(query)
local results = []
# Look at each document in turn.
# There are better ways to do this with inverted indices.
local keys = map_keys(w.documents)
for j = 0 to min(len(keys), w.nDocs) - 1
id = keys[j]
# The relevance score for a document is the sum of a tf-idf-like
# calculation for each query term.
w.documents[id]._score = 0
# Calculate the score for each query term
#
for i = 0 to len(queryTerms) - 1
queryTerm = queryTerms[i]
# We've never seen this term before so IDF will be 0.
# Means we can skip the whole term, it adds nothing to the score
# and isn't in any document.
#
if ismap(w.terms[queryTerm]) && ismap(w.documents[id].terms[queryTerm]) then
# The term is in the document, let's go.
# The whole term is :
# IDF * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * docLength / avgDocLength))
# IDF is pre-calculated for the whole docset.
idf = w.terms[queryTerm].idf
# Numerator of the TF portion.
num = w.documents[id].terms[queryTerm].count * (w.k1 + 1)
# Denomerator of the TF portion.
denom = w.documents[id].terms[queryTerm].count &
+ (w.k1 * (1 - w.b + (w.b * w.documents[id].termCount &
/ max(1, w.averageDocumentLength))))
# Add this query term to the score
w.documents[id]._score += idf * num / denom
end if
next i
if w.documents[id]._score > 0 then
results << w.documents[id]
end if
next j
rem results.sort(function(a, b) { return b._score - a._score; })
#return results.slice(0, 10)
return results
end
w = {}
doc.body="the cat the cat in the hat sat on the mat"
doc.id = 1
addDocument(w, doc)
doc.body="text sat"
doc.id = 2
addDocument(w, doc)
updateIdf(w)
searchResult = find(w, "sat")
if (isarray(searchResult)) then
for item in searchResult
print item.id
next item
endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment