chrisws/bm25.bas

## bm25.bas
REM SmallBASIC
REM created: 21/09/2019

REM based on the article https://burakkanber.com/blog/machine-learning-full-text-search-in-javascript-relevance-scoring/

const stop_words = ["the", "at", "in", "on"]

func stemmer(text)
  return text
end

func tokenize(text)
  local words, w, out

  split(squeeze(lower(text)), " ", words())
  for w in words
    if w in stop_words == 0 then
      out << stemmer(w)
    end if
  next w
  return out
end

func map_keys(m)
  local i, out
  for i in m
    out << i
  next i
  return out
end

sub updateIdf(byref w)
  local keys = map_keys(w.terms)
  local i, term, num, denom
  for i = 0 to len(keys) - 1
    term = keys[i]
    num = w.totalDocuments - w.terms[term].n + 0.5
    denom = w.terms[term].n + 0.5
    w.terms[term].idf = max(log10(num / denom), 0.01)
  next i
end

sub addDocument(byref w, byref doc)
  local docObj, i, tokens, terms, keys

  if doc.id == 0 then throw "ID is a required property of documents."
  if doc.body == 0 then  throw "Body is a required property of documents."

  #Raw tokenized list of words
  let tokens = tokenize(doc.body)
  # Will hold unique terms and their counts and frequencies
  terms = {}

  #docObj will eventually be added to the documents database  docObj = {id: doc.id, tokens: tokens, body: doc.body}

  # Count number of terms
  docObj.termCount = len(tokens)

  # Increment totalDocuments
  w.totalDocuments++

  # Readjust averageDocumentLength
  w.totalDocumentTermLength += docObj.termCount
  w.averageDocumentLength = this.totalDocumentTermLength / w.totalDocuments

  # Calculate term frequency
  # First get terms count
  for i = 0 to len(tokens) - 1
    term = tokens[i]
    if (! term in terms) then
      terms[term] = {
        count: 0,
        freq: 0
      }
    endif
    terms[term].count++
  next i

  # Then re-loop to calculate term frequency.
  # We'll also update inverse document frequencies here.
  keys = map_keys(terms)
  for i = 0 to len(keys) - 1
    term = keys[i]
    # Term Frequency for this document.
    terms[term].freq = terms[term].count / docObj.termCount
    #Inverse Document Frequency initialization
    if (! term in w.terms[term]) then
       w.terms[term] = {
         n: 0,
         idf: 0
       }
    end if
    w.terms[term].n++
  next i

  # Calculate inverse document frequencies
  # This is SLOWish so if you want to index a big batch of documents,
  # comment this out and run it once at the end of your addDocuments run
  # If you're only indexing a document or two at a time you can leave this in

  # Add docObj to docs db
  docObj.terms = terms
  docObj.id = doc.id
  w.documents[doc.id] = docObj
  w.nDocs++
end

func find(w, query)
  local j, id, i, queryTerm
  local queryTerms = tokenize(query)
  local results = []
  # Look at each document in turn.
  # There are better ways to do this with inverted indices.
  local keys = map_keys(w.documents)

  for j = 0 to min(len(keys), w.nDocs) - 1
    id = keys[j]
    # The relevance score for a document is the sum of a tf-idf-like
    # calculation for each query term.
    w.documents[id]._score = 0
    # Calculate the score for each query term
    #
    for i = 0 to len(queryTerms) - 1
      queryTerm = queryTerms[i]
      # We've never seen this term before so IDF will be 0.
      # Means we can skip the whole term, it adds nothing to the score
      # and isn't in any document.
      #
      if ismap(w.terms[queryTerm]) && ismap(w.documents[id].terms[queryTerm]) then
        # The term is in the document, let's go.
        # The whole term is :
        # IDF * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * docLength / avgDocLength))
        # IDF is pre-calculated for the whole docset.
        idf = w.terms[queryTerm].idf
        # Numerator of the TF portion.
        num = w.documents[id].terms[queryTerm].count * (w.k1 + 1)
        # Denomerator of the TF portion.
        denom = w.documents[id].terms[queryTerm].count &
          + (w.k1 * (1 - w.b + (w.b * w.documents[id].termCount &
         / max(1, w.averageDocumentLength))))
        # Add this query term to the score
        w.documents[id]._score += idf * num / denom
      end if
    next i
    if w.documents[id]._score > 0 then
      results << w.documents[id]
    end if
  next j
  rem results.sort(function(a, b) { return b._score - a._score; })
  #return results.slice(0, 10)
  return results
end

w = {}

doc.body="the cat the cat in the hat sat on  the mat"
doc.id = 1
addDocument(w, doc)

doc.body="text sat"
doc.id = 2
addDocument(w, doc)

updateIdf(w)
searchResult = find(w, "sat")
if (isarray(searchResult)) then
  for item in searchResult
    print item.id
  next item
endif
	REM SmallBASIC
	REM created: 21/09/2019

	REM based on the article https://burakkanber.com/blog/machine-learning-full-text-search-in-javascript-relevance-scoring/

	const stop_words = ["the", "at", "in", "on"]

	func stemmer(text)
	return text
	end

	func tokenize(text)
	local words, w, out

	split(squeeze(lower(text)), " ", words())
	for w in words
	if w in stop_words == 0 then
	out << stemmer(w)
	end if
	next w
	return out
	end

	func map_keys(m)
	local i, out
	for i in m
	out << i
	next i
	return out
	end

	sub updateIdf(byref w)
	local keys = map_keys(w.terms)
	local i, term, num, denom
	for i = 0 to len(keys) - 1
	term = keys[i]
	num = w.totalDocuments - w.terms[term].n + 0.5
	denom = w.terms[term].n + 0.5
	w.terms[term].idf = max(log10(num / denom), 0.01)
	next i
	end

	sub addDocument(byref w, byref doc)
	local docObj, i, tokens, terms, keys

	if doc.id == 0 then throw "ID is a required property of documents."
	if doc.body == 0 then throw "Body is a required property of documents."

	#Raw tokenized list of words
	let tokens = tokenize(doc.body)
	# Will hold unique terms and their counts and frequencies
	terms = {}

	#docObj will eventually be added to the documents database docObj = {id: doc.id, tokens: tokens, body: doc.body}

	# Count number of terms
	docObj.termCount = len(tokens)

	# Increment totalDocuments
	w.totalDocuments++

	# Readjust averageDocumentLength
	w.totalDocumentTermLength += docObj.termCount
	w.averageDocumentLength = this.totalDocumentTermLength / w.totalDocuments

	# Calculate term frequency
	# First get terms count
	for i = 0 to len(tokens) - 1
	term = tokens[i]
	if (! term in terms) then
	terms[term] = {
	count: 0,
	freq: 0
	}
	endif
	terms[term].count++
	next i

	# Then re-loop to calculate term frequency.
	# We'll also update inverse document frequencies here.
	keys = map_keys(terms)
	for i = 0 to len(keys) - 1
	term = keys[i]
	# Term Frequency for this document.
	terms[term].freq = terms[term].count / docObj.termCount
	#Inverse Document Frequency initialization
	if (! term in w.terms[term]) then
	w.terms[term] = {
	n: 0,
	idf: 0
	}
	end if
	w.terms[term].n++
	next i

	# Calculate inverse document frequencies
	# This is SLOWish so if you want to index a big batch of documents,
	# comment this out and run it once at the end of your addDocuments run
	# If you're only indexing a document or two at a time you can leave this in

	# Add docObj to docs db
	docObj.terms = terms
	docObj.id = doc.id
	w.documents[doc.id] = docObj
	w.nDocs++
	end

	func find(w, query)
	local j, id, i, queryTerm
	local queryTerms = tokenize(query)
	local results = []
	# Look at each document in turn.
	# There are better ways to do this with inverted indices.
	local keys = map_keys(w.documents)

	for j = 0 to min(len(keys), w.nDocs) - 1
	id = keys[j]
	# The relevance score for a document is the sum of a tf-idf-like
	# calculation for each query term.
	w.documents[id]._score = 0
	# Calculate the score for each query term
	#
	for i = 0 to len(queryTerms) - 1
	queryTerm = queryTerms[i]
	# We've never seen this term before so IDF will be 0.
	# Means we can skip the whole term, it adds nothing to the score
	# and isn't in any document.
	#
	if ismap(w.terms[queryTerm]) && ismap(w.documents[id].terms[queryTerm]) then
	# The term is in the document, let's go.
	# The whole term is :
	# IDF * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * docLength / avgDocLength))
	# IDF is pre-calculated for the whole docset.
	idf = w.terms[queryTerm].idf
	# Numerator of the TF portion.
	num = w.documents[id].terms[queryTerm].count * (w.k1 + 1)
	# Denomerator of the TF portion.
	denom = w.documents[id].terms[queryTerm].count &
	+ (w.k1 * (1 - w.b + (w.b * w.documents[id].termCount &
	/ max(1, w.averageDocumentLength))))
	# Add this query term to the score
	w.documents[id]._score += idf * num / denom
	end if
	next i
	if w.documents[id]._score > 0 then
	results << w.documents[id]
	end if
	next j
	rem results.sort(function(a, b) { return b._score - a._score; })
	#return results.slice(0, 10)
	return results
	end

	w = {}

	doc.body="the cat the cat in the hat sat on the mat"
	doc.id = 1
	addDocument(w, doc)

	doc.body="text sat"
	doc.id = 2
	addDocument(w, doc)

	updateIdf(w)
	searchResult = find(w, "sat")
	if (isarray(searchResult)) then
	for item in searchResult
	print item.id
	next item
	endif