Skip to content

Instantly share code, notes, and snippets.

@benob
Created March 25, 2021 19:55
Show Gist options
  • Save benob/d0e74565c87007be65b781b8652884de to your computer and use it in GitHub Desktop.
Save benob/d0e74565c87007be65b781b8652884de to your computer and use it in GitHub Desktop.
simple in-memory search engine with bm25 a.k.a. okapi ranking function
"use strict"
var fs = require('fs');
var stopwords = fs.readFileSync('stopwords.txt').toString().split('\n').reduce((set, line) => {set.add(line.trim()); return set}, new Set());
var index = {
termFrequency: new Map(),
documentLength: [],
averageDocumentLength: 0,
k1: 1.2,
b: 0.75,
}
function tokenize(text) {
var tokens = [];
for(var word of text.trim().toLowerCase().replace(/[^a-z0-9-]/g, ' ').split(/\s+/)) {
if(!stopwords.has(word)) {
tokens.push(word);
}
}
return tokens;
}
function vectorize(text) {
var vector = new Map();
for(var word of tokenize(text)) {
if(!vector.has(word)) vector.set(word, 1);
else vector.set(word, 1 + vector.get(word));
}
return vector;
}
function add_to_index(text, id) {
const termFrequency = index.termFrequency;
var documentLength = 0;
vectorize(text).forEach((count, word) => {
if(!index.termFrequency.has(word)) termFrequency.set(word, []);
termFrequency.get(word).push(id, count);
documentLength += count;
});
return documentLength;
}
function build_index(entries) {
var i = 0;
for(var text of entries) {
const length = add_to_index(text, i);
index.documentLength.push(length);
index.averageDocumentLength += length;
i++;
}
index.averageDocumentLength /= index.documentLength.length;
}
function load_index(filename) {
index = JSON.parse(fs.readFileSync(filename).toString());
index.termFrequency = new Map(index.termFrequency);
}
function save_index(filename) {
const backup = index.termFrequency;
index.termFrequency = Array.from(backup.entries());
fs.writeFileSync(filename, JSON.stringify(index));
index.termFrequency = backup;
}
function inverseDocumentFrequency(word) {
if(index.termFrequency.has(word)) {
const documentFrequency = index.termFrequency.get(word).length / 2;
return Math.log( (index.documentLength.length - documentFrequency + 0.5) / (documentFrequency + 0.5) );
}
return 0;
}
function okapi_bm25(query) {
var found = new Map();
tokenize(query).forEach((word) => {
if(index.termFrequency.has(word)) {
const idf = inverseDocumentFrequency(word);
const entries = index.termFrequency.get(word);
for(var i = 0; i < entries.length; i += 2) {
const doc = entries[i], termFrequency = entries[i + 1];
const score = idf * (termFrequency * (index.k1 + 1)) / termFrequency + index.k1 * (1 - index.b + index.b * index.documentLength[doc] / index.averageDocumentLength);
if(!found.has(doc)) found.set(doc, score);
else found.set(doc, score + found.get(doc));
}
}
});
return found;
}
function search(query) {
query = query || "";
var found = okapi_bm25(query);
var results = Array.from(found.entries());
results.sort((a, b) => b[1] - a[1]);
return results;
};
//var docs = JSON.parse(fs.readFileSync(process.argv[2]).toString());
//build_index(docs.map(doc => doc.title + ' ' + doc.abstract));
//save_index('index.bm25');
//load_index('index.bm25');
//console.log(search('virus'));
module.exports = {
build_index: build_index,
save_index: save_index,
load_index: load_index,
search: search,
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment