@cjohansen
Created October 9, 2023 20:45
A tiny little search engine in JavaScript
// Groups the elements of xs by the key returned by f (or by the element itself).
function groupBy(xs, f) {
  var res = {};
  (xs || []).forEach(x => {
    var k = f ? f(x) : x;
    if (!res[k]) {
      res[k] = [];
    }
    res[k].push(x);
  });
  return res;
}

// Counts how many elements of xs map to each key.
function countBy(xs, f) {
  var res = groupBy(xs, f);
  return Object.keys(res).reduce((counts, k) => {
    counts[k] = res[k].length;
    return counts;
  }, {});
}

// Maps f over xs and concatenates the resulting arrays into one.
function flatMap(xs, f) {
  return xs.map(f).reduce((r, ys) => r.concat(ys), []);
}

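// For example:
//
//   groupBy(["a", "bb", "cc"], s => s.length) // => { "1": ["a"], "2": ["bb", "cc"] }
//   countBy(["a", "bb", "cc"], s => s.length) // => { "1": 1, "2": 2 }
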
// Lowercases a string, splits it on spaces and strips characters outside
// a-z, æ, ø, å (the g flag removes every run of non-letters, not just the first).
function tokenize(s) {
  return s
    .toLowerCase()
    .split(/ /)
    .map(t => t.replace(/[^a-zæøå]+/g, ""));
}

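// For example:
//
//   tokenize("De Komiske Føttenes Rolle i Humor")
//   // => ["de", "komiske", "føttenes", "rolle", "i", "humor"]
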
// Produces every substring ("n-gram") of length l from s.
function ngram(s, l) {
  var tokens = [];
  for (var i = 0; i <= s.length - l; i++) {
    tokens.push(s.substring(i, i + l));
  }
  return tokens;
}

// Produces prefixes ("edge n-grams") of s from length minL up to maxL.
function edgeNgram(s, minL, maxL) {
  var tokens = [];
  for (var i = minL; i <= Math.min(maxL, s.length); i++) {
    tokens.push(s.substring(0, i));
  }
  return tokens;
}

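// For example:
//
//   ngram("klovn", 2)         // => ["kl", "lo", "ov", "vn"]
//   edgeNgram("klovn", 2, 15) // => ["kl", "klo", "klov", "klovn"]
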
// Tokenizes s and turns every token into its l-length n-grams.
function tokenizeNgrams(s, l) {
  return tokenize(s)
    .map(token => ngram(token, l))
    .reduce((res, tokens) => res.concat(tokens), []);
}

// Edge n-grams over the whole lowercased string, spaces included, which is
// useful for prefix search / autocomplete.
function tokenizeEdgeNgrams(s, minL, maxL) {
  return edgeNgram(s.toLowerCase(), minL, maxL);
}

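// For example:
//
//   tokenizeNgrams("De Komiske", 2)    // => ["de", "ko", "om", "mi", "is", "sk", "ke"]
//   tokenizeEdgeNgrams("De ko", 2, 15) // => ["de", "de ", "de k", "de ko"]
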
// Adds a document's tokens to an inverted index (token -> list of document ids).
// The id is pushed once per occurrence, so repeated tokens count more than once.
function indexDocument(index, id, tokens) {
  return tokens.reduce((idx, token) => {
    if (!idx[token]) {
      idx[token] = [];
    }
    idx[token].push(id);
    return idx;
  }, index || {});
}

// Builds an inverted index over all documents, using tokenizer to extract
// each document's tokens.
function indexDocuments(data, tokenizer) {
  return data.reduce((idx, doc) => {
    return indexDocument(idx, doc.id, tokenizer(doc));
  }, {});
}

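// With the sample documents defined further down, the title index (index.title,
// built near the bottom of the file) comes out roughly like this:
//
//   {
//     "klovneforskningens": ["a1", "a3"],
//     "lysende": ["a1"],
//     "fremtid": ["a1"],
//     "de": ["a2"],
//     ...
//     "humor": ["a2"]
//   }
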
// Looks up a single token and scores each matching document by how many
// times the token occurs in it.
function lookupToken(index, token) {
  const hitsById = countBy(index[token]);
  return Object.keys(hitsById)
    .map(id => ({
      id: id,
      token: token,
      score: hitsById[id]
    }));
}

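// For example, against the title index built below:
//
//   lookupToken(index.title, "klovneforskningens")
//   // => [{ id: "a1", token: "klovneforskningens", score: 1 },
//   //     { id: "a3", token: "klovneforskningens", score: 1 }]
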
// A document is relevant if it was matched by at least n query tokens.
function isRelevantResult(results, n) {
  return results.length >= n;
}

// Sums the per-token scores for a document and applies the query's boost.
function getScoredResult(id, results, boost) {
  return {
    id: id,
    score: boost * results.reduce((score, r) => score + r.score, 0)
  };
}

// Searches a single index: tokenizes the query, looks up every token, keeps
// documents matched by at least a requiredMatches fraction of the query
// tokens (or at least one token when requiredMatches is not set), and returns
// boosted scores sorted by relevance.
function search({index, requiredMatches, tokenizer, boost}, q) {
  var tokens = tokenizer(q);
  var hits = flatMap(tokens, t => lookupToken(index, t));
  var results = groupBy(hits, r => r.id);
  var n = Math.floor(tokens.length * requiredMatches || 1);
  return Object.keys(results)
    .filter(r => isRelevantResult(results[r], n))
    .map(id => getScoredResult(id, results[id], boost || 1))
    .toSorted((a, b) => b.score - a.score);
}

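// For example, a single-word search against the title index built below:
//
//   search({index: index.title, tokenizer: tokenize, boost: 20}, "humor")
//   // => [{ id: "a2", score: 20 }]
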
// Runs the query against several index configurations and merges the results,
// summing each document's scores across configurations and sorting by total.
function searchAll({queries}, q) {
  var results = groupBy(
    flatMap(queries, config => search(config, q)),
    r => r.id
  );
  return Object.keys(results)
    .map(id => getScoredResult(id, results[id], 1))
    .toSorted((a, b) => b.score - a.score);
}

var data = [
  {
    id: "a1",
    title: "Klovneforskningens Lysende Fremtid",
    description: "Om klovneforsknings spennende innsikt i fremtidens humor."
  },
  {
    id: "a2",
    title: "De Komiske Føttenes Rolle i Humor",
    description: "Hvordan klovneføtter påvirker vår opplevelse av humor."
  },
  {
    id: "a3",
    title: "Klovneforskningens Overraskende Oppdagelser",
    description: "Oppdag de uventede funnene som klovneforskere har gjort."
  }
];

// Four indexes over the same documents: exact title and description tokens,
// title 2-grams (fuzzy matching) and title edge n-grams (prefix search).
var index = {
  title: indexDocuments(data, a => tokenize(a.title)),
  description: indexDocuments(data, a => tokenize(a.description)),
  titleNgram: indexDocuments(data, a => tokenizeNgrams(a.title, 2)),
  titleEdgeGrams: indexDocuments(data, a => tokenizeEdgeNgrams(a.title, 2, 15))
};

// Pretty-prints a value as JSON.
function prn(x) {
  console.log(JSON.stringify(x, null, 2));
}

// Decorates a search result with the matching document's title.
function withTitle(r) {
  r.title = data.find(a => a.id === r.id).title;
  return r;
}

// Combined search: fuzzy title matching via 2-grams, plus boosted exact
// matches in title and description.
prn(
  searchAll({
    queries: [
      {
        index: index.titleNgram,
        requiredMatches: 0.8,
        tokenizer: s => tokenizeNgrams(s, 2)
      },
      {
        index: index.title,
        tokenizer: tokenize,
        boost: 20
      },
      {
        index: index.description,
        tokenizer: tokenize,
        boost: 5
      }
    ]
  }, "forske humor")
  .map(withTitle)
);

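// With the sample data above this should print something like (scores worked
// out by hand from the definitions above, so treat the exact numbers as approximate):
//
//   [
//     { "id": "a2", "score": 32, "title": "De Komiske Føttenes Rolle i Humor" },
//     { "id": "a1", "score": 5, "title": "Klovneforskningens Lysende Fremtid" }
//   ]
//
// a2 collects points from the 2-gram match, the boosted exact title match on
// "humor" and the description match; a1 only matches "humor" in its description.
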
// Prefix search ("autocomplete") against the title edge n-grams.
prn(
  searchAll({
    queries: [
      {
        index: index.titleEdgeGrams,
        tokenizer: s => tokenizeEdgeNgrams(s, 2, 15)
      }
    ]
  }, "de ko")
  .map(withTitle)
);

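// This should print something like:
//
//   [
//     { "id": "a2", "score": 4, "title": "De Komiske Føttenes Rolle i Humor" }
//   ]
//
// Each of the four prefixes of "de ko" matches the start of a2's title once.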