Created
October 9, 2023 20:45
-
-
Save cjohansen/c4ff8f7f997f654f2af396c55a7e9fde to your computer and use it in GitHub Desktop.
En bitteliten søkemotor i JavaScript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Groups the elements of `xs` into an object keyed by `f(element)`.
 * When `f` is omitted, the element itself is used as the key.
 * A null/undefined `xs` yields an empty object.
 */
function groupBy(xs, f) {
  const grouped = {};
  for (const item of xs || []) {
    const key = f ? f(item) : item;
    if (!grouped[key]) {
      grouped[key] = [];
    }
    grouped[key].push(item);
  }
  return grouped;
}
/**
 * Counts how many elements of `xs` fall into each group produced by `f`
 * (or by the element itself when `f` is omitted).
 * Returns an object mapping group key -> count.
 */
function countBy(xs, f) {
  const groups = groupBy(xs, f);
  const counts = {};
  for (const key of Object.keys(groups)) {
    counts[key] = groups[key].length;
  }
  return counts;
}
/**
 * Maps `f` over `xs` and flattens the result one level.
 *
 * Fixes: the original reducer shadowed the `xs` parameter with its own
 * `xs` argument, which is confusing; the hand-rolled map+concat-reduce
 * is replaced by the native `Array#flatMap`, which has identical
 * one-level-flattening semantics.
 */
function flatMap(xs, f) {
  return xs.flatMap(f);
}
/**
 * Lower-cases `s`, splits on spaces and strips every character that is
 * not a Norwegian lower-case letter (a-z, æ, ø, å).
 *
 * Fixes two defects in the original:
 *  - the replace regex lacked the `g` flag, so only the FIRST run of
 *    non-letter characters in a token was removed ("hå-hå!" -> "håhå!"),
 *  - consecutive spaces produced empty-string tokens that ended up as
 *    "" keys in the search index; those are now filtered out.
 */
function tokenize(s) {
  return s
    .toLowerCase()
    .split(/ /)
    .map((t) => t.replace(/[^a-zæøå]+/g, ""))
    .filter((t) => t.length > 0);
}
/**
 * Returns every contiguous substring of `s` with length `l`
 * (sliding-window n-grams). Strings shorter than `l` yield [].
 */
function ngram(s, l) {
  const count = Math.max(0, s.length - l + 1);
  return Array.from({ length: count }, (_, start) => s.substring(start, start + l));
}
/**
 * Returns the prefixes of `s` with lengths from `minL` up to `maxL`
 * (capped at the string's own length) — "edge n-grams", useful for
 * prefix/autocomplete matching.
 */
function edgeNgram(s, minL, maxL) {
  const limit = Math.min(maxL, s.length);
  const prefixes = [];
  for (let len = minL; len <= limit; len++) {
    prefixes.push(s.substring(0, len));
  }
  return prefixes;
}
/**
 * Tokenizes `s` and expands every token into its length-`l` n-grams,
 * returning one flat array of grams.
 */
function tokenizeNgrams(s, l) {
  return flatMap(tokenize(s), (token) => ngram(token, l));
}
/**
 * Lower-cases the whole string (spaces included) and produces its
 * edge n-grams between `minL` and `maxL` characters — i.e. the input
 * is treated as one prefix-matchable phrase, not split into tokens.
 */
function tokenizeEdgeNgrams(s, minL, maxL) {
  const phrase = s.toLowerCase();
  return edgeNgram(phrase, minL, maxL);
}
/**
 * Adds one document to an inverted index: for each token, appends `id`
 * to that token's posting list (duplicate tokens append `id` again,
 * which is how term frequency is recorded). Mutates and returns the
 * given `index`; a null/undefined `index` starts a fresh one.
 */
function indexDocument(index, id, tokens) {
  const idx = index || {};
  for (const token of tokens) {
    if (!idx[token]) {
      idx[token] = [];
    }
    idx[token].push(id);
  }
  return idx;
}
/**
 * Builds an inverted index over all documents in `data`, tokenizing
 * each document with `tokenizer` and folding it into one shared index.
 * Each document is expected to carry an `id` property.
 */
function indexDocuments(data, tokenizer) {
  let idx = {};
  for (const doc of data) {
    idx = indexDocument(idx, doc.id, tokenizer(doc));
  }
  return idx;
}
/**
 * Looks a single token up in the inverted index and returns one hit
 * per matching document: {id, token, score}, where score is the number
 * of times the token occurs in that document's posting list.
 * Unknown tokens yield an empty array.
 */
function lookupToken(index, token) {
  const hitsById = countBy(index[token]);
  return Object.entries(hitsById).map(([id, count]) => ({
    id,
    token,
    score: count
  }));
}
/**
 * A document is considered relevant when it matched at least `n`
 * of the query's tokens (`results` holds one entry per token hit).
 */
function isRelevantResult(results, n) {
  return n <= results.length;
}
/**
 * Collapses a document's token hits into one scored result:
 * the sum of the individual hit scores, multiplied by `boost`.
 *
 * Fixes an idiom issue in the original: the reducer reassigned its
 * accumulator parameter (`score += r.score`); it now returns a pure
 * `sum + r.score` instead. Behavior is unchanged, including a score
 * of 0 for an empty `results` array.
 */
function getScoredResult(id, results, boost) {
  const totalScore = results.reduce((sum, r) => sum + r.score, 0);
  return {
    id,
    score: boost * totalScore
  };
}
/**
 * Runs query `q` against a single index configuration.
 *
 * - tokenizes the query, looks every token up in `index`,
 * - groups the hits per document id,
 * - keeps documents matching at least `tokens.length * requiredMatches`
 *   tokens (defaulting to 1 when requiredMatches is not set — the
 *   NaN from the multiplication falls through `|| 1`),
 * - scores each survivor (scaled by `boost`, default 1) and returns
 *   them sorted by descending score.
 */
function search({index, requiredMatches, tokenizer, boost}, q) {
  const tokens = tokenizer(q);
  const hits = flatMap(tokens, (token) => lookupToken(index, token));
  const hitsById = groupBy(hits, (hit) => hit.id);
  const threshold = Math.floor(tokens.length * requiredMatches || 1);
  const scored = Object.keys(hitsById)
    .filter((id) => isRelevantResult(hitsById[id], threshold))
    .map((id) => getScoredResult(id, hitsById[id], boost || 1));
  return scored.toSorted((a, b) => b.score - a.score);
}
/**
 * Runs query `q` against every configuration in `queries` and merges
 * the per-index results: hits for the same document id are summed
 * into a single combined score (no extra boost at this level).
 * Note: the merged results are not re-sorted here.
 */
function searchAll({queries}, q) {
  const allHits = flatMap(queries, (config) => search(config, q));
  const hitsById = groupBy(allHits, (hit) => hit.id);
  return Object.keys(hitsById).map((id) => getScoredResult(id, hitsById[id], 1));
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var data = [ | |
{ | |
id: "a1", | |
title: "Klovneforskningens Lysende Fremtid", | |
description: "Om klovneforsknings spennende innsikt i fremtidens humor." | |
}, | |
{ | |
id: "a2", | |
title: "De Komiske Føttenes Rolle i Humor", | |
description: "Hvordan klovneføtter påvirker vår opplevelse av humor." | |
}, | |
{ | |
id: "a3", | |
title: "Klovneforskningens Overraskende Oppdagelser", | |
description: "Oppdag de uventede funnene som klovneforskere har gjort." | |
} | |
]; | |
var index = { | |
title: indexDocuments(data, a => tokenize(a.title)), | |
description: indexDocuments(data, a => tokenize(a.description)), | |
titleNgram: indexDocuments(data, a => tokenizeNgrams(a.title, 2)), | |
titleEdgeGrams: indexDocuments(data, a => tokenizeEdgeNgrams(a.title, 2, 15)) | |
}; | |
/** Pretty-prints any value as 2-space-indented JSON to the console. */
function prn(x) {
  const json = JSON.stringify(x, null, 2);
  console.log(json);
}
/**
 * Decorates a search result in place with the title of the matching
 * document from `data` (throws if the id is unknown, as before).
 */
function withTitle(r) {
  const doc = data.find((a) => a.id === r.id);
  r.title = doc.title;
  return r;
}
// Demo 1: combined fuzzy + exact search for "forske humor".
// Title 2-grams catch partial matches (requiring 80% of the query's
// grams), while exact title (boost 20) and description (boost 5)
// matches dominate the score.
prn(
  searchAll(
    {
      queries: [
        {
          index: index.titleNgram,
          requiredMatches: 0.8,
          tokenizer: (s) => tokenizeNgrams(s, 2)
        },
        {
          index: index.title,
          tokenizer: tokenize,
          boost: 20
        },
        {
          index: index.description,
          tokenizer: tokenize,
          boost: 5
        }
      ]
    },
    "forske humor"
  ).map(withTitle)
);
// Demo 2: prefix search — "de ko" matches titles via edge n-grams,
// e.g. "De Komiske Føttenes Rolle i Humor".
prn(
  searchAll(
    {
      queries: [
        {
          index: index.titleEdgeGrams,
          tokenizer: (s) => tokenizeEdgeNgrams(s, 2, 15)
        }
      ]
    },
    "de ko"
  ).map(withTitle)
);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment