Skip to content

Instantly share code, notes, and snippets.

@jenrik
Created September 21, 2015 14:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jenrik/7cb3426e1cedc489fe54 to your computer and use it in GitHub Desktop.
Save jenrik/7cb3426e1cedc489fe54 to your computer and use it in GitHub Desktop.
Danish support in lunr.js

This library contains a Danish word stemmer and stop-word list for lunr.js. To use it, include this file in a script tag placed after the lunr.js script tag.

/**
* Adds Danish support to lunr.js
* Created by jenrik
*
* Stemmer algorithm based on http://snowball.tartarus.org/algorithms/danish/stemmer.html
* Stopword list originally from http://www.ranks.nl/stopwords/danish
*/
/**
 * Namespace holding the Danish stemmer: the lookup tables used by the
 * individual steps, the four step functions and the combined `stemmer`.
 *
 * Tables:
 *   "s" - letters that may precede a removable final "s" (step 1b)
 *   "c" - consonants that are undoubled at the end of a word (step 4)
 *   "1" - step 1 suffixes, "2" - step 2 consonant pairs, "3" - step 3 suffixes
 *
 * The letter tables map each letter to itself; membership is tested with
 * `table[ch] === ch`.
 */
var lunrDanish = {
  "s": {
    "a": "a", "b": "b", "c": "c", "d": "d", "f": "f",
    "g": "g", "h": "h", "j": "j", "k": "k", "l": "l",
    "m": "m", "n": "n", "o": "o", "p": "p", "r": "r",
    "t": "t", "v": "v", "y": "y", "z": "z", "å": "å"
  },
  "c": {
    "b": "b", "c": "c", "d": "d", "f": "f", "g": "g",
    "h": "h", "j": "j", "k": "k", "l": "l", "m": "m",
    "n": "n", "p": "p", "q": "q", "r": "r", "s": "s",
    "t": "t", "v": "v", "w": "w", "x": "x", "z": "z"
  },
  "1": [
    "e", "en", "er", "es", "et",
    "hed", "ene", "ere", "ens", "ers", "ets",
    "eret", "erer", "ered", "erne", "ende", "eren", "heds", "enes", "eres",
    "ethed", "erede", "heder", "heden", "endes", "ernes", "erens", "erets",
    "erende", "hedens",
    "erendes"
  ],
  "2": ["gd", "dt", "gt", "kt"],
  "3": ["ig", "lig", "els", "elig"]
};

(function () {
  var VOWELS = "aeiouyæåø";

  /** True when `ch` is a single Danish vowel. */
  function isVowel(ch) {
    return ch !== "" && VOWELS.indexOf(ch) !== -1;
  }

  /**
   * Index at which the R1 region of `word` starts: just after the first
   * non-vowel that follows a vowel, moved right if necessary so that at least
   * three letters precede it. Returns `word.length` (an empty region) when no
   * such position exists.
   */
  function r1Start(word) {
    var length = word.length;
    var i = 0;
    while (i < length && !isVowel(word.charAt(i))) {
      i += 1;
    }
    while (i < length && isVowel(word.charAt(i))) {
      i += 1;
    }
    if (i >= length) {
      return length;
    }
    return Math.min(length, Math.max(i + 1, 3));
  }

  /** True when `word` ends with `suffix`, regardless of region. */
  function endsWith(word, suffix) {
    return suffix.length <= word.length &&
      word.slice(word.length - suffix.length) === suffix;
  }

  /** True when `word` ends with `suffix` and the suffix starts at or after `r1`. */
  function endsInR1(word, suffix, r1) {
    return word.length - suffix.length >= r1 && endsWith(word, suffix);
  }

  /**
   * Longest entry of `suffixes` that ends `word` inside R1, or null.
   * Does not depend on the ordering of the suffix table.
   */
  function longestSuffix(word, suffixes, r1) {
    var best = null;
    for (var i = 0; i < suffixes.length; i += 1) {
      var candidate = suffixes[i];
      if ((best === null || candidate.length > best.length) &&
          endsInR1(word, candidate, r1)) {
        best = candidate;
      }
    }
    return best;
  }

  /** Step 1: drop the longest standard suffix, else a final "s" after a valid s-ending. */
  function step1(word, r1) {
    var suffix = longestSuffix(word, lunrDanish["1"], r1);
    if (suffix !== null) {
      return word.slice(0, word.length - suffix.length);
    }
    var last = word.length - 1;
    if (last >= r1 && word.charAt(last) === "s") {
      var before = word.charAt(last - 1);
      if (lunrDanish["s"][before] === before) {
        // Only the "s" itself is removed; the preceding letter stays.
        return word.slice(0, last);
      }
    }
    return word;
  }

  /** Step 2: for a final gd/dt/gt/kt inside R1, drop only the last letter. */
  function step2(word, r1) {
    if (longestSuffix(word, lunrDanish["2"], r1) !== null) {
      return word.slice(0, word.length - 1);
    }
    return word;
  }

  /** Step 3: igst -> ig, løst -> løs, then drop ig/lig/elig/els and redo step 2. */
  function step3(word, r1) {
    var stem = word;
    if (endsWith(stem, "igst")) {
      // Remove the final "st"; the remaining "ig" is handled below.
      stem = stem.slice(0, stem.length - 2);
    }
    if (endsInR1(stem, "løst", r1)) {
      return stem.slice(0, stem.length - 1);
    }
    var suffix = longestSuffix(stem, lunrDanish["3"], r1);
    if (suffix !== null) {
      return step2(stem.slice(0, stem.length - suffix.length), r1);
    }
    return stem;
  }

  /** Step 4: undouble a final double consonant whose last letter lies in R1. */
  function step4(word, r1) {
    var last = word.length - 1;
    if (last < 1 || last < r1) {
      return word;
    }
    var ch = word.charAt(last);
    if (ch === word.charAt(last - 1) && lunrDanish["c"][ch] === ch) {
      return word.slice(0, last);
    }
    return word;
  }

  /**
   * Applies step 1 to a single word, computing R1 from that word.
   * @param {string} word - lower-case word
   * @returns {string} the word with a step 1 suffix removed, or unchanged
   */
  lunrDanish.stemmer1 = function stemmer1(word) {
    return step1(word, r1Start(word));
  };

  /**
   * Applies step 2 to a single word, computing R1 from that word.
   * @param {string} word - lower-case word
   * @returns {string} the word without the last letter of a final consonant pair, or unchanged
   */
  lunrDanish.stemmer2 = function stemmer2(word) {
    return step2(word, r1Start(word));
  };

  /**
   * Applies step 3 to a single word, computing R1 from that word.
   * @param {string} word - lower-case word
   * @returns {string} the word with a step 3 suffix handled, or unchanged
   */
  lunrDanish.stemmer3 = function stemmer3(word) {
    return step3(word, r1Start(word));
  };

  /**
   * Applies step 4 to a single word, computing R1 from that word.
   * @param {string} word - lower-case word
   * @returns {string} the word with a final double consonant undoubled, or unchanged
   */
  lunrDanish.stemmer4 = function stemmer4(word) {
    return step4(word, r1Start(word));
  };

  // The step functions were originally published under these misspelled
  // names; keep them as aliases so existing references continue to work.
  lunrDanish.stememr1 = lunrDanish.stemmer1;
  lunrDanish.stememr2 = lunrDanish.stemmer2;
  lunrDanish.stememr3 = lunrDanish.stemmer3;
  lunrDanish.stememr4 = lunrDanish.stemmer4;

  /**
   * Stems one token by running steps 1-4 in order. R1 is computed once from
   * the unstemmed token and reused by every step.
   * @param {string} token - lower-case token
   * @returns {string} the stemmed token
   */
  lunrDanish.stemmer = function danishStemmer(token) {
    var r1 = r1Start(token);
    return step4(step3(step2(step1(token, r1), r1), r1), r1);
  };
})();
// Register the Danish stemmer with lunr's pipeline registry under the label
// "stemmer".
// NOTE(review): lunr's built-in English stemmer presumably uses the same
// label, in which case this registration replaces it - confirm that is intended.
lunr.Pipeline.registerFunction(lunrDanish.stemmer, "stemmer");

// Replace the stop-word table with a Danish list. Every word maps to itself.
// Each key must be a single word: the entries "forrige"/"fra" and
// "man"/"mand" used to be fused into the two-word keys "forrige fra" and
// "man mand", which could never equal a single token.
lunr.stopWordFilter.stopWords = {
  "af": "af",
  "alle": "alle",
  "andet": "andet",
  "andre": "andre",
  "at": "at",
  "begge": "begge",
  "da": "da",
  "de": "de",
  "den": "den",
  "denne": "denne",
  "der": "der",
  "deres": "deres",
  "det": "det",
  "dette": "dette",
  "dig": "dig",
  "din": "din",
  "dog": "dog",
  "du": "du",
  "ej": "ej",
  "eller": "eller",
  "en": "en",
  "end": "end",
  "ene": "ene",
  "eneste": "eneste",
  "enhver": "enhver",
  "et": "et",
  "fem": "fem",
  "fire": "fire",
  "flere": "flere",
  "fleste": "fleste",
  "for": "for",
  "fordi": "fordi",
  "forrige": "forrige",
  "fra": "fra",
  "få": "få",
  "før": "før",
  "god": "god",
  "han": "han",
  "hans": "hans",
  "har": "har",
  "hendes": "hendes",
  "her": "her",
  "hun": "hun",
  "hvad": "hvad",
  "hvem": "hvem",
  "hver": "hver",
  "hvilken": "hvilken",
  "hvis": "hvis",
  "hvor": "hvor",
  "hvordan": "hvordan",
  "hvorfor": "hvorfor",
  "hvornår": "hvornår",
  "i": "i",
  "ikke": "ikke",
  "ind": "ind",
  "ingen": "ingen",
  "intet": "intet",
  "jeg": "jeg",
  "jeres": "jeres",
  "kan": "kan",
  "kom": "kom",
  "kommer": "kommer",
  "lav": "lav",
  "lidt": "lidt",
  "lille": "lille",
  "man": "man",
  "mand": "mand",
  "mange": "mange",
  "med": "med",
  "meget": "meget",
  "men": "men",
  "mens": "mens",
  "mere": "mere",
  "mig": "mig",
  "ned": "ned",
  "ni": "ni",
  "nogen": "nogen",
  "noget": "noget",
  "ny": "ny",
  "nyt": "nyt",
  "nær": "nær",
  "næste": "næste",
  "næsten": "næsten",
  "og": "og",
  "op": "op",
  "otte": "otte",
  "over": "over",
  "på": "på",
  "se": "se",
  "seks": "seks",
  "ses": "ses",
  "som": "som",
  "stor": "stor",
  "store": "store",
  "syv": "syv",
  "ti": "ti",
  "til": "til",
  "to": "to",
  "tre": "tre",
  "ud": "ud",
  "var": "var"
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment