Skip to content

Instantly share code, notes, and snippets.

@BriSeven
Last active November 21, 2017 00:18
Show Gist options
  • Save BriSeven/2584447 to your computer and use it in GitHub Desktop.
Save BriSeven/2584447 to your computer and use it in GitHub Desktop.
Ngram utils
define("trigrams", ["underscore"], function(U) {
var trigramIndex = {};
/**
 * Prototype shared by the n-gram count objects that
 * `String.prototype.ngramcount` creates. Both methods read `this.table`,
 * an object mapping each n-gram to the number of times it occurred.
 */
const ngramcount = {
  /**
   * Lists the table as `[ngram, count]` pairs, highest count first.
   * Entries whose stored value is a string are left out.
   *
   * @returns {Array<Array>} Pairs ordered by descending count.
   */
  sort() {
    const table = this.table;
    return U.keys(table)
      .filter((gram) => typeof table[gram] !== 'string')
      .map((gram) => [gram, table[gram]])
      .sort((a, b) => b[1] - a[1]);
  },

  /**
   * Renders the most frequent n-grams as `ngram:count` lines.
   *
   * @param {number} [n] - Maximum number of lines. Omitted, zero or any
   *   other non-positive value means "all entries".
   * @returns {string} Newline-separated lines, most frequent first.
   */
  toString(n) {
    const lines = this.sort().map((pair) => pair.join(':'));
    // Clamp instead of assigning to `.length`: growing the array would pad it
    // with holes (rendered as trailing blank lines) and a negative or
    // fractional limit would throw a RangeError.
    const limit = n > 0 ? Math.min(Math.floor(n), lines.length) : lines.length;
    return lines.slice(0, limit).join('\n');
  },
};
/**
 * Counts the overlapping n-grams of this string.
 *
 * Unless `raw` is truthy the text is first lower-cased, every run of
 * characters outside `a-z0-9` is collapsed to a single `_`, and a `_` is added
 * at both ends so that word boundaries become part of the n-grams.
 *
 * @param {number} n - Size of each n-gram. Anything that is not a positive
 *   number produces an empty result.
 * @param {boolean} [raw] - When truthy, use the string exactly as it is.
 * @returns {Object} An object inheriting from `ngramcount` with `table`
 *   (n-gram -> occurrences), `length` (number of n-gram positions scanned)
 *   and `count` (the `n` that was passed in).
 */
String.prototype.ngramcount = function (n, raw) {
  const text = raw
    ? String(this)
    : `_${this.toLowerCase().replace(/[^a-z0-9]+/g, '_')}_`;
  const size = Math.floor(n);
  // Never report a negative or NaN number of positions when the text is
  // shorter than one n-gram or `n` is unusable.
  const positions = size > 0 ? Math.max(0, text.length - size + 1) : 0;
  const table = {};

  for (let start = 0; start < positions; start += 1) {
    const gram = text.slice(start, start + size);
    // Own-property check so n-grams that happen to be named like
    // Object.prototype members (possible in raw mode) start from zero.
    const seen = Object.prototype.hasOwnProperty.call(table, gram) ? table[gram] : 0;
    table[gram] = seen + 1;
  }

  const result = Object.create(ngramcount);
  result.length = positions;
  result.count = n;
  result.table = table;
  return result;
};
return {
ngramcount,
indexTrigrams (params) {
var i, l, keys, j, m, trigrams,idx;
idx = trigramIndex;
idx.words = idx.words || {};
idx.trigrams = idx.trigrams || {};
U.forEach(params.data, function(o) {
var i, j, words = [o.seriesTitle, o.seriesTitle, o.seriesTitle, o.seriesTitle, o.seriesDescription, o.keywords].join(" ");
words += U.map(o.episodes, function(episode) {
return [episode.title, episode.description].join(" ")
}).join(" ");
words = words.toLowerCase().split(/[^a-z0-9]+/);
words = U.filter(words, function(o) {
return !/^(the|with|and|for|from)$/.test(o) && o.length > 1
});
for (i = 0; i < words.length; i += 1) {
idx.words[words[i]] = idx.words[words[i]] || {};
idx.words[words[i]][o.seriesId] = idx.words[words[i]][o.seriesId] || 0;
idx.words[words[i]][o.seriesId] += 1;
trigrams = words[i].ngramcount(3).sort();
for (j = 0; j < trigrams.length; j += 1) {
idx.trigrams[trigrams[j][0]] = idx.trigrams[trigrams[j][0]] || {};
idx.trigrams[trigrams[j][0]][words[i]] = idx.trigrams[trigrams[j][0]][words[i]] || 0;
idx.trigrams[trigrams[j][0]][words[i]] += 1
}
}
});
return idx;
},
searchByTrigrams (params) {
var words, ti, si, wordresults, wordscore, rankedSeries, ranked;
var seriesPrefix=params.seriesPrefix;
function scoreWords(o) {
var score = {},
trigrams = o.ngramcount(3).sort(),
query = U.pluck(trigrams, 0),
ranked;
function scoreTrigrams(o) {
var wordlist = ti.trigrams[o] || {},
i;
for (i in wordlist) {
if (wordlist.hasOwnProperty(i)) {
score[i] = score[i] || 0;
score[i] += 1
}
}
}
U.forEach(query, scoreTrigrams);
//sort by score.
ranked = U.zip(U.keys(score), U.values(score)).sort(function(a, b) {
return b[1] - a[1]
});
return (ranked[0] || [])[0] || ""
}
if (trigramIndex && trigramIndex.trigrams) {
if (typeof params.search !== "string") {
return false
}
words = params.search.toLowerCase().split(/[^a-z0-9]+/);
words = U.filter(words, function(o) {
return !/^(the|with|and|for|from)$/.test(o) && o.length > 1
});
ti = trigramIndex;
si = params.seriesIdIndex;
wordresults = U.map(words, scoreWords);
wordscore = {};
U.forEach(wordresults, function(o) {
var words = ti.words[o],
i;
for (i in words) {
if (words.hasOwnProperty(i)) {
wordscore[i] = wordscore[i] || 0;
wordscore[i] += words[i]
}
}
});
//sort by score.
ranked = U.zip(U.keys(wordscore), U.values(wordscore)).sort(function(a, b) {
return b[1] - a[1]
});
ranked = U.pluck(ranked, 0);
rankedSeries = U.map(ranked, function(o) {
return si[seriesPrefix + o]
});
if (params.channel) {
rankedSeries = U.filter(rankedSeries, function(o) {
return o && o.channel === params.channel
})
}
return {
results: rankedSeries,
safeSearch: wordresults.join(" ")
}
} else {}
}
}
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment