Skip to content

Instantly share code, notes, and snippets.

@mtourne
Last active August 29, 2015 14:05
Show Gist options
  • Save mtourne/f7e1e05fc64fc9f5cbb3 to your computer and use it in GitHub Desktop.
Save mtourne/f7e1e05fc64fc9f5cbb3 to your computer and use it in GitHub Desktop.
top k in Javascript
function topk_words(content, nbItems) {
var space_saver = new_space_saver(nbItems * 2);
var words = content.toLowerCase().replace(/\W/g, ' ').split(/\s+/);
for (var i = 0; i < words.length; i++) {
insert_space_saver(space_saver, words[i]);
}
return get_top_space_saver(space_saver, nbItems);
}
// create a new k element space saver
function new_space_saver(nbItems) {
var scores = {
// we'll be storing token, score (counter) and card (cardinality)
// for each element
// - scores.token[<token>] can access the struct by token
// - scores.card[<card>] can access the struct by card
token : {},
card : [],
size : nbItems,
overrestimates: {}
};
// initiate all counters to 0
for (var i = 0; i < nbItems; i++) {
scores.card[i] = {
score: 0,
token: null,
card: i
};
}
return scores;
}
// swap two scores based on cardinality in the set
function swap_scores_space_saver(scores, card1, card2) {
var t1 = scores.card[card1];
var t2 = scores.card[card2];
// swap the cardinalities
t1.card = card2;
t2.card = card1;
scores.card[card1] = t2;
scores.card[card2] = t1;
}
// add +1 to a token in the space saver
// Note: adding +n is a separate case
// (would require insertion in sorted list)
// Here, at most we'll do 1 swap
function inc_space_saver(scores, t) {
var new_score = t.score + 1;
t.score = new_score;
var swap_from = t.card;
var swap_to = t.card + 1;
var next_t = scores.card[swap_to];
var next_next_t;
if (next_t && new_score > next_t.score) {
next_next_t = scores.card[swap_to + 1];
// skip equal elements to minimize number of swaps
while (next_next_t && next_t.score == next_next_t.score) {
swap_to = swap_to + 1;
next_next_t = scores.card[swap_to + 1];
}
swap_scores_space_saver(scores, swap_from, swap_to);
return swap_to;
}
// no swap
return swap_from;
}
function insert_space_saver(scores, token) {
var t = scores.token[token];
if (t) {
// token is already present, inc by 1
// console.log(t);
inc_space_saver(scores, t);
} else {
// get t1, the cardinality-1 score (lowest score)
var t1 = scores.card[0];
if (t1.token) {
// remove reference to old token
scores.token[t1.token] = null;
scores.overrestimates[t1.token] = null;
}
// set new token
t1.token = token;
scores.token[token] = t1;
// overrestimate is old score
scores.overrestimates[token] = t1.score;
// inc t1's score by 1
inc_space_saver(scores, t1);
}
}
// TODO: only return the tokens above a certain
// confidence factor (low overrestimates)
function get_top_space_saver(scores, max) {
var card = scores.card
var output = []
for (var i = card.length - 1; i >= 0 && max > 0; i--, max--) {
output.push(card[i].token);
}
return output;
}
//// TESTING ////
var PRIMES = [
7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59,
61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131,
137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197,
199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271,
277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353,
359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433,
439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509,
521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599, 601,
607, 613, 617, 619, 631, 641, 643, 647, 653, 659, 661, 673, 677,
683, 691, 701, 709, 719, 727, 733, 739, 743, 751, 757, 761, 769,
773, 787, 797, 809, 811, 821, 823, 827, 829, 839, 853, 857, 859,
863, 877, 881, 883, 887, 907, 911, 919, 929, 937, 941, 947, 953,
967, 971, 977, 983, 991, 997 ]
function get_prime(num) {
var i = PRIMES.length;
var prime;
while (i >= 0) {
prime = PRIMES[i]
if (num % prime == 0) {
return prime
}
i = i - 1
}
return num
}
// testing
var space_saver = new_space_saver(PRIMES.length * 2);
var ITERATION = 1000000;
for (var i = 0; i < ITERATION; i++) {
var num = Math.floor((Math.random() * 100000000) + 1);
num = get_prime(num);
// console.log(num.toString());
insert_space_saver(space_saver, num.toString());
}
var topk = get_top_space_saver(space_saver, PRIMES.length);
var i;
for (i = 0; i < PRIMES.length; i++) {
if (topk[i] != PRIMES[i].toString()) {
break;
}
}
console.log("Primes are good until: " + i + "; Max :" + (PRIMES.length - 1));
var topk_words = topk_words("blah titi blah toto blah titi titi toto foo", 3);
console.log(topk_words);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment