Skip to content

Instantly share code, notes, and snippets.

@msbarry
Created February 9, 2016 10:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save msbarry/8d61a8faf830afef3eee to your computer and use it in GitHub Desktop.
Save msbarry/8d61a8faf830afef3eee to your computer and use it in GitHub Desktop.
Building/searching a trie in js
var txt = require("fs").readFileSync("app/names.txt", "utf8");
var nicknames = require('./nicknames');
var words = txt.split("\n");
var trie = build(words);
optimize(trie);
require('fs').writeFileSync('app/trie.js', 'trie = ' + toString(trie));
function tokenize(str) {
return str
.toLowerCase()
.replace(/[^ a-zA-Z]+/g, '')
.split(' ')
.filter(function (d) { return d.length > 1; });
}
function getWordsFrom(name) {
var tokens = tokenize(name);
var result = {};
tokens.forEach(function (token, i) {
if (i > 0) {
nicknames(token).forEach(function (d) {
result[d] = true;
});
}
result[token] = true;
});
return Object.keys(result);
}
function build(words) {
var trie = {};
// Build a simple Trie structure
for ( var i = 0, l = words.length; i < l; i++ ) {
var word = words[i], tokens = getWordsFrom(word);
for (var k = 0; k < tokens.length; k++) {
var letters = tokens[k].split(""), cur = trie;
for ( var j = 0; j < letters.length; j++ ) {
var letter = letters[j], pos = cur[ letter ];
if ( pos == null ) {
cur = cur[ letter ] = {};
} else {
cur = cur[ letter ];
}
if (j === letters.length - 1) {
cur['$'] = cur['$'] || [];
cur['$'].push(i);
}
}
}
}
return trie;
}
function toString(trie) {
var ret = JSON.stringify( trie ).replace(/"/g, "");
var reserved = [ "abstract", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "continue", "debugger", "default", "delete", "do", "double", "else", "enum", "export", "extends", "false", "final", "finally", "float", "for", "function", "goto", "if", "implements", "import", "in", "instanceof", "int", "interface", "long", "native", "new", "null", "package", "private", "protected", "public", "return", "short", "static", "super", "switch", "synchronized", "this", "throw", "throws", "transient", "true", "try", "typeof", "var", "void", "volatile", "while", "with" ];
for ( var i = 0; i < reserved.length; i++ ) {
ret = ret.replace( new RegExp("([{,])(" + reserved[i] + "):", "g"), "$1'$2':" );
}
return ret;
}
function optimize( cur ) {
var num = 0, last;
for ( var node in cur ) {
if ( typeof cur[ node ] === "object" && node !== '$') {
var ret = optimize( cur[ node ] );
if ( ret ) {
delete cur[ node ];
cur[ node + ret.name ] = ret.value;
node = node + ret.name;
}
}
last = node;
num++;
}
if ( num === 1 ) {
return { name: last.replace(/\$$/, ''), value: cur[ last ] };
}
}
var now = new Date().getTime();
importScripts('bower_components/underscore/underscore.js');
function tokenize(str) {
return str
.toLowerCase()
.replace(/[^ a-zA-Z]+/g, '')
.split(' ')
.filter(function (d) { return d.length > 1; });
}
importScripts('trie.js');
console.log('Loaded in ' + (new Date().getTime() - now) + 'ms');
self.postMessage({ type: 'ready' });
self.onmessage = function (event) {
var query = event.data.query;
var results = query.trim().length === 0 ? [] : search(query)
.map(function (d) { return parseInt(d); });
self.postMessage({ type: 'results', start: event.data.start, data: results });
};
function tokenize(str) {
return str.toLowerCase().replace(/[^ a-zA-Z]+/g, '').split(/\s+/g).filter(function (d) { return d.length > 1; });
}
function traverse( cur, other, newOne ) {
newOne = newOne || {};
if (Array.isArray(cur)) {
for (var i = 0, len = cur.length; i < len; i++) {
var val = cur[i];
if (!other || other[val]) { newOne[val] = true; }
}
} else {
for (var node in cur) {
var val = cur[node];
var resp = traverse(val, other);
for (var n in resp) {
if (!other || other[n]) { newOne[n] = true; }
}
}
}
return newOne;
}
function findWord( word, cur, other ) {
// Get the root to start from
cur = cur || trie;
// Go through every leaf
for ( var node in cur ) {
var val = cur[node];
// If the start of the word matches the leaf
if ( word.indexOf( node ) === 0 || node.indexOf(word) === 0) {
// If this leaf finishes the word
if ( node.length >= word.length ) {
return traverse(val, other);
} else {
return findWord( word.slice( node.length ), val, other );
}
}
}
return {};
}
function search(str) {
var tokens = tokenize(str);
var other;
_.sortBy(tokens, 'length').reverse().forEach(function (d) {
var result = findWord(d, trie, other);
other = result;
});
return Object.keys(other || {});
}
// search('First Last');
require('./app/trie');
var u = require('underscore');
function tokenize(str) {
return str.toLowerCase().replace(/[^ a-zA-Z]+/g, '').split(/\s+/g).filter(function (d) { return d.length > 1; });
}
function traverse( cur ) {
var result = [];
if (Array.isArray(cur)) {
result = result.concat(cur);
} else {
for (var node in cur) {
var val = cur[node];
result = result.concat(traverse(val));
}
}
return result;
}
function findWord( word, cur ) {
// Get the root to start from
cur = cur || trie;
// Go through every leaf
for ( var node in cur ) {
var val = cur[node];
// If the start of the word matches the leaf
if ( word.indexOf( node ) === 0 || node.indexOf(word) === 0) {
// If this leaf finishes the word
if ( node.length >= word.length ) {
return traverse(val);
} else {
return findWord( word.slice( node.length ), val );
}
}
}
return [];
}
function search(str) {
var tokens = tokenize(str);
var matches = tokens.map(function (d) { return findWord(d); });
return u.intersection.apply(u, matches);
}
console.log(search('First Last'));
var worker = new Worker('search_worker.js');
var next = null;
var outstanding = false;
names = [];
worker.onmessage = function (evt) {
if (evt.data.type === 'ready') {
// allow searching
} else {
console.log(evt.data.data.length + ' records');
console.log(evt.data.data.slice(0, 10).map(function (d) { return names[d]; }));
outstanding = false;
if (next) {
timeSearch(next);
}
}
}
d3.text('names.txt', function (d) {
names = d.split('\n');
});
function timeSearch(str) {
if (!outstanding) {
worker.postMessage(str);
next = null;
} else {
next = str;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment