Created
February 9, 2016 10:30
-
-
Save msbarry/8d61a8faf830afef3eee to your computer and use it in GitHub Desktop.
Building/searching a trie in js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var txt = require("fs").readFileSync("app/names.txt", "utf8"); | |
var nicknames = require('./nicknames'); | |
var words = txt.split("\n"); | |
var trie = build(words); | |
optimize(trie); | |
require('fs').writeFileSync('app/trie.js', 'trie = ' + toString(trie)); | |
function tokenize(str) { | |
return str | |
.toLowerCase() | |
.replace(/[^ a-zA-Z]+/g, '') | |
.split(' ') | |
.filter(function (d) { return d.length > 1; }); | |
} | |
function getWordsFrom(name) { | |
var tokens = tokenize(name); | |
var result = {}; | |
tokens.forEach(function (token, i) { | |
if (i > 0) { | |
nicknames(token).forEach(function (d) { | |
result[d] = true; | |
}); | |
} | |
result[token] = true; | |
}); | |
return Object.keys(result); | |
} | |
function build(words) { | |
var trie = {}; | |
// Build a simple Trie structure | |
for ( var i = 0, l = words.length; i < l; i++ ) { | |
var word = words[i], tokens = getWordsFrom(word); | |
for (var k = 0; k < tokens.length; k++) { | |
var letters = tokens[k].split(""), cur = trie; | |
for ( var j = 0; j < letters.length; j++ ) { | |
var letter = letters[j], pos = cur[ letter ]; | |
if ( pos == null ) { | |
cur = cur[ letter ] = {}; | |
} else { | |
cur = cur[ letter ]; | |
} | |
if (j === letters.length - 1) { | |
cur['$'] = cur['$'] || []; | |
cur['$'].push(i); | |
} | |
} | |
} | |
} | |
return trie; | |
} | |
function toString(trie) { | |
var ret = JSON.stringify( trie ).replace(/"/g, ""); | |
var reserved = [ "abstract", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "continue", "debugger", "default", "delete", "do", "double", "else", "enum", "export", "extends", "false", "final", "finally", "float", "for", "function", "goto", "if", "implements", "import", "in", "instanceof", "int", "interface", "long", "native", "new", "null", "package", "private", "protected", "public", "return", "short", "static", "super", "switch", "synchronized", "this", "throw", "throws", "transient", "true", "try", "typeof", "var", "void", "volatile", "while", "with" ]; | |
for ( var i = 0; i < reserved.length; i++ ) { | |
ret = ret.replace( new RegExp("([{,])(" + reserved[i] + "):", "g"), "$1'$2':" ); | |
} | |
return ret; | |
} | |
function optimize( cur ) { | |
var num = 0, last; | |
for ( var node in cur ) { | |
if ( typeof cur[ node ] === "object" && node !== '$') { | |
var ret = optimize( cur[ node ] ); | |
if ( ret ) { | |
delete cur[ node ]; | |
cur[ node + ret.name ] = ret.value; | |
node = node + ret.name; | |
} | |
} | |
last = node; | |
num++; | |
} | |
if ( num === 1 ) { | |
return { name: last.replace(/\$$/, ''), value: cur[ last ] }; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var now = new Date().getTime(); | |
importScripts('bower_components/underscore/underscore.js'); | |
function tokenize(str) { | |
return str | |
.toLowerCase() | |
.replace(/[^ a-zA-Z]+/g, '') | |
.split(' ') | |
.filter(function (d) { return d.length > 1; }); | |
} | |
importScripts('trie.js'); | |
console.log('Loaded in ' + (new Date().getTime() - now) + 'ms'); | |
self.postMessage({ type: 'ready' }); | |
self.onmessage = function (event) { | |
var query = event.data.query; | |
var results = query.trim().length === 0 ? [] : search(query) | |
.map(function (d) { return parseInt(d); }); | |
self.postMessage({ type: 'results', start: event.data.start, data: results }); | |
}; | |
function tokenize(str) { | |
return str.toLowerCase().replace(/[^ a-zA-Z]+/g, '').split(/\s+/g).filter(function (d) { return d.length > 1; }); | |
} | |
function traverse( cur, other, newOne ) { | |
newOne = newOne || {}; | |
if (Array.isArray(cur)) { | |
for (var i = 0, len = cur.length; i < len; i++) { | |
var val = cur[i]; | |
if (!other || other[val]) { newOne[val] = true; } | |
} | |
} else { | |
for (var node in cur) { | |
var val = cur[node]; | |
var resp = traverse(val, other); | |
for (var n in resp) { | |
if (!other || other[n]) { newOne[n] = true; } | |
} | |
} | |
} | |
return newOne; | |
} | |
function findWord( word, cur, other ) { | |
// Get the root to start from | |
cur = cur || trie; | |
// Go through every leaf | |
for ( var node in cur ) { | |
var val = cur[node]; | |
// If the start of the word matches the leaf | |
if ( word.indexOf( node ) === 0 || node.indexOf(word) === 0) { | |
// If this leaf finishes the word | |
if ( node.length >= word.length ) { | |
return traverse(val, other); | |
} else { | |
return findWord( word.slice( node.length ), val, other ); | |
} | |
} | |
} | |
return {}; | |
} | |
function search(str) { | |
var tokens = tokenize(str); | |
var other; | |
_.sortBy(tokens, 'length').reverse().forEach(function (d) { | |
var result = findWord(d, trie, other); | |
other = result; | |
}); | |
return Object.keys(other || {}); | |
} | |
// search('First Last'); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require('./app/trie'); | |
var u = require('underscore'); | |
function tokenize(str) { | |
return str.toLowerCase().replace(/[^ a-zA-Z]+/g, '').split(/\s+/g).filter(function (d) { return d.length > 1; }); | |
} | |
function traverse( cur ) { | |
var result = []; | |
if (Array.isArray(cur)) { | |
result = result.concat(cur); | |
} else { | |
for (var node in cur) { | |
var val = cur[node]; | |
result = result.concat(traverse(val)); | |
} | |
} | |
return result; | |
} | |
function findWord( word, cur ) { | |
// Get the root to start from | |
cur = cur || trie; | |
// Go through every leaf | |
for ( var node in cur ) { | |
var val = cur[node]; | |
// If the start of the word matches the leaf | |
if ( word.indexOf( node ) === 0 || node.indexOf(word) === 0) { | |
// If this leaf finishes the word | |
if ( node.length >= word.length ) { | |
return traverse(val); | |
} else { | |
return findWord( word.slice( node.length ), val ); | |
} | |
} | |
} | |
return []; | |
} | |
function search(str) { | |
var tokens = tokenize(str); | |
var matches = tokens.map(function (d) { return findWord(d); }); | |
return u.intersection.apply(u, matches); | |
} | |
console.log(search('First Last')); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var worker = new Worker('search_worker.js'); | |
var next = null; | |
var outstanding = false; | |
names = []; | |
worker.onmessage = function (evt) { | |
if (evt.data.type === 'ready') { | |
// allow searching | |
} else { | |
console.log(evt.data.data.length + ' records'); | |
console.log(evt.data.data.slice(0, 10).map(function (d) { return names[d]; })); | |
outstanding = false; | |
if (next) { | |
timeSearch(next); | |
} | |
} | |
} | |
d3.text('names.txt', function (d) { | |
names = d.split('\n'); | |
}); | |
function timeSearch(str) { | |
if (!outstanding) { | |
worker.postMessage(str); | |
next = null; | |
} else { | |
next = str; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment