Skip to content

Instantly share code, notes, and snippets.

@timwhitlock
Last active January 6, 2021 19:01
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timwhitlock/2876736 to your computer and use it in GitHub Desktop.
Save timwhitlock/2876736 to your computer and use it in GitHub Desktop.
Searchable JavaScript Dictionary
/**
* Searchable JavaScript dictionary.
*
* Usage:
* var dict = require('path/to/this').create();
* dict.add( 'somedata', 'Text to index' );
* var items = dict.find('text');
*/
exports.create = function(){
return new Dict;
}
/**
* Dictionary constructor
*/
function Dict(){
var tree, data;
// public access to private vars
this.clear = function(){
this.length = 0;
tree = {};
data = [];
};
this.getTree = function(){
return tree;
};
this.getData = function(){
return data;
};
this.clear();
}
var proto = Dict.prototype;
/**
* Maximum depth of search tree
* @var Number
*/
proto.depth = 0;
/**
* Whether result require all words in query match,
* @var Boolean
*/
proto.matchall = true;
/**
* Whether to normalize all words to lower case
* @var Boolean
*/
proto.ignorecase = true;
/**
* Default expression to strip out of words
* @var RegExp
*/
proto.nonword = /[\-.?!;:,_*^+=~`"'(){}<>[\]\/\\\u00a0\u1680\u180e\u2000-\u206f\u2e00-\u2e7f\u3000-\u303f]+/g;
/**
* Configure custom transliteration table
* @param Object
* @param RegExp
*/
proto.translit = function( charTable, charMatch ){
charMatch = charMatch || /[^a-z0-9]/g;
function replacer( chr ){
return charTable[chr] || chr;
}
// add transliterator function to call custom replacer
this.trans = function( word ){
return word.replace( charMatch, replacer );
};
}
/**
* Configure custom stop words to match after normalisation
* @param Object
*/
proto.stoppers = function( wordsTable ){
this.stopped = function( word ){
return Boolean( wordsTable[word] );
};
}
/**
* @param string arbitrary data being looked up
* @param string text index to search for data if different from data
* @return Dict
*/
proto.add = function( data, text ){
// fall back to data as index, if not passing custom text index
if( null == text ){
text = String( data );
}
var w = -1, word, leaf, branch, depth, c, chr,
words = this.normalize( text ),
store = this.getData(),
idx = store.length;
// store data and increment number of records
store.push( data );
this.length ++;
// index individual words
while( ++w < words.length ){
word = words[w];
if( this.stopped(word) ){
continue;
}
// start at root of tree and index word to depth
branch = this.getTree();
depth = Math.min( word.length, this.depth ) || word.length;
for( c = 0; c < depth; c++ ){
chr = word.charAt(c);
branch = branch[chr] || ( branch[chr] = {} );
}
leaf = branch[' '] || ( branch[' '] = [] );
leaf.push( idx );
}
return this;
}
/**
* Lookup data from text
* @param string one or more words to search index
* @return array unique list of found data items
*/
proto.find = function( text, meta ){
var w = -1, word, branch, depth, c, chr, idx,
matches = {},
results = [],
words = this.normalize( text ),
store = this.getData();
// recursive function drills into matched branches
function descend( branch, word ){
var i, match, node;
for( chr in branch ){
node = branch[chr];
if( ' ' === chr ){
// end of branch - have data
for( i in node ){
idx = node[i];
match = matches[idx] || ( matches[idx] = { length: 0, words: {} } );
match.length += match.words[word] ? 0 : 1;
match.words[word] = 1 + ( match.words[word] || 0 );
}
continue;
}
// else descend into all paths
descend( node, word );
}
}
// search on all words passed and count matches
wordloop:
while( ++w < words.length ){
word = words[w];
branch = this.getTree();
depth = Math.min( word.length, this.depth ) || word.length;
// drill into dictionary as far as this partial word will go
for( c = 0; c < depth; c++ ){
chr = word.charAt(c);
if( ! branch[chr] ){
// overshot branch word not found
continue wordloop;
}
branch = branch[chr];
}
// word reaches valid point in tree - collect all data under this branch
descend( branch, word );
}
// return unique data matches
for( idx in matches ){
if( this.matchall && ( matches[idx].length < words.length ) ){
continue;
}
results.push( store[idx] );
}
// allow callee access to full info
if( meta ){
meta.query = text;
meta.words = words;
}
return results;
}
/**
* utility for normalizing a string into matchable words
* @param string raw string
* @return array unique normalized words
*/
proto.normalize = function( str ){
var i = -1,
word,
unique = {},
words = [],
parts = this.split(str);
while( ++i < parts.length ){
word = parts[i];
if( ! word ){
continue;
}
// convert case first so table can be smaller
if( this.ignorecase ){
word = word.toLowerCase();
}
// strip ignore list, assuming everything else is significant
word = this.strip( word );
if( ! word ){
continue;
}
// transliterate via custom mappings
if( this.trans ){
word = this.trans( word );
}
if( unique[word] ){
continue;
}
words.push( word );
unique[word] = true;
}
return words;
}
/**
* default stop word checker, can be overridden with this.stops( table )
*/
proto.stopped = function( word ){
return 1 === word.length;
}
/**
* split a sentence into rough word chunks
*/
proto.split = function( str ){
return str.split(/\s+/);
}
/**
* strip ignorable characters from a word
*/
proto.strip = function( word ){
return word.replace( this.nonword, '' );
}
/**
* Debug function for inspecting search tree and indexed data
*
proto.dump = function(){
var store = this.getData();
function collect( leaf ){
var i = -1, data = [];
while( ++i < leaf.length ){
data.push( store[ leaf[i] ] );
}
return data;
}
function descend( branch, word ){
var chr, node;
for( chr in branch ){
node = branch[chr];
if( ' ' === chr ){
console.log( word+': [ '+collect(node).join(', ')+' ]' );
}
else {
//word && console.log( word );
descend( node, word+chr )
}
}
}
descend( this.getTree(), '' );
}
//*/
proto = null;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment