Skip to content

Instantly share code, notes, and snippets.

@tmcw
Created November 5, 2013 18:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tmcw/7323807 to your computer and use it in GitHub Desktop.
Save tmcw/7323807 to your computer and use it in GitHub Desktop.
iconv = new require('iconv')
.Iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE');
// Converts text into a token ID.
// This is a 28 bit FNV1a with room for 4 bits of room for bonus data.
// This bonus data is currently used by degenerate token mappings to specify
// the character distance of degenerates from original tokens.
// FNV-1a hash.
// For 32-bit: offset = 2166136261, prime = 16777619.
module.exports.fnv1a = fnv1a;
function fnv1a(str) {
var hash = 0x811C9DC5;
if (str.length) for (var i = 0; i < str.length; i++) {
hash = hash ^ str.charCodeAt(i);
// 2**24 + 2**8 + 0x93 = 16777619
hash += (hash << 24) + (hash << 8) + (hash << 7) + (hash << 4) + (hash << 1);
}
return hash >>> 0;
}
// Converts text into a name ID.
// Appends a suffix based on the first term to help cluster phrases in shards.
// @TODO implement this as actual 24-bit FNV1a per http://www.isthe.com/chongo/tech/comp/fnv/
module.exports.phrase = phrase;
function phrase(text) {
var tokens = tokenize(text);
var a = fnvfold(tokens.join(' '), 20);
var b = fnvfold((tokens.length ? tokens[0] : ''), 30) % 4096;
console.log('a', a, 'b', b);
return a * 4096 + b;
}
// XOR fold a FNV-1a hash to n bits
// http://www.isthe.com/chongo/tech/comp/fnv/#xor-fold
function fnvfold(str, bits) {
var mask = (1<<bits >>> 0) - 1;
var hash = fnv1a(str);
console.log('fold', str, 'into', hash, 'folded', ((hash >>> bits) ^ (mask & hash)) >>> 0);
return ((hash >>> bits) ^ (mask & hash)) >>> 0;
}
// Normalize input text into lowercase, asciified tokens.
module.exports.tokenize = tokenize;
function tokenize(query, lonlat) {
if (lonlat) {
var numeric = query
.split(/[^\.\-\d+]+/i)
.filter(function(t) { return t.length; })
.map(function(t) { return parseFloat(t); })
.filter(function(t) { return !isNaN(t); });
if (numeric.length === 2) return numeric;
}
// try {
var converted = iconv.convert(query).toString();
query = converted;
// } catch(err) {}
return query
.toLowerCase()
.replace(/[\^]+/g, '')
.replace(/[-,]+/g, ' ')
.split(/[^\w+^\s+]/gi)
.join('')
.split(/[\s+]+/gi)
.filter(function(t) { return t.length; });
}
var a = 'A. Zápotockého',
b = 'A. Quiroga';
console.log('FNV:', a + '\n');
console.log('result:', phrase(a) + '\n');
console.log('FNV:', b + '\n');
console.log('result:', phrase(b));
FNV: A. Zápotockého
fold a zapotockeho into 1360009443 folded 7666
fold a into 3826002220 folded 604776751
a 7666 b 2351
result: 31402287
FNV: A. Quiroga
fold a quiroga into 549462014 folded 7666
fold a into 3826002220 folded 604776751
a 7666 b 2351
result: 31402287
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment