Created
November 5, 2013 18:36
-
-
Save tmcw/7323807 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
iconv = new require('iconv') | |
.Iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE'); | |
// Converts text into a token ID. | |
// This is a 28 bit FNV1a with room for 4 bits of room for bonus data. | |
// This bonus data is currently used by degenerate token mappings to specify | |
// the character distance of degenerates from original tokens. | |
// FNV-1a hash. | |
// For 32-bit: offset = 2166136261, prime = 16777619. | |
module.exports.fnv1a = fnv1a; | |
function fnv1a(str) { | |
var hash = 0x811C9DC5; | |
if (str.length) for (var i = 0; i < str.length; i++) { | |
hash = hash ^ str.charCodeAt(i); | |
// 2**24 + 2**8 + 0x93 = 16777619 | |
hash += (hash << 24) + (hash << 8) + (hash << 7) + (hash << 4) + (hash << 1); | |
} | |
return hash >>> 0; | |
} | |
// Converts text into a name ID. | |
// Appends a suffix based on the first term to help cluster phrases in shards. | |
// @TODO implement this as actual 24-bit FNV1a per http://www.isthe.com/chongo/tech/comp/fnv/ | |
module.exports.phrase = phrase; | |
function phrase(text) { | |
var tokens = tokenize(text); | |
var a = fnvfold(tokens.join(' '), 20); | |
var b = fnvfold((tokens.length ? tokens[0] : ''), 30) % 4096; | |
console.log('a', a, 'b', b); | |
return a * 4096 + b; | |
} | |
// XOR fold a FNV-1a hash to n bits | |
// http://www.isthe.com/chongo/tech/comp/fnv/#xor-fold | |
function fnvfold(str, bits) { | |
var mask = (1<<bits >>> 0) - 1; | |
var hash = fnv1a(str); | |
console.log('fold', str, 'into', hash, 'folded', ((hash >>> bits) ^ (mask & hash)) >>> 0); | |
return ((hash >>> bits) ^ (mask & hash)) >>> 0; | |
} | |
// Normalize input text into lowercase, asciified tokens. | |
module.exports.tokenize = tokenize; | |
function tokenize(query, lonlat) { | |
if (lonlat) { | |
var numeric = query | |
.split(/[^\.\-\d+]+/i) | |
.filter(function(t) { return t.length; }) | |
.map(function(t) { return parseFloat(t); }) | |
.filter(function(t) { return !isNaN(t); }); | |
if (numeric.length === 2) return numeric; | |
} | |
// try { | |
var converted = iconv.convert(query).toString(); | |
query = converted; | |
// } catch(err) {} | |
return query | |
.toLowerCase() | |
.replace(/[\^]+/g, '') | |
.replace(/[-,]+/g, ' ') | |
.split(/[^\w+^\s+]/gi) | |
.join('') | |
.split(/[\s+]+/gi) | |
.filter(function(t) { return t.length; }); | |
} | |
var a = 'A. Zápotockého', | |
b = 'A. Quiroga'; | |
console.log('FNV:', a + '\n'); | |
console.log('result:', phrase(a) + '\n'); | |
console.log('FNV:', b + '\n'); | |
console.log('result:', phrase(b)); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
FNV: A. Zápotockého | |
fold a zapotockeho into 1360009443 folded 7666 | |
fold a into 3826002220 folded 604776751 | |
a 7666 b 2351 | |
result: 31402287 | |
FNV: A. Quiroga | |
fold a quiroga into 549462014 folded 7666 | |
fold a into 3826002220 folded 604776751 | |
a 7666 b 2351 | |
result: 31402287 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment