Created
March 7, 2018 02:17
-
-
Save jeancroy/9187627975a2632a8f8e4aaae17d358e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function IndexStore() { | |
// Main structure | |
// Format is dict key => array of item indices | |
this.indicesForKey = {}; | |
// List of indexed items. | |
this.items = []; | |
// Helper to prevent duplicate on items. | |
this.itemDict = {}; | |
} | |
// Add an item to the index | |
// eg: | |
// store.add( "text" ) | |
// store.add( item, x => x.keyToIndex ) | |
IndexStore.prototype.add = function (item, accessor) { | |
// get text value | |
var str = (accessor == null) ? item : accessor(item); | |
// Already in index | |
if (str in this.itemDict) | |
return; | |
// Nothing to index ? | |
var keyList = IndexStore.computeIndexKeys(str); | |
if (keyList.length == 0) return; | |
// Add to indexed items | |
var idx = this.items.length; | |
this.items.push(item); | |
this.itemDict[str] = true; | |
// notify all computed keys that we exist. | |
this.registerItem(idx, keyList); | |
}; | |
// This exist to prepare "Hook" variant. | |
IndexStore.prototype.registerItem = function (idx, keyList) { | |
// register idx on all appropriate key | |
for (var i = 0; i < keyList.length; i++) { | |
var key = keyList[i]; | |
if (key in this.indicesForKey) { | |
// append to existing array of index | |
this.indicesForKey[key].push(idx); | |
} | |
else { | |
// Format is dict key => array of item index | |
this.indicesForKey[key] = [idx]; | |
} | |
} | |
}; | |
// Find entries that match using 3 out of 5 strategy. (or appropriate fallback) | |
// Entry that match more "subset of 3" keys are returned first. | |
IndexStore.prototype.search = function (str) { | |
// Compute positive results | |
var keys = IndexStore.computeSearchKeys(str); | |
var tmp = this._search(keys); | |
var items = this.items; | |
return tmp.map(function (x) { | |
return items[x.id]; | |
// return items[x.id] + " - " + x.count; // debug output | |
}); | |
}; | |
// This exist to prepare Hook variant. | |
// Input computed keys and output list of IdAndCount. | |
IndexStore.prototype._search = function (keys) { | |
// Compute positive results | |
var outList = this.retrieveCount(keys); | |
// Minimum quality: half as best | |
if (outList.length > 1) { | |
var tresh = outList[0].count * 0.5; | |
outList = outList.filter(function (x) { | |
return x.count > tresh | |
}) | |
} | |
return outList; | |
}; | |
function IdAndCount(id, count) { | |
this.id = id; | |
this.count = count; | |
} | |
IndexStore.prototype.retrieveCount = function (keys) { | |
// Dictionary idx => count | |
var countPerIndex = {}; | |
if (keys.length == 0) | |
return countPerIndex; | |
for (var i = 0; i < keys.length; i++) { | |
var key = keys[i]; | |
// Does the key exist in the index ? | |
if (key in this.indicesForKey) { | |
// If so add every entry of that key into countPerIndex | |
// Also for each entry, maintain a count of matched keys. | |
var idxList = this.indicesForKey[key]; | |
for (var j = 0; j < idxList.length; j++) { | |
var idx = idxList[j]; | |
if (idx in countPerIndex) { | |
countPerIndex[idx]++; | |
} else { | |
countPerIndex[idx] = 1; | |
} | |
} | |
} | |
} | |
// Transform countPerIndex into a sorted list of IdAndCount | |
var outList = []; | |
for (var id in countPerIndex) { | |
if (countPerIndex.hasOwnProperty(id)) { | |
outList.push(new IdAndCount(id, countPerIndex[id])); | |
} | |
} | |
// Custom sort decreasing order | |
outList = outList.sort(function (a, b) { | |
return b.count - a.count | |
}); | |
return outList; | |
}; | |
// Split string in words | |
// And get keys for each of them. | |
IndexStore.computeIndexKeys = function (str) { | |
var keysList = []; | |
var words = str.split(" "); | |
var existingDict = {}; | |
for (var i = 0; i < words.length; i++) { | |
var word = words[i].toLowerCase(); | |
if (word.length < 3) continue; | |
computeIndexKeysFromWord(word, keysList, existingDict) | |
} | |
return keysList; | |
}; | |
IndexStore.computeSearchKeys = function (str) { | |
var keysList = []; | |
var words = str.split(" "); | |
for (var i = 0; i < words.length; i++) { | |
var word = words[i].toLowerCase(); | |
// Either: | |
// Use same strategy as index building | |
// That is 3oY and 2oY and 1o1 | |
// (slower, better score quality). | |
computeIndexKeysFromWord(word, keysList, {}); | |
// Or: | |
// use a single strategy as character permit. | |
// Single XoY (lower search cost) | |
// computeSearchKeysFromWord(word, keysList, {}) | |
} | |
return keysList; | |
}; | |
// Private Helpers | |
// | |
// Keys during index | |
// compute either 3o6, 3o5, 3o4 or 3o3 as available (note that 3o6 include all 3o5 keys) | |
// compute either 2o4, 2o3 or 2o2 as available. (note that 2o4 include all 2o3 keys) | |
// compute 1o1 (first letter index) | |
// | |
function computeIndexKeysFromWord(word, keysList, existingDict) { | |
var len = word.length; | |
if (len == 0) return; | |
if (len >= 3) { | |
// 3o6, 3o5, 3o4, 3o3 | |
select3(word, 6, keysList, existingDict) | |
} | |
if (len >= 2) { | |
// 2o4, 2o3,2o2 | |
select2(word, 4, keysList, existingDict) | |
} | |
// 1o1 strategy: This index by first letter | |
_append(word[0], keysList, existingDict); | |
} | |
// | |
// Keys during search | |
// compute a single variation from maximum characters available | |
// 3o6, 3o5, 2o4, 2o3, 2o2, 1o1 | |
// | |
// alternatively use same strategy as index. | |
// I'm not totally fixed as that. | |
// | |
// also note that we "cheat" a bit to help transition between key size | |
// ie include 3o3 with the 2oY | |
function computeSearchKeysFromWord(word, keysList, existingDict) { | |
var len = word.length; | |
if (len == 0) return; | |
// Include 3oY if we have at least 3 letters. | |
if (len >= 3) { | |
// 3o6, 3o5, 3o4, 3o3 | |
select3(word, 6, keysList, existingDict); | |
} | |
// Include 2oY if we have 4 or less. | |
if (len <= 4) { | |
// 2o4, 2o3, 2o2 | |
select2(word, 4, keysList, existingDict); | |
// Include 1o1 at 2 or below | |
if (len <= 2) { | |
_append(word[0], keysList, existingDict); | |
} | |
} | |
} | |
function select2(str, maxlen, existingList, existingDict) { | |
var len = Math.min(str.length, maxlen); | |
for (var i = 0; i < len - 1; i++) { | |
for (var j = i + 1; j < len; j++) { | |
_append(str[i] + str[j], existingList, existingDict) | |
} | |
} | |
return existingList; | |
} | |
function select3(str, maxlen, existingList, existingDict) { | |
var len = Math.min(str.length, maxlen); | |
for (var i = 0; i < len - 2; i++) { | |
for (var j = i + 1; j < len - 1; j++) { | |
for (var k = j + 1; k < len; k++) { | |
_append(str[i] + str[j] + str[k], existingList, existingDict) | |
} | |
} | |
} | |
return existingList; | |
} | |
function _append(word, existingList, existingDict) { | |
if (!(word in existingDict)) { | |
existingDict[word] = true; | |
existingList.push(word); | |
} | |
} | |
// | |
// Test | |
// | |
function sel3(str) { | |
return select3(str, Infinity, [], {}); | |
} | |
console.log("sel3(hello)", sel3("hello")); | |
console.log("sel3(hills)", sel3("hills")); | |
console.log("sel3(jello)", sel3("jello")); | |
var store = new IndexStore(); | |
store.add("hello"); | |
store.add("cello"); | |
store.add("shall"); | |
store.add("shill"); | |
store.add("chill"); | |
store.add("hills"); | |
store.add("hells"); | |
//console.log("store content", store.indicesForKey); | |
var term; | |
term = "shill"; | |
console.log(term, store.search(term)); | |
term = "shell"; | |
console.log(term, store.search(term)); | |
term = "jello"; | |
console.log(term, store.search(term)); | |
term = "holy"; | |
console.log(term, store.search(term)); | |
term = "hi"; | |
console.log(term, store.search(term)); | |
term = "hil"; | |
console.log(term, store.search(term)); | |
term = "hll"; | |
console.log(term, store.search(term)); | |
term = "hill"; | |
console.log(term, store.search(term)); | |
/*** | |
### Equivalence with Indel / transpose | |
- query => candidate. | |
#### How to deal with insert ? | |
- query => querya | |
- Match 3o5, 3o6 | |
- querya => queryab | |
- Handled by prefix cut, addition are free after 6th of candidate | |
- quer => query | |
- Match 2o4, 2o4 (at index time, multiple XoY are computed 3o5,2o4) | |
- que => query | |
- Match 2o3, 2o4 | |
#### How to deal with delete ? | |
- querA => quer | |
- Match 3o5, 3o4 (at index time 4 char get both 2o4 and 3o4) | |
- queryA => query | |
- Match 3o6, 3o5 | |
- queryAB => query | |
- Match 3o6, 3o5, delete free after 6th of query. | |
- quer => que | |
- Match 2o4, 2o3 | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment