// LICENSE | |
// | |
// This software is dual-licensed to the public domain and under the following | |
// license: you are granted a perpetual, irrevocable license to copy, modify, | |
// publish, and distribute this file as you see fit. | |
// | |
// VERSION | |
// 0.1.0 (2016-03-28) Initial release | |
// | |
// AUTHOR | |
// Forrest Smith | |
// | |
// CONTRIBUTORS | |
// Jørgen Tjernø - async helper | |
// Returns true if each character in pattern is found sequentially within str | |
export function fuzzy_match_simple(pattern, str) { | |
var patternIdx = 0; | |
var strIdx = 0; | |
var patternLength = pattern.length; | |
var strLength = str.length; | |
while (patternIdx !== patternLength && strIdx !== strLength) { | |
var patternChar = pattern.charAt(patternIdx).toLowerCase(); | |
var strChar = str.charAt(strIdx).toLowerCase(); | |
if (patternChar === strChar) | |
++patternIdx; | |
++strIdx; | |
} | |
return patternLength !== 0 && strLength !== 0 && patternIdx === patternLength; | |
} | |
// Returns [bool, score, formattedStr] | |
// bool: true if each character in pattern is found sequentially within str | |
// score: integer; higher is better match. Value has no intrinsic meaning. Range varies with pattern. | |
// Can only compare scores with same search pattern. | |
// formattedStr: input str with matched characters marked in <b> tags. Delete if unwanted. | |
export function fuzzy_match(pattern, str) { | |
// Score consts | |
var adjacency_bonus = 5; // bonus for adjacent matches | |
var separator_bonus = 10; // bonus if match occurs after a separator | |
var camel_bonus = 10; // bonus if match is uppercase and prev is lower | |
var leading_letter_penalty = -3; // penalty applied for every letter in str before the first match | |
var max_leading_letter_penalty = -9; // maximum penalty for leading letters | |
var unmatched_letter_penalty = -1; // penalty for every letter that doesn't matter | |
// Loop variables | |
var score = 0; | |
var patternIdx = 0; | |
var patternLength = pattern.length; | |
var strIdx = 0; | |
var strLength = str.length; | |
var prevMatched = false; | |
var prevLower = false; | |
var prevSeparator = true; // true so if first letter match gets separator bonus | |
// Use "best" matched letter if multiple string letters match the pattern | |
var bestLetter = null; | |
var bestLower = null; | |
var bestLetterIdx = null; | |
var bestLetterScore = 0; | |
var matchedIndices = []; | |
// Loop over strings | |
while (strIdx !== strLength) { | |
var patternChar = patternIdx !== patternLength ? pattern.charAt(patternIdx) : null; | |
var strChar = str.charAt(strIdx); | |
var patternLower = patternChar != null ? patternChar.toLowerCase() : null; | |
var strLower = strChar.toLowerCase(); | |
var strUpper = strChar.toUpperCase(); | |
var nextMatch = patternChar && patternLower === strLower; | |
var rematch = bestLetter && bestLower === strLower; | |
var advanced = nextMatch && bestLetter; | |
var patternRepeat = bestLetter && patternChar && bestLower === patternLower; | |
if (advanced || patternRepeat) { | |
score += bestLetterScore; | |
matchedIndices.push(bestLetterIdx); | |
bestLetter = null; | |
bestLower = null; | |
bestLetterIdx = null; | |
bestLetterScore = 0; | |
} | |
if (nextMatch || rematch) { | |
var newScore = 0; | |
// Apply penalty for each letter before the first pattern match | |
// Note: std::max because penalties are negative values. So max is smallest penalty. | |
if (patternIdx === 0) { | |
score += Math.max(strIdx * leading_letter_penalty, max_leading_letter_penalty); | |
} | |
// Apply bonus for consecutive bonuses | |
if (prevMatched) | |
newScore += adjacency_bonus; | |
// Apply bonus for matches after a separator | |
if (prevSeparator) | |
newScore += separator_bonus; | |
// Apply bonus across camel case boundaries. Includes "clever" isLetter check. | |
if (prevLower && strChar === strUpper && strLower !== strUpper) | |
newScore += camel_bonus; | |
// Update patter index IFF the next pattern letter was matched | |
if (nextMatch) | |
++patternIdx; | |
// Update best letter in str which may be for a "next" letter or a "rematch" | |
if (newScore >= bestLetterScore) { | |
// Apply penalty for now skipped letter | |
if (bestLetter != null) | |
score += unmatched_letter_penalty; | |
bestLetter = strChar; | |
bestLower = bestLetter.toLowerCase(); | |
bestLetterIdx = strIdx; | |
bestLetterScore = newScore; | |
} | |
prevMatched = true; | |
} | |
else { | |
// Append unmatch characters | |
formattedStr += strChar; | |
score += unmatched_letter_penalty; | |
prevMatched = false; | |
} | |
// Includes "clever" isLetter check. | |
prevLower = strChar === strLower && strLower !== strUpper; | |
prevSeparator = strChar === '_' || strChar === ' '; | |
++strIdx; | |
} | |
// Apply score for last match | |
if (bestLetter) { | |
score += bestLetterScore; | |
matchedIndices.push(bestLetterIdx); | |
} | |
// Finish out formatted string after last pattern matched | |
// Build formated string based on matched letters | |
var formattedStr = ""; | |
var lastIdx = 0; | |
for (var i = 0; i < matchedIndices.length; ++i) { | |
var idx = matchedIndices[i]; | |
formattedStr += str.substr(lastIdx, idx - lastIdx) + "<b>" + str.charAt(idx) + "</b>"; | |
lastIdx = idx + 1; | |
} | |
formattedStr += str.substr(lastIdx, str.length - lastIdx); | |
var matched = patternIdx === patternLength; | |
return [matched, score, formattedStr]; | |
} | |
// Strictly optional utility to help make using fts_fuzzy_match easier for large data sets | |
// Uses setTimeout to process matches before a maximum amount of time before sleeping | |
// | |
// To use: | |
// var asyncMatcher = new fts_fuzzy_match(fuzzy_match, "fts", "ForrestTheWoods", | |
// function(results) { console.log(results); }); | |
// asyncMatcher.start(); | |
// | |
export function fts_fuzzy_match_async(matchFn, pattern, dataSet, onComplete) { | |
var ITEMS_PER_CHECK = 1000; // performance.now can be very slow depending on platform | |
var max_ms_per_frame = 1000.0/30.0; // 30FPS | |
var dataIndex = 0; | |
var results = []; | |
var resumeTimeout = null; | |
// Perform matches for at most max_ms | |
function step() { | |
clearTimeout(resumeTimeout); | |
resumeTimeout = null; | |
var stopTime = performance.now() + max_ms_per_frame; | |
for (; dataIndex < dataSet.length; ++dataIndex) { | |
if ((dataIndex % ITEMS_PER_CHECK) === 0) { | |
if (performance.now() > stopTime) { | |
resumeTimeout = setTimeout(step, 1); | |
return; | |
} | |
} | |
var str = dataSet[dataIndex]; | |
var result = matchFn(pattern, str); | |
// A little gross because fuzzy_match_simple and fuzzy_match return different things | |
if (matchFn === fuzzy_match_simple && result === true) | |
results.push(str); | |
else if (matchFn === fuzzy_match && result[0] === true) | |
results.push(result); | |
} | |
onComplete(results); | |
return null; | |
} | |
// Abort current process | |
this.cancel = function() { | |
if (resumeTimeout !== null) | |
clearTimeout(resumeTimeout); | |
}; | |
// Must be called to start matching. | |
// I tried to make asyncMatcher auto-start via "var resumeTimeout = step();" | |
// However setTimout behaving in an unexpected fashion as onComplete insisted on triggering twice. | |
this.start = function() { | |
step(); | |
}; | |
// Process full list. Blocks script execution until complete | |
this.flush = function() { | |
max_ms_per_frame = Infinity; | |
step(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment