Last active
May 29, 2024 08:33
-
-
Save espretto/41e8e0d78377ce100fc1 to your computer and use it in GitHub Desktop.
ts: fuzzy search
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import escapeRegExp from "lodash.escaperegexp"; | |
/** | |
* used to create regular expressions that match if all needle-chars appear in | |
* the same order w/ any number of inbetween chars. | |
*/ | |
export function fuzzyRegExp(needle: string) { | |
return new RegExp(needle.split("").map(escapeRegExp).join(".*?"), "i"); | |
} | |
const isASCIIPrintable = (char: string) => { | |
const charCode = char.charCodeAt(0); | |
return 32 <= charCode && charCode < 127; | |
}; | |
/** | |
* if the "starter" of the given character's NFKD normalization is an ASCII printable, return it. | |
* otherwise return the given character. | |
* | |
* A "starter" is the first char with a "canonical combining class value" of zero. | |
* @see https://unicode.org/reports/tr15/#Norm_Forms | |
*/ | |
const extractASCIIStarter = (char: string) => { | |
const starter = char.normalize("NFKD").charAt(0); | |
return isASCIIPrintable(starter) ? starter : char; | |
}; | |
/** | |
* the needle is found if all needle-chars appear in the same order w/ any | |
* number of inbetween chars. | |
* | |
* Any composed forms in the haystack will be decomposed to their NFKD normalization and reduced to | |
* their "starter" characters or first characters in case of ligatures IF these are ASCII printable. | |
* | |
* @returns match-array or null if not matched. | |
* match-array items at even indices hold matched needle-chars. | |
* match-array items at odd indices hold inbetween chars. | |
*/ | |
export function fuzzyMatch( | |
needle: string, | |
haystack: string, | |
strict: boolean = false, | |
) { | |
const needles = needle.toLowerCase(); | |
const nHaystack = haystack.toLowerCase().split("").map(extractASCIIStarter); | |
// indices of both matched and unmatched substrings (cf. \b in regex) | |
const boundaries = [0]; | |
let anchor = 0; | |
for (let i = 0; i < needles.length; i++) { | |
const seeker = nHaystack.indexOf(needles.charAt(i), anchor); | |
if (seeker < 0) { | |
if (strict) return null; | |
else continue; | |
} | |
if (seeker > anchor) boundaries.push(anchor, seeker); | |
anchor = seeker + 1; | |
} | |
if (anchor < nHaystack.length) boundaries.push(anchor); | |
boundaries.push(nHaystack.length); | |
const matchArray = []; | |
for (let i = 1; i < boundaries.length; i++) { | |
matchArray.push(haystack.substring(boundaries[i - 1], boundaries[i])); | |
} | |
return matchArray; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment