Skip to content

Instantly share code, notes, and snippets.

@amsul
Last active November 7, 2019 19:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amsul/1154ffa36d8c90cca321948b3c7583f5 to your computer and use it in GitHub Desktop.
Save amsul/1154ffa36d8c90cca321948b3c7583f5 to your computer and use it in GitHub Desktop.
A stupidly simple fuzzy search
// @flow
import * as _ from 'lodash'
///////////////////
// FILTER & SORT //
///////////////////
type FilterAndSortType = ({
data: Array<*>,
getTargetText?: ({ item: * }) => string,
limit?: number,
searchText: string,
sortIteratees?: Array<(match: Object) => number>,
}) => Array<*>
export const filterAndSort: FilterAndSortType = ({
data,
getTargetText = ({ item }) => item,
limit = data.length,
searchText,
sortIteratees = [() => 0],
}) => {
if (!searchText) {
if (limit !== data.length) {
return data.slice(0, limit)
}
return data
}
return (
_.chain(data)
// Get matches for each item
.map(item =>
getMatch({
item,
searchText,
targetText: getTargetText({ item }),
})
)
// Filter out non-matches
.filter(match => {
console.debug(
'Match of %o in %o: isExact=%o isFuzzy=%o coefficient=%o',
match.searchText,
match.targetText,
match.isExact,
match.isFuzzy,
match.coefficient
)
return match.isExact || match.isFuzzy
// TODO: If it's a fuzzy match and only one character matches
// when there are multiple words in the search text, ignore it
})
// Sort by the coefficient, then by fuzzy matches, and then exact matches
.sortBy(match => match.coefficient * -1)
.sortBy(match => (match.isFuzzy ? -1 : 1))
.sortBy(match => (match.isExact ? -1 : 1))
// Finally, sort by the number of exact matches in the pairs of matches
.sortBy(match =>
match.pairs.reduce((accumulator, pair) => {
if (typeof pair !== 'string') {
return accumulator - (pair.isExact ? 2 : 1)
}
return accumulator
}, 0)
)
// Sort by all custom sorters
.sortBy(...sortIteratees)
// Slice down to the limit
.slice(0, limit)
// Pull out the original item and then return the final value
.map(match => match.item)
.value()
)
}
///////////
// SPLIT //
///////////
type SplitByMatchesType = ({
searchText: string,
targetText: string,
}) => Array<string | [string, string, string]>
export const splitByMatches: SplitByMatchesType = ({
searchText,
targetText,
}) => {
if (!searchText) {
return [targetText]
}
const match = getMatch({ item: targetText, searchText, targetText })
const matchedWords = match.pairs.map(pair => {
if (typeof pair === 'string') {
return pair
}
const indexOfMatchText = pair.targetWord.indexOf(pair.matchText)
const preMatchSliceEnd = indexOfMatchText > 0 ? indexOfMatchText : 0
const postMatchSliceStart = preMatchSliceEnd + pair.matchText.length
return [
pair.targetWord.slice(0, preMatchSliceEnd),
pair.matchText,
pair.targetWord.slice(postMatchSliceStart),
]
})
return reduceConsecutiveStrings({ values: matchedWords })
}
type ReduceConsecutiveStringsType = ({ values: Array<*> }) => Array<*>
const reduceConsecutiveStrings: ReduceConsecutiveStringsType = ({
values,
}) =>
values.reduce((accumulator, value, index) => {
if (typeof value === 'string' && typeof _.last(accumulator) === 'string') {
accumulator[accumulator.length - 1] = _.last(accumulator) + value
} else {
accumulator.push(value)
}
return accumulator
}, [])
////////////////////////
// WORDS & SEPARATORS //
////////////////////////
const SEPARATOR_CHARACTERS = [',', '.', '-', ' ', '|', '/']
const REGEXP_SEPARATORS = new RegExp(`[\\${SEPARATOR_CHARACTERS.join('\\')}]+`)
const REGEXP_SEPARATORS_CAPTURING = new RegExp(
`([\\${SEPARATOR_CHARACTERS.join('\\')}])`
)
const splitWords = ({ text, isCapturingSeparators }) => {
const regExp = isCapturingSeparators
? REGEXP_SEPARATORS_CAPTURING
: REGEXP_SEPARATORS
return text.split(regExp).filter(value => value)
}
const hasAnySeparator = ({ word }) =>
SEPARATOR_CHARACTERS.some(character => word.includes(character))
///////////
// MATCH //
///////////
type SearchWordMatchType = {
index: number,
coefficient: number,
matchText: string,
searchWord: string,
targetWord: string,
}
type SearchMatchType = {
item: any,
pairs: Array<string | SearchWordMatchType>,
searchText: string,
searchWords: string[],
targetText: string,
targetWords: string[],
coefficient: number,
isExact: boolean,
isFuzzy: boolean,
}
const getMatch = ({ item, searchText, targetText }): SearchMatchType => {
const searchWords = splitWords({
text: searchText,
isCapturingSeparators: false,
})
const targetWords = splitWords({
text: targetText,
isCapturingSeparators: true,
})
const targetWordsWithoutSeparators = splitWords({
text: targetText,
isCapturingSeparators: false,
})
const pairs = (targetWords.slice(0): any)
searchWords.forEach(searchWord => {
const match = getBestWordMatch({ pairs, searchWord, targetWords })
if (match) {
pairs[match.index] = match
}
})
return {
item,
pairs,
searchText,
searchWords,
targetText,
targetWords,
coefficient: getJaccardCoefficient({
matchText: pairs
.map(pair => (typeof pair === 'string' ? '' : pair.matchText))
.join(''),
searchWord: searchWords.join(''),
targetWord: targetWordsWithoutSeparators.join(''),
}),
isExact: normalize({ string: targetText }).startsWith(
normalize({ string: searchText })
),
isFuzzy: targetWords.some(targetWord =>
searchWords.some(searchWord =>
normalize({ string: targetWord }).includes(
normalize({ string: searchWord })
)
)
),
}
}
const getBestWordMatch = ({
pairs,
searchWord,
targetWords,
}): ?SearchWordMatchType =>
_.chain(targetWords)
.map((targetWord, index) =>
getWordMatch({ pairs, index, searchWord, targetWord })
)
.filter(match => match.coefficient)
.sortBy(match => match.coefficient * -1)
.sortBy(match => (match.isExact ? -1 : 1))
.first()
.value()
const getWordMatch = ({
pairs,
index,
searchWord,
targetWord,
}): SearchWordMatchType => {
const matchText =
typeof pairs[index] === 'string'
? getWordMatchText({ searchWord, targetWord })
: ''
return {
index,
coefficient: getJaccardCoefficient({ matchText, searchWord, targetWord }),
isExact: normalize({ string: targetWord }).startsWith(
normalize({ string: searchWord })
),
matchText,
searchWord,
targetWord,
}
}
const getWordMatchText = ({ searchWord, targetWord }) => {
if (SEPARATOR_CHARACTERS.includes(targetWord)) {
return ''
}
if (
hasAnySeparator({ word: targetWord }) ||
hasAnySeparator({ word: searchWord })
) {
throw new Error(
'Neither the `targetWord` nor the `searchWord` cannot contain separator characters'
)
}
const searchWordNormalized = normalize({ string: searchWord })
const targetWordNormalized = normalize({ string: targetWord })
let matchText = ''
let searchIndex = 0
let targetIndex = 0
let searchCharacter = null
let targetCharacter = null
while (
(searchCharacter = searchWordNormalized[searchIndex]) &&
(targetCharacter = targetWordNormalized[targetIndex])
) {
if (searchCharacter === targetCharacter) {
matchText += targetWord[targetIndex]
searchIndex += 1
} else if (matchText.length) {
return matchText
}
targetIndex += 1
}
return matchText
}
const getJaccardCoefficient = ({ matchText, searchWord, targetWord }) => {
if (!matchText) {
return 0
}
const intersection = matchText.length
const union = targetWord.length + searchWord.length - intersection
return intersection / union
}
/////////////
// HELPERS //
/////////////
// https://stackoverflow.com/questions/990904/remove-accents-diacritics-in-a-string-in-javascript/37511463#37511463
export const normalize = ({ string }: { string: string }): string =>
string
.normalize('NFD')
.replace(/[\u0300-\u036f]/g, '')
.replace(/['"“”’‘.,]/g, '')
.replace(/-/g, ' ')
.toLowerCase()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment