Skip to content

Instantly share code, notes, and snippets.

@eestrada
Last active January 23, 2020 16:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eestrada/f1c84bb12ba1dced7adc to your computer and use it in GitHub Desktop.
Save eestrada/f1c84bb12ba1dced7adc to your computer and use it in GitHub Desktop.
fuzzy-matching
# This is free and unencumbered software released into the public domain
# using the wording of the Unlicense. Please refer to <http://unlicense.org/>
from __future__ import division, absolute_import, print_function, unicode_literals
"""Use the stdlib module `difflib` to do fuzzy matching.
This module also has mixin classes to make searching containers easier.
It will also use LRU caching when available.
"""
import logging
import difflib
import operator
import functools
import itertools
logging.basicConfig(level=logging.DEBUG)
_log = logging.getLogger(__name__)
try:
# NOTE: python2
from itertools import ifilterfalse as filterfalse
from itertools import ifilter as filter
except ImportError as e:
# NOTE: python3
from itertools import filterfalse
try:
from functools import lru_cache
_log.debug('imported lru_cache from functools')
except ImportError as e:
_log.debug(e, exc_info=True)
try:
from backports.functools_lru_cache import lru_cache
_log.debug('imported lru_cache from backports.functools_lru_cache')
except ImportError as e:
_log.debug(e, exc_info=True)
# NOTE: dummy decorator that does nothing
def lru_cache(maxsize=None, typed=False):
def decorator(function):
return function
return decorator
_log.debug('created dummy lru_cache')
_DEFAULT_MAXSIZE = 128
class Matcher(object):
"""A mixin class to be used with container types to add strict matching."""
__slots__ = ()
@staticmethod
def _startswith_func(word, case_sensitive=False):
if not case_sensitive:
word = word.casefold()
return lambda v: v.casefold().startswith(word)
else:
return operator.methodcaller('startswith', word)
def get_startswith(self, word, case_sensitive=False):
"""Return an iterator over all words that start with the given word.
This is a subset of ``get_contains``.
"""
return filter(self._startswith_func(word, case_sensitive), self)
def get_contains(self, word, case_sensitive=False):
"""Return an iterator over all words that contain the given word.
This is a superset of ``get_startswith``.
"""
if not case_sensitive:
word = word.casefold()
func = lambda v: v.casefold().__contains__(word)
else:
func = operator.methodcaller('__contains__', word)
# we should return words that start with the given word first
# although this means we will iterate twice, it also means it will be
# more inline with what the user would expect.
startswith = self.get_startswith(word, case_sensitive)
contains = filterfalse(self._startswith_func(word, case_sensitive), self)
contains = filter(func, contains)
return itertools.chain(startswith, contains)
class FuzzyMatcher(Matcher):
"""A mixin class to be used with container types to add fuzzy matching.
Also inherits from Matcher class. Fuzzy matching is much slower than
strict matching."""
__slots__ = ()
def get_close_matches(self, word, cutoff=None):
"""Yield an iterator of close matches.
The iterator is not in any sorted order, neither alphanumeric nor based
on ratio. What is yielded is tuples of the format (ratio, word). To
sort based on ratio simply call ``sorted(matches)``.
You may optionally specify a cutoff ratio. The default is `0.8`. A
ratio of `0.0` should mean that anything will match. A ratio of `1.0`
should mean that only an exact match would match.
"""
cutoff = getattr(self, 'cutoff', 0.8) if cutoff is None else cutoff
if not (0.0 <= cutoff <= 1.0):
raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
s = difflib.SequenceMatcher()
s.set_seq2(word)
for x in self:
s.set_seq1(x)
if (s.real_quick_ratio() >= cutoff and
s.quick_ratio() >= cutoff and
s.ratio() >= cutoff):
yield (s.ratio(), x)
def get_sorted_matches(self, word, cutoff=None, byratio=True, stripratio=True):
matches = self.get_close_matches(word, cutoff=None)
if byratio:
sortedresult = sorted(matches)
else:
sortedresult = sorted(matches, key=lambda t: t[1])
if stripratio:
return list(map(lambda t: t[1], sortedresult))
else:
return sortedresult
class CachedFuzzyMatcher(FuzzyMatcher):
"""A mixin class for immutable containers that caches previous queries."""
__slots__ = ()
get_startswith = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_startswith)
get_contains = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_contains)
get_close_matches = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_close_matches)
get_sorted_matches = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_sorted_matches)
class FuzzySet(frozenset, CachedFuzzyMatcher):
"""A frozenset class that can return fuzzy matches."""
def __init__(self, it=(), cutoff=0.8):
super(FuzzySet, self).__init__(it)
self.cutoff = cutoff
class FuzzyTuple(tuple, CachedFuzzyMatcher):
"""A tuple class that can return fuzzy matches.
This is useful if you want results returned in a certain order.
"""
def __init__(self, it=(), cutoff=0.8):
super(FuzzyTuple, self).__init__(it)
self.cutoff = cutoff
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment