Last active
January 23, 2020 16:30
-
-
Save eestrada/f1c84bb12ba1dced7adc to your computer and use it in GitHub Desktop.
fuzzy-matching
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is free and unencumbered software released into the public domain | |
# using the wording of the Unlicense. Please refer to <http://unlicense.org/> | |
from __future__ import division, absolute_import, print_function, unicode_literals | |
"""Use the stdlib module `difflib` to do fuzzy matching. | |
This module also has mixin classes to make searching containers easier. | |
It will also use LRU caching when available. | |
""" | |
import logging | |
import difflib | |
import operator | |
import functools | |
import itertools | |
logging.basicConfig(level=logging.DEBUG) | |
_log = logging.getLogger(__name__) | |
try: | |
# NOTE: python2 | |
from itertools import ifilterfalse as filterfalse | |
from itertools import ifilter as filter | |
except ImportError as e: | |
# NOTE: python3 | |
from itertools import filterfalse | |
try: | |
from functools import lru_cache | |
_log.debug('imported lru_cache from functools') | |
except ImportError as e: | |
_log.debug(e, exc_info=True) | |
try: | |
from backports.functools_lru_cache import lru_cache | |
_log.debug('imported lru_cache from backports.functools_lru_cache') | |
except ImportError as e: | |
_log.debug(e, exc_info=True) | |
# NOTE: dummy decorator that does nothing | |
def lru_cache(maxsize=None, typed=False): | |
def decorator(function): | |
return function | |
return decorator | |
_log.debug('created dummy lru_cache') | |
_DEFAULT_MAXSIZE = 128 | |
class Matcher(object): | |
"""A mixin class to be used with container types to add strict matching.""" | |
__slots__ = () | |
@staticmethod | |
def _startswith_func(word, case_sensitive=False): | |
if not case_sensitive: | |
word = word.casefold() | |
return lambda v: v.casefold().startswith(word) | |
else: | |
return operator.methodcaller('startswith', word) | |
def get_startswith(self, word, case_sensitive=False): | |
"""Return an iterator over all words that start with the given word. | |
This is a subset of ``get_contains``. | |
""" | |
return filter(self._startswith_func(word, case_sensitive), self) | |
def get_contains(self, word, case_sensitive=False): | |
"""Return an iterator over all words that contain the given word. | |
This is a superset of ``get_startswith``. | |
""" | |
if not case_sensitive: | |
word = word.casefold() | |
func = lambda v: v.casefold().__contains__(word) | |
else: | |
func = operator.methodcaller('__contains__', word) | |
# we should return words that start with the given word first | |
# although this means we will iterate twice, it also means it will be | |
# more inline with what the user would expect. | |
startswith = self.get_startswith(word, case_sensitive) | |
contains = filterfalse(self._startswith_func(word, case_sensitive), self) | |
contains = filter(func, contains) | |
return itertools.chain(startswith, contains) | |
class FuzzyMatcher(Matcher): | |
"""A mixin class to be used with container types to add fuzzy matching. | |
Also inherits from Matcher class. Fuzzy matching is much slower than | |
strict matching.""" | |
__slots__ = () | |
def get_close_matches(self, word, cutoff=None): | |
"""Yield an iterator of close matches. | |
The iterator is not in any sorted order, neither alphanumeric nor based | |
on ratio. What is yielded is tuples of the format (ratio, word). To | |
sort based on ratio simply call ``sorted(matches)``. | |
You may optionally specify a cutoff ratio. The default is `0.8`. A | |
ratio of `0.0` should mean that anything will match. A ratio of `1.0` | |
should mean that only an exact match would match. | |
""" | |
cutoff = getattr(self, 'cutoff', 0.8) if cutoff is None else cutoff | |
if not (0.0 <= cutoff <= 1.0): | |
raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) | |
s = difflib.SequenceMatcher() | |
s.set_seq2(word) | |
for x in self: | |
s.set_seq1(x) | |
if (s.real_quick_ratio() >= cutoff and | |
s.quick_ratio() >= cutoff and | |
s.ratio() >= cutoff): | |
yield (s.ratio(), x) | |
def get_sorted_matches(self, word, cutoff=None, byratio=True, stripratio=True): | |
matches = self.get_close_matches(word, cutoff=None) | |
if byratio: | |
sortedresult = sorted(matches) | |
else: | |
sortedresult = sorted(matches, key=lambda t: t[1]) | |
if stripratio: | |
return list(map(lambda t: t[1], sortedresult)) | |
else: | |
return sortedresult | |
class CachedFuzzyMatcher(FuzzyMatcher): | |
"""A mixin class for immutable containers that caches previous queries.""" | |
__slots__ = () | |
get_startswith = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_startswith) | |
get_contains = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_contains) | |
get_close_matches = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_close_matches) | |
get_sorted_matches = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_sorted_matches) | |
class FuzzySet(frozenset, CachedFuzzyMatcher): | |
"""A frozenset class that can return fuzzy matches.""" | |
def __init__(self, it=(), cutoff=0.8): | |
super(FuzzySet, self).__init__(it) | |
self.cutoff = cutoff | |
class FuzzyTuple(tuple, CachedFuzzyMatcher): | |
"""A tuple class that can return fuzzy matches. | |
This is useful if you want results returned in a certain order. | |
""" | |
def __init__(self, it=(), cutoff=0.8): | |
super(FuzzyTuple, self).__init__(it) | |
self.cutoff = cutoff | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment