eestrada/fuzzymatching.py

## fuzzymatching.py
# This is free and unencumbered software released into the public domain
# using the wording of the Unlicense. Please refer to <http://unlicense.org/>

from __future__ import division, absolute_import, print_function, unicode_literals

"""Use the stdlib module `difflib` to do fuzzy matching.

This module also has mixin classes to make searching containers easier.

It will also use LRU caching when available.
"""
import logging
import difflib
import operator
import functools
import itertools

logging.basicConfig(level=logging.DEBUG)
_log = logging.getLogger(__name__)

try:
    # NOTE: python2
    from itertools import ifilterfalse as filterfalse
    from itertools import ifilter as filter
except ImportError as e:
    # NOTE: python3
    from itertools import filterfalse

try:
    from functools import lru_cache
    _log.debug('imported lru_cache from functools')
except ImportError as e:
    _log.debug(e, exc_info=True)
    try:
        from backports.functools_lru_cache import lru_cache
        _log.debug('imported lru_cache from backports.functools_lru_cache')
    except ImportError as e:
        _log.debug(e, exc_info=True)
        # NOTE: dummy decorator that does nothing
        def lru_cache(maxsize=None, typed=False):
            def decorator(function):
                return function
            return decorator
        _log.debug('created dummy lru_cache')


_DEFAULT_MAXSIZE = 128


class Matcher(object):

    """A mixin class to be used with container types to add strict matching."""

    __slots__ = ()

    @staticmethod
    def _startswith_func(word, case_sensitive=False):
        if not case_sensitive:
            word = word.casefold()
            return lambda v: v.casefold().startswith(word)
        else:
            return operator.methodcaller('startswith', word)

    def get_startswith(self, word, case_sensitive=False):
        """Return an iterator over all words that start with the given word.

        This is a subset of ``get_contains``.
        """
        return filter(self._startswith_func(word, case_sensitive), self)

    def get_contains(self, word, case_sensitive=False):
        """Return an iterator over all words that contain the given word.

        This is a superset of ``get_startswith``.
        """
        if not case_sensitive:
            word = word.casefold()
            func = lambda v: v.casefold().__contains__(word)
        else:
            func = operator.methodcaller('__contains__', word)

        # we should return words that start with the given word first
        # although this means we will iterate twice, it also means it will be
        # more inline with what the user would expect.
        startswith = self.get_startswith(word, case_sensitive)
        contains = filterfalse(self._startswith_func(word, case_sensitive), self)
        contains = filter(func, contains)

        return itertools.chain(startswith, contains)


class FuzzyMatcher(Matcher):

    """A mixin class to be used with container types to add fuzzy matching.

    Also inherits from Matcher class. Fuzzy matching is much slower than
    strict matching."""

    __slots__ = ()

    def get_close_matches(self, word, cutoff=None):
        """Yield an iterator of close matches.

        The iterator is not in any sorted order, neither alphanumeric nor based
        on ratio. What is yielded is tuples of the format (ratio, word). To
        sort based on ratio simply call ``sorted(matches)``.

        You may optionally specify a cutoff ratio. The default is `0.8`. A
        ratio of `0.0` should mean that anything will match. A ratio of `1.0`
        should mean that only an exact match would match.
        """
        cutoff = getattr(self, 'cutoff', 0.8) if cutoff is None else cutoff
        if not (0.0 <= cutoff <= 1.0):
            raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))

        s = difflib.SequenceMatcher()
        s.set_seq2(word)
        for x in self:
            s.set_seq1(x)
            if (s.real_quick_ratio() >= cutoff and
                s.quick_ratio() >= cutoff and
                s.ratio() >= cutoff):
                    yield (s.ratio(), x)

    def get_sorted_matches(self, word, cutoff=None, byratio=True, stripratio=True):
        matches = self.get_close_matches(word, cutoff=None)
        if byratio:
            sortedresult = sorted(matches)
        else:
            sortedresult = sorted(matches, key=lambda t: t[1])

        if stripratio:
            return list(map(lambda t: t[1], sortedresult))
        else:
            return sortedresult


class CachedFuzzyMatcher(FuzzyMatcher):

    """A mixin class for immutable containers that caches previous queries."""

    __slots__ = ()

    get_startswith = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_startswith)
    get_contains = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_contains)

    get_close_matches = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_close_matches)
    get_sorted_matches = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_sorted_matches)


class FuzzySet(frozenset, CachedFuzzyMatcher):

    """A frozenset class that can return fuzzy matches."""

    def __init__(self, it=(), cutoff=0.8):
        super(FuzzySet, self).__init__(it)
        self.cutoff = cutoff


class FuzzyTuple(tuple, CachedFuzzyMatcher):

    """A tuple class that can return fuzzy matches.

    This is useful if you want results returned in a certain order.
    """

    def __init__(self, it=(), cutoff=0.8):
        super(FuzzyTuple, self).__init__(it)
        self.cutoff = cutoff
	# This is free and unencumbered software released into the public domain
	# using the wording of the Unlicense. Please refer to <http://unlicense.org/>

	from __future__ import division, absolute_import, print_function, unicode_literals

	"""Use the stdlib module `difflib` to do fuzzy matching.

	This module also has mixin classes to make searching containers easier.

	It will also use LRU caching when available.
	"""
	import logging
	import difflib
	import operator
	import functools
	import itertools

	logging.basicConfig(level=logging.DEBUG)
	_log = logging.getLogger(__name__)

	try:
	# NOTE: python2
	from itertools import ifilterfalse as filterfalse
	from itertools import ifilter as filter
	except ImportError as e:
	# NOTE: python3
	from itertools import filterfalse

	try:
	from functools import lru_cache
	_log.debug('imported lru_cache from functools')
	except ImportError as e:
	_log.debug(e, exc_info=True)
	try:
	from backports.functools_lru_cache import lru_cache
	_log.debug('imported lru_cache from backports.functools_lru_cache')
	except ImportError as e:
	_log.debug(e, exc_info=True)
	# NOTE: dummy decorator that does nothing
	def lru_cache(maxsize=None, typed=False):
	def decorator(function):
	return function
	return decorator
	_log.debug('created dummy lru_cache')


	_DEFAULT_MAXSIZE = 128


	class Matcher(object):

	"""A mixin class to be used with container types to add strict matching."""

	__slots__ = ()

	@staticmethod
	def _startswith_func(word, case_sensitive=False):
	if not case_sensitive:
	word = word.casefold()
	return lambda v: v.casefold().startswith(word)
	else:
	return operator.methodcaller('startswith', word)

	def get_startswith(self, word, case_sensitive=False):
	"""Return an iterator over all words that start with the given word.

	This is a subset of ``get_contains``.
	"""
	return filter(self._startswith_func(word, case_sensitive), self)

	def get_contains(self, word, case_sensitive=False):
	"""Return an iterator over all words that contain the given word.

	This is a superset of ``get_startswith``.
	"""
	if not case_sensitive:
	word = word.casefold()
	func = lambda v: v.casefold().__contains__(word)
	else:
	func = operator.methodcaller('__contains__', word)

	# we should return words that start with the given word first
	# although this means we will iterate twice, it also means it will be
	# more inline with what the user would expect.
	startswith = self.get_startswith(word, case_sensitive)
	contains = filterfalse(self._startswith_func(word, case_sensitive), self)
	contains = filter(func, contains)

	return itertools.chain(startswith, contains)


	class FuzzyMatcher(Matcher):

	"""A mixin class to be used with container types to add fuzzy matching.

	Also inherits from Matcher class. Fuzzy matching is much slower than
	strict matching."""

	__slots__ = ()

	def get_close_matches(self, word, cutoff=None):
	"""Yield an iterator of close matches.

	The iterator is not in any sorted order, neither alphanumeric nor based
	on ratio. What is yielded is tuples of the format (ratio, word). To
	sort based on ratio simply call ``sorted(matches)``.

	You may optionally specify a cutoff ratio. The default is `0.8`. A
	ratio of `0.0` should mean that anything will match. A ratio of `1.0`
	should mean that only an exact match would match.
	"""
	cutoff = getattr(self, 'cutoff', 0.8) if cutoff is None else cutoff
	if not (0.0 <= cutoff <= 1.0):
	raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))

	s = difflib.SequenceMatcher()
	s.set_seq2(word)
	for x in self:
	s.set_seq1(x)
	if (s.real_quick_ratio() >= cutoff and
	s.quick_ratio() >= cutoff and
	s.ratio() >= cutoff):
	yield (s.ratio(), x)

	def get_sorted_matches(self, word, cutoff=None, byratio=True, stripratio=True):
	matches = self.get_close_matches(word, cutoff=None)
	if byratio:
	sortedresult = sorted(matches)
	else:
	sortedresult = sorted(matches, key=lambda t: t[1])

	if stripratio:
	return list(map(lambda t: t[1], sortedresult))
	else:
	return sortedresult


	class CachedFuzzyMatcher(FuzzyMatcher):

	"""A mixin class for immutable containers that caches previous queries."""

	__slots__ = ()

	get_startswith = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_startswith)
	get_contains = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_contains)

	get_close_matches = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_close_matches)
	get_sorted_matches = lru_cache(maxsize=_DEFAULT_MAXSIZE, typed=False)(FuzzyMatcher.get_sorted_matches)


	class FuzzySet(frozenset, CachedFuzzyMatcher):

	"""A frozenset class that can return fuzzy matches."""

	def __init__(self, it=(), cutoff=0.8):
	super(FuzzySet, self).__init__(it)
	self.cutoff = cutoff


	class FuzzyTuple(tuple, CachedFuzzyMatcher):

	"""A tuple class that can return fuzzy matches.

	This is useful if you want results returned in a certain order.
	"""

	def __init__(self, it=(), cutoff=0.8):
	super(FuzzyTuple, self).__init__(it)
	self.cutoff = cutoff