Skip to content

Instantly share code, notes, and snippets.

@AaronC81
Created December 10, 2015 20:17
Show Gist options
  • Save AaronC81/5fa2aac61c82cac7ddf1 to your computer and use it in GitHub Desktop.
Save AaronC81/5fa2aac61c82cac7ddf1 to your computer and use it in GitHub Desktop.
FuzzyMatch
import difflib
import enum
"""
FuzzyMatcher - Created by OrangeFlash81 and released under the MIT License.
"""
class EmptyMatchListError(Exception):
    """Raised when a ``FuzzyMatcher`` is asked to match against an empty ``match_list``."""
class FuzzyMatcherBuiltins(enum.Enum):
    """Preset data sets that can be passed to ``FuzzyMatcher`` in place of a list.

    Values:
        EXAMPLE_DATA: The list ``["John", "Steven", "Paul"]``.
        ENGLISH_DICTIONARY: A list of English words from an external source.
            I am not responsible for the content of this file!
            ``FuzzyMatcher`` does not support this value yet.
    """

    EXAMPLE_DATA = 1
    ENGLISH_DICTIONARY = 2
class FuzzyMatcher:
    """A class that allows for matching a single string to one in a list of strings.

    This uses "fuzzy" string matching, which matches the string to another
    based on similarity (``difflib.SequenceMatcher(...).ratio()``).

    Examples::

        >>> from fuzzymatch import FuzzyMatcher
        >>> fm = FuzzyMatcher(["John", "Steven", "Paul"])
        >>> fm.closest("Jon")
        'John'
        >>> fm.closest("Stephanie")
        'Steven'
        >>> fm.threshold = 0.6 # We can make ``closest`` be a bit stricter
        >>> print(fm.closest("Stephanie"))
        None
        >>> fm.map_to("Jon")
        [('John', 0.8571428571428571), ('Steven', 0.2222222222222222), ('Paul', 0.0)]
        >>> fm.map_to("Jon", False)
        [0.8571428571428571, 0.2222222222222222, 0.0]
    """

    # Shared by ``closest`` and ``map_to`` so both report an empty list identically.
    _EMPTY_LIST_MESSAGE = (
        "The match_list of the FuzzyMatcher instance is an empty list; "
        "match_list should contain at least 1 item."
    )

    def __init__(self, match_list, **kwargs):
        """Create a matcher over ``match_list``.

        Args:
            match_list(list or FuzzyMatcherBuiltins): The list to match strings
                against, or a preset from ``FuzzyMatcherBuiltins``. A list is
                stored by reference, not copied.

        Keyword Args:
            threshold(float): ``closest`` only accepts a candidate whose
                similarity ratio is strictly greater than this value.
                Default -1.

        Raises:
            TypeError: If ``match_list`` is neither a list nor a
                ``FuzzyMatcherBuiltins`` value.
            ValueError: If ``match_list`` is a ``FuzzyMatcherBuiltins`` value
                that is not supported (currently ``ENGLISH_DICTIONARY``).
        """
        if isinstance(match_list, list):
            self.match_list = match_list
        elif isinstance(match_list, FuzzyMatcherBuiltins):
            if match_list is FuzzyMatcherBuiltins.EXAMPLE_DATA:
                self.match_list = ["John", "Steven", "Paul"]
            elif match_list is FuzzyMatcherBuiltins.ENGLISH_DICTIONARY:
                # Still a ValueError (the type callers have always seen for
                # this case), but the message now states the real reason.
                raise ValueError(
                    "FuzzyMatcherBuiltins.ENGLISH_DICTIONARY is not implemented yet."
                )
            else:
                raise ValueError(
                    "When match_list is an integer, it must be a value from FuzzyMatcherBuiltins."
                )
        else:
            raise TypeError(
                "match_list must be either a list or a value from FuzzyMatcherBuiltins."
            )
        self.threshold = kwargs.get("threshold", -1)

    @staticmethod
    def _similarity(to_match, candidate):
        """Return the ``difflib`` similarity ratio between the two strings."""
        return difflib.SequenceMatcher(None, to_match, candidate).ratio()

    def closest(self, to_match):
        """Matches the ``to_match`` string to its closest equivalent in the ``match_list`` property.

        Args:
            to_match(str): The string to match.

        Returns:
            str: The string in ``match_list`` with the highest similarity to
                ``to_match``. On a tie the earliest entry wins.
            None: If no entry has a similarity that is greater than both
                ``threshold`` and zero.

        Raises:
            EmptyMatchListError: If the ``match_list`` property is empty.
        """
        # Test emptiness of the list itself; ``any()`` would also reject
        # non-empty lists whose items are all falsy (e.g. ``[""]``).
        if not self.match_list:
            raise EmptyMatchListError(self._EMPTY_LIST_MESSAGE)

        best_string, best_ratio = None, 0
        for candidate in self.match_list:
            ratio = self._similarity(to_match, candidate)
            # Strict comparisons: a ratio equal to the threshold, or equal to
            # the current best, does not replace the current best.
            if ratio > self.threshold and ratio > best_ratio:
                best_string, best_ratio = candidate, ratio
        return best_string

    def map_to(self, to_match, should_include_strings=True):
        """Looks at all values in ``match_list`` and compares them to ``to_match``.

        ``threshold`` is not applied here; every entry is reported.

        Args:
            to_match(str): The string to match.
            should_include_strings(bool): If True, the returned list contains
                ``(string, accuracy)`` tuples. If False, the returned list
                just contains the accuracies. Default True.

        Returns:
            list: Tuples of compared string and accuracy if
                ``should_include_strings`` is true, otherwise accuracies as
                floats, in the same order as ``match_list``.

        Raises:
            EmptyMatchListError: If the ``match_list`` property is empty.
        """
        if not self.match_list:
            raise EmptyMatchListError(self._EMPTY_LIST_MESSAGE)

        ratios = [self._similarity(to_match, candidate) for candidate in self.match_list]
        if should_include_strings:
            return list(zip(self.match_list, ratios))
        return ratios
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment