Created
December 10, 2015 20:17
-
-
Save AaronC81/5fa2aac61c82cac7ddf1 to your computer and use it in GitHub Desktop.
FuzzyMatch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import difflib | |
import enum | |
""" | |
FuzzyMatcher - Created by OrangeFlash81 and released under the MIT License. | |
""" | |
class EmptyMatchListError(Exception):
    """Raised when a ``FuzzyMatcher`` is asked to match against an empty ``match_list``."""
class FuzzyMatcherBuiltins(enum.Enum):
    """Preset data sets that can be passed to ``FuzzyMatcher`` as its ``match_list``.

    Values:
        EXAMPLE_DATA: The list ``["John", "Steven", "Paul"]``.
        ENGLISH_DICTIONARY: A list of English words from an external source.
            I am not responsible for the content of this file!
    """

    EXAMPLE_DATA = 1
    ENGLISH_DICTIONARY = 2
class FuzzyMatcher:
    """Match a single string against a list of candidate strings.

    Matching is "fuzzy": every candidate is scored for similarity to the input
    with ``difflib.SequenceMatcher(...).ratio()`` and the scores decide the match.

    Attributes are plain and may be reassigned after construction:
    ``match_list`` (the candidates) and ``threshold`` (the minimum score that a
    candidate must exceed for ``closest`` to accept it).

    Examples::

        >>> from fuzzymatch import FuzzyMatcher
        >>> fm = FuzzyMatcher(["John", "Steven", "Paul"])
        >>> fm.closest("Jon")
        'John'
        >>> fm.closest("Stephanie")
        'Steven'
        >>> fm.threshold = 0.6  # We can make ``closest`` be a bit stricter
        >>> print(fm.closest("Stephanie"))
        None
        >>> fm.map_to("Jon")
        [('John', 0.8571428571428571), ('Steven', 0.2222222222222222), ('Paul', 0.0)]
        >>> fm.map_to("Jon", False)
        [0.8571428571428571, 0.2222222222222222, 0.0]
    """

    # Shared so that ``closest`` and ``map_to`` report the same problem identically.
    _EMPTY_MESSAGE = (
        "The match_list of the FuzzyMatcher instance is an empty list; "
        "match_list should contain at least 1 item."
    )

    def __init__(self, match_list, **kwargs):
        """Create a matcher over ``match_list``.

        Args:
            match_list(list or FuzzyMatcherBuiltins): The list to match strings
                against, or a preset from ``FuzzyMatcherBuiltins``.

        Keyword Args:
            threshold(float): An accuracy value where, if the accuracy of a match
                is not above this, that string is considered to not be a match.
                Default -1.

        Raises:
            ValueError: If ``match_list`` is a ``FuzzyMatcherBuiltins`` member
                that has no data available (currently ``ENGLISH_DICTIONARY``).
            TypeError: If ``match_list`` is neither a list nor a
                ``FuzzyMatcherBuiltins`` member.
        """
        if isinstance(match_list, list):
            self.match_list = match_list
        elif isinstance(match_list, FuzzyMatcherBuiltins):
            if match_list is FuzzyMatcherBuiltins.EXAMPLE_DATA:
                self.match_list = ["John", "Steven", "Paul"]
            elif match_list is FuzzyMatcherBuiltins.ENGLISH_DICTIONARY:
                # Kept as ValueError (what callers have always received for this
                # preset), but the message now states the real cause.
                raise ValueError("FuzzyMatcherBuiltins.ENGLISH_DICTIONARY is not implemented yet.")
            else:
                # Defensive: only reachable if the enum gains a member that is
                # not handled above.
                raise ValueError("When match_list is an integer, it must be a value from FuzzyMatcherBuiltins.")
        else:
            raise TypeError("match_list must be either a list or a value from FuzzyMatcherBuiltins.")
        self.threshold = kwargs.get("threshold", -1)

    @staticmethod
    def _similarity(to_match, candidate):
        """Return the ``difflib`` similarity ratio between the two strings."""
        return difflib.SequenceMatcher(None, to_match, candidate).ratio()

    def closest(self, to_match):
        """Match ``to_match`` to its closest equivalent in ``match_list``.

        Args:
            to_match(str): The string to match.

        Returns:
            str: The entry of ``match_list`` most similar to ``to_match``. When
                several entries tie for the best score, the earliest one wins.
            None: If no entry scored above both ``threshold`` and zero.

        Raises:
            EmptyMatchListError: If the ``match_list`` property is empty.
        """
        # Test for emptiness, not element truthiness: a list such as [""] is
        # not empty and must not be rejected.
        if not self.match_list:
            raise EmptyMatchListError(self._EMPTY_MESSAGE)

        best_string, best_ratio = None, 0
        for candidate in self.match_list:
            ratio = self._similarity(to_match, candidate)
            # Strict comparisons: a score of exactly 0 (or exactly the
            # threshold) never counts as a match, and ties keep the first hit.
            if ratio > self.threshold and ratio > best_ratio:
                best_string, best_ratio = candidate, ratio
        return best_string

    def map_to(self, to_match, should_include_strings=True):
        """Score every value in ``match_list`` against ``to_match``.

        Args:
            to_match(str): The string to match.
            should_include_strings(bool): If True, the returned list contains
                ``(string, accuracy)`` tuples. If False, it contains only the
                accuracies. Default True.

        Returns:
            list: Tuples of compared string and accuracy if
                ``should_include_strings`` is true, otherwise accuracies as
                floats. Entries are in the same order as ``match_list``.
                ``threshold`` is not applied here.

        Raises:
            EmptyMatchListError: If the ``match_list`` property is empty.
        """
        if not self.match_list:
            raise EmptyMatchListError(self._EMPTY_MESSAGE)

        if should_include_strings:
            return [(candidate, self._similarity(to_match, candidate)) for candidate in self.match_list]
        return [self._similarity(to_match, candidate) for candidate in self.match_list]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment