Skip to content

Instantly share code, notes, and snippets.

@Avlyssna
Created March 5, 2018 23:23
Show Gist options
  • Save Avlyssna/22566c2e175b3b18c41bc73fd07d3a9e to your computer and use it in GitHub Desktop.
Save Avlyssna/22566c2e175b3b18c41bc73fd07d3a9e to your computer and use it in GitHub Desktop.
# Standard library imports
import re
# Third-party imports
import editdistance
import unidecode
def convert_to_expression(string):
    r"""Build a regular expression matching text with the same shape as *string*.

    Every run of consecutive word characters collapses into a single ``\w+``
    token and every run of consecutive whitespace into a single ``\s+`` token.
    Any other character (punctuation, symbols) is emitted as an escaped
    literal, so it must appear verbatim in the matched text.

    Args:
        string: The text whose shape should be turned into a pattern.

    Returns:
        The regular expression source as a string; empty when *string* is
        empty.
    """
    # Raw strings keep the backslashes intact without relying on Python
    # passing unknown escape sequences through unchanged.
    patterns = (r'\w', r'\s')
    parts = []
    previous = None

    for letter in string:
        for pattern in patterns:
            if re.match(pattern, letter):
                # Only emit a token when the character class changes, so a
                # whole run of same-class characters becomes one `<class>+`.
                if previous != pattern:
                    previous = pattern
                    parts.append(pattern + '+')

                break
        else:
            # Neither a word nor a whitespace character: keep it literal.
            # It must be escaped, otherwise characters such as "(" or "."
            # would be interpreted as regex syntax.
            previous = letter
            parts.append(re.escape(letter))

    return ''.join(parts)
def count_fuzzy_matches(needle, haystack, maximum_distance=2, minimum_length=6):
    """Count occurrences of *needle* in *haystack*, tolerating small typos.

    Both strings are passed through ``unidecode`` and lower-cased before
    comparison. Needles shorter than *minimum_length* are counted with a plain
    substring search. Longer needles are turned into a shape pattern by
    ``convert_to_expression``; each non-overlapping regex match whose edit
    distance to the needle is at most *maximum_distance* counts as one hit.

    Args:
        needle: The text to look for.
        haystack: The text to search in.
        maximum_distance: Largest edit distance (as reported by
            ``editdistance.eval``) still accepted as a match.
        minimum_length: Needles shorter than this are matched exactly.

    Returns:
        The number of matches found; 0 when the normalized needle is empty.
    """
    # Both sides get the same normalization. Transliterating only the needle
    # would make accented haystack text differ from it, skewing the edit
    # distance and breaking the exact search below.
    needle = unidecode.unidecode(needle).lower()
    haystack = unidecode.unidecode(haystack).lower()

    # An empty needle carries no information; str.count('') would otherwise
    # report len(haystack) + 1 bogus matches.
    if not needle:
        return 0

    # We search without fuzz if the string is too short: a fixed distance
    # budget would be a large share of a short needle's length.
    if len(needle) < minimum_length:
        return haystack.count(needle)

    expression = convert_to_expression(needle)

    return sum(
        1
        for match in re.finditer(expression, haystack)
        if editdistance.eval(match.group(0), needle) <= maximum_distance
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment