Skip to content

Instantly share code, notes, and snippets.

@greatsharma
Last active August 28, 2020 06:00
Show Gist options
  • Save greatsharma/894c032b214c5116597b3051a64570df to your computer and use it in GitHub Desktop.
Save greatsharma/894c032b214c5116597b3051a64570df to your computer and use it in GitHub Desktop.
Space Optimal Fuzzy Matching: This gist does fuzzy matching of documents on the basis of Levenshtein distance and returns a similarity score. It does the same as my previous gist https://gist.github.com/greatsharma/eeb22285a837a9b29431179451d0ba7f with a time complexity of O(mn), but the space complexity is now reduced to O(m), i.e., linear space,…
import gc
import numpy as np
from pprint import pprint
def levenshtein_distance_optimal(pattern, docs, ignore_case=True) -> dict:
    """Fuzzy-match documents against a pattern using Levenshtein distance.

    Uses the classic two-row dynamic-programming formulation, so only the
    previous row of the edit-distance matrix is kept: O(len(pattern))
    auxiliary space instead of the full O(m*n) matrix.

    Parameters
    ----------
    pattern : str
        The document which you want to match.
    docs : list
        The documents which you want to match with.
    ignore_case : bool, optional
        Whether matching ignores case (default is True).

    Returns
    -------
    dict
        Similarity scores keyed 'doc1', 'doc2', ... in the order passed
        in ``docs``.  Each score is (m + n - distance) / (m + n), so 1.0
        means an exact match; an empty ``docs`` yields an empty dict.
    """
    if ignore_case:
        pattern = pattern.lower()

    pattern_len = len(pattern)
    similarity_score = {}
    space_penalty = 1  # cost of one insertion/deletion (gap)

    for count, doc in enumerate(docs, start=1):
        if ignore_case:
            doc = doc.lower()

        # Fast path: identical strings need no DP at all.
        if pattern == doc:
            similarity_score['doc' + str(count)] = 1.0
            continue

        doc_len = len(doc)

        # cache[j] = edit distance between the first i characters of doc
        # and the first j characters of pattern (the previous DP row).
        # Row 0: transforming '' into pattern[:j] costs j insertions.
        cache = [space_penalty * j for j in range(pattern_len + 1)]

        for i in range(1, doc_len + 1):
            current = [0] * (pattern_len + 1)
            current[0] = cache[0] + space_penalty  # delete all of doc[:i]
            for j in range(1, pattern_len + 1):
                miss_penalty = cache[j - 1]
                if pattern[j - 1] != doc[i - 1]:
                    miss_penalty += 1  # substitution cost
                current[j] = min(space_penalty + cache[j],       # deletion
                                 space_penalty + current[j - 1],  # insertion
                                 miss_penalty)                    # match/substitute
            # Roll the rows.  (The original's `del temp_store` and the
            # un-called `gc.collect` were no-ops and have been removed.)
            cache = current

        lev_dist = cache[pattern_len]
        # Normalize: identical -> 1.0, completely different -> near 0.
        similarity_score['doc' + str(count)] = (
            (pattern_len + doc_len - lev_dist) / float(pattern_len + doc_len)
        )

    return similarity_score
if __name__ == '__main__':
    # Demo: score several candidate documents against one query string.
    query = 'this is a test for fuzzy wuzzy match'
    candidates = [
        'a test for fuzzy match',
        'test fuzzy matching',
        'this is a test for fuzzy wuzzy match',
        'this is test for fuzy wuzy match',
        'this is a for fuzzy wuzzy match',
    ]
    pprint(levenshtein_distance_optimal(query, candidates))
    # output ->
    # {'doc1': 0.7586206896551724,
    #  'doc2': 0.5818181818181818,
    #  'doc3': 1.0,
    #  'doc4': 0.9411764705882353,
    #  'doc5': 0.9253731343283582}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment