Skip to content

Instantly share code, notes, and snippets.

@greatsharma
Last active September 21, 2019 06:47
Show Gist options
  • Save greatsharma/eeb22285a837a9b29431179451d0ba7f to your computer and use it in GitHub Desktop.
Save greatsharma/eeb22285a837a9b29431179451d0ba7f to your computer and use it in GitHub Desktop.
This gist does fuzzy matching of documents based on Levenshtein distance and returns a similarity score. Time and space complexity are O(mn), where m and n are the lengths of the documents being matched.
import numpy as np
from pprint import pprint
def levenshtein_distance(pattern, docs, ignore_case=True) -> dict:
    """Fuzzy-match documents against a pattern using Levenshtein distance.

    Parameters
    ----------
    pattern : str
        The document you want to match.
    docs : list of str
        The documents to compare with ``pattern``.
    ignore_case : bool, optional
        If True (default), matching is case-insensitive: the pattern and
        every document are lower-cased before comparison.

    Returns
    -------
    dict
        Maps 'doc1', 'doc2', ... (in the order passed in ``docs``) to a
        similarity score in [0, 1], computed as
        (m + n - distance) / (m + n), where m and n are the lengths of
        the pattern and the document.
    """
    if ignore_case:
        pattern = pattern.lower()
    pattern_len = len(pattern)
    similarity_score = {}
    space_penalty = 1  # cost of a single insertion or deletion
    for count, doc in enumerate(docs, start=1):
        if ignore_case:
            doc = doc.lower()
        key = 'doc' + str(count)
        if pattern == doc:
            # Identical strings: perfect score, no need to build the table.
            similarity_score[key] = 1.0
            continue
        doc_len = len(doc)
        # cache_table[i][j] = edit distance between doc[:i] and pattern[:j].
        # np.zeros (not np.empty) so cell [0][0] and untouched cells start
        # at a defined value.
        cache_table = np.zeros(shape=(doc_len + 1, pattern_len + 1))
        # Base cases: converting to/from the empty string costs one
        # insertion/deletion per character.
        for j in range(1, pattern_len + 1):
            cache_table[0][j] = space_penalty * j
        for i in range(1, doc_len + 1):
            cache_table[i][0] = space_penalty * i
        for i in range(1, doc_len + 1):
            for j in range(1, pattern_len + 1):
                miss_penalty = cache_table[i - 1][j - 1]
                if pattern[j - 1] != doc[i - 1]:
                    miss_penalty += 1  # substitution cost
                cache_table[i][j] = min(
                    space_penalty + cache_table[i - 1][j],  # delete from doc
                    space_penalty + cache_table[i][j - 1],  # insert into doc
                    miss_penalty,                           # match/substitute
                )
        lev_dist = cache_table[doc_len][pattern_len]
        # Normalize the distance to a similarity in [0, 1]; cast to a plain
        # float so all values in the result share one type.
        similarity_score[key] = float(
            (pattern_len + doc_len - lev_dist) / (pattern_len + doc_len))
    return similarity_score
if __name__ == '__main__':
    # Demo: score several documents against a reference pattern.
    pattern = 'this is a test for fuzzy wuzzy match'
    docs = [
        'a test for fuzzy match',
        'test fuzzy matching',
        'this is a test for fuzzy wuzzy match',
        'this is test for fuzy wuzy match',
        'this is a for fuzzy wuzzy match',
    ]
    similarity_score = levenshtein_distance(pattern, docs)
    pprint(similarity_score)
    # Expected output:
    # {'doc1': 0.7586206896551724,
    #  'doc2': 0.5818181818181818,
    #  'doc3': 1.0,
    #  'doc4': 0.9411764705882353,
    #  'doc5': 0.9253731343283582}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment