This gist does fuzzy matching of documents based on the Levenshtein distance and returns a similarity score. Time and space complexities are O(mn), where m and n are the lengths of the two documents being matched.
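As a quick illustration of how the distance turns into a score, here is a minimal sketch of the normalisation used in the code below; the strings "cat"/"cats" are made up for the example and are not part of the gist.

# Levenshtein distance counts the insertions, deletions and substitutions
# needed to turn one string into the other; the gist converts it to a
# similarity in [0, 1] via (m + n - distance) / (m + n).
pattern, doc = "cat", "cats"   # distance is 1 (one insertion)
m, n = len(pattern), len(doc)
distance = 1
similarity = (m + n - distance) / (m + n)   # (3 + 4 - 1) / 7 ≈ 0.857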
import numpy as np
from pprint import pprint


def levenshtein_distance(pattern, docs, ignore_case=True) -> dict:
    """Do a fuzzy matching of documents using Levenshtein distance.

    Parameters
    ----------
    pattern : str
        The document you want to match.
    docs : list
        The documents to match against.
    ignore_case : bool, optional
        Whether to ignore case while matching (default is True).

    Returns
    -------
    dict
        A dictionary of similarity scores of all documents with the pattern,
        in the order they were passed in docs.
    """
    if ignore_case:
        pattern = pattern.lower()

    pattern_len = len(pattern)
    similarity_score = {}
    count = 1

    for doc in docs:
        if ignore_case:
            doc = doc.lower()

        # Identical documents get a perfect score without building the table.
        if pattern == doc:
            similarity_score['doc' + str(count)] = 1.0
            count += 1
            continue

        doc_len = len(doc)
        # DP table: cache_table[i][j] is the edit distance between the first
        # i characters of doc and the first j characters of pattern.
        cache_table = np.empty(shape=(doc_len + 1, pattern_len + 1))

        cache_table[0][0] = 0
        space_penalty = 1

        # Base cases: transforming an empty string costs one insertion per character.
        for j in range(1, pattern_len + 1):
            cache_table[0][j] = space_penalty * j
        for i in range(1, doc_len + 1):
            cache_table[i][0] = space_penalty * i

        # Fill the table: each cell takes the cheapest of deletion, insertion,
        # or substitution (free when the characters already match).
        for i in range(1, doc_len + 1):
            for j in range(1, pattern_len + 1):
                miss_penalty = cache_table[i - 1][j - 1]
                if pattern[j - 1] != doc[i - 1]:
                    miss_penalty += 1
                cache_table[i][j] = min([space_penalty + cache_table[i - 1][j],
                                         space_penalty + cache_table[i][j - 1],
                                         miss_penalty])

        lev_dist = cache_table[doc_len][pattern_len]
        # Normalise the distance into a similarity score in [0, 1].
        similarity_score['doc' + str(count)] = (pattern_len + doc_len -
                                                lev_dist) / (pattern_len + doc_len)
        count += 1

    return similarity_score
if __name__ == '__main__':
    pattern = 'this is a test for fuzzy wuzzy match'

    docs = ['a test for fuzzy match', 'test fuzzy matching', 'this is a test for fuzzy wuzzy match',
            'this is test for fuzy wuzy match', 'this is a for fuzzy wuzzy match']

    similarity_score = levenshtein_distance(pattern, docs)
    pprint(similarity_score)

    # output ->
    # {'doc1': 0.7586206896551724,
    #  'doc2': 0.5818181818181818,
    #  'doc3': 1.0,
    #  'doc4': 0.9411764705882353,
    #  'doc5': 0.9253731343283582}
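Since the function only returns generic 'docN' keys, one way to recover the closest document itself is sketched below; the helper name best_match and the key-to-index mapping are my own illustration and are not part of the gist.

# Hypothetical helper (not in the gist): map the highest-scoring
# 'docN' key back to the original document and its score.
def best_match(pattern, docs):
    scores = levenshtein_distance(pattern, docs)
    best_key = max(scores, key=scores.get)                 # e.g. 'doc3'
    best_doc = docs[int(best_key.replace('doc', '')) - 1]  # keys are 1-indexed
    return best_doc, scores[best_key]

# best_match('this is a test for fuzzy wuzzy match', docs)
# -> ('this is a test for fuzzy wuzzy match', 1.0)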