Last active
August 28, 2020 06:00
-
-
Save greatsharma/894c032b214c5116597b3051a64570df to your computer and use it in GitHub Desktop.
Space Optimal Fuzzy Matching: This gist does a fuzzy matching of documents on the basis of Levenshtein distance and returns a similarity score. This does the same as my previous gist https://gist.github.com/greatsharma/eeb22285a837a9b29431179451d0ba7f with a time complexity of O(mn), but the space complexity is now reduced to O(m), i.e., linear space,…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gc | |
import numpy as np | |
from pprint import pprint | |
def levenshtein_distance_optimal(pattern, docs, ignore_case=True) -> dict:
    """Fuzzy-match documents against a pattern using Levenshtein distance.

    Uses the two-row dynamic-programming formulation of the
    Wagner-Fischer algorithm, so memory is O(len(pattern)) instead of
    the full O(m*n) matrix.

    Parameters
    ----------
    pattern : str
        The document which you want to match
    docs : list
        The documents which you want to match with
    ignore_case : bool, optional
        When True (the default), matching is case-insensitive.

    Returns
    -------
    dict
        Keys 'doc1', 'doc2', ... in the order passed in `docs`, each
        mapped to a similarity score in [0, 1] where 1.0 means identical.
    """
    if ignore_case:
        pattern = pattern.lower()
    pattern_len = len(pattern)
    similarity_score = {}
    space_penalty = 1  # cost of one insertion or deletion (a gap)

    for count, doc in enumerate(docs, start=1):
        if ignore_case:
            doc = doc.lower()

        # Fast path: identical strings need no DP at all.
        if pattern == doc:
            similarity_score['doc' + str(count)] = 1.0
            continue

        doc_len = len(doc)
        # cache[j] = edit distance between doc[:i-1] and pattern[:j]
        # (the previous DP row); row i == 0 is all gaps.
        cache = [space_penalty * j for j in range(pattern_len + 1)]

        for i in range(1, doc_len + 1):
            current = [0] * (pattern_len + 1)
            current[0] = cache[0] + space_penalty
            for j in range(1, pattern_len + 1):
                miss_penalty = cache[j - 1]
                if pattern[j - 1] != doc[i - 1]:
                    miss_penalty += 1  # substitution cost
                current[j] = min(space_penalty + cache[j],        # deletion
                                 space_penalty + current[j - 1],  # insertion
                                 miss_penalty)                    # match/sub
            # Rebinding drops the previous row; the original's explicit
            # `del` + `gc.collect` was a no-op bug (missing parentheses)
            # and is unnecessary — CPython frees the old list here anyway.
            cache = current

        lev_dist = cache[pattern_len]
        # Normalised similarity: 1 - dist / (len(pattern) + len(doc)).
        similarity_score['doc' + str(count)] = (
            (pattern_len + doc_len - lev_dist) / float(pattern_len + doc_len)
        )

    return similarity_score
if __name__ == '__main__':
    # Demo: score several candidate sentences against one reference query.
    query = 'this is a test for fuzzy wuzzy match'
    candidates = [
        'a test for fuzzy match',
        'test fuzzy matching',
        'this is a test for fuzzy wuzzy match',
        'this is test for fuzy wuzy match',
        'this is a for fuzzy wuzzy match',
    ]
    scores = levenshtein_distance_optimal(query, candidates)
    pprint(scores)
    # output ->
    # {'doc1': 0.7586206896551724,
    #  'doc2': 0.5818181818181818,
    #  'doc3': 1.0,
    #  'doc4': 0.9411764705882353,
    #  'doc5': 0.9253731343283582}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment