Adrianval96/levenshtein_distance.py

## levenshtein_distance.py
import numpy as np

def levenshtein_ratio_and_distance(s, t, ratio_calc = False):
    """ levenshtein_ratio_and_distance:
        Calculates levenshtein distance between two strings.
        If ratio_calc = True, the function computes the
        levenshtein distance ratio of similarity between two strings
        For all i and j, distance[i,j] will contain the Levenshtein
        distance between the first i characters of s and the
        first j characters of t
    """
    # Initialize matrix of zeros
    rows = len(s)+1
    cols = len(t)+1
    distance = np.zeros((rows,cols),dtype = int)

    # Populate matrix of zeros with the indeces of each character of both strings
    for i in range(1, rows):
        for k in range(1,cols):
            distance[i][0] = i
            distance[0][k] = k

    # Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions
    for col in range(1, cols):
        for row in range(1, rows):
            if s[row-1] == t[col-1]:
                cost = 0 # If the characters are the same in the two strings in a given position [i,j] then the cost is 0
            else:
                # In order to align the results with those of the Python Levenshtein package, if we choose to calculate the ratio
                # the cost of a substitution is 2. If we calculate just distance, then the cost of a substitution is 1.
                if ratio_calc == True:
                    cost = 2
                else:
                    cost = 1
            distance[row][col] = min(distance[row-1][col] + 1,      # Cost of deletions
                                 distance[row][col-1] + 1,          # Cost of insertions
                                 distance[row-1][col-1] + cost)     # Cost of substitutions
    if ratio_calc == True:
        # Computation of the Levenshtein Distance Ratio
        Ratio = ((len(s)+len(t)) - distance[row][col]) / (len(s)+len(t))
        return Ratio
    else:
        # print(distance) # Uncomment if you want to see the matrix showing how the algorithm computes the cost of deletions,
        # insertions and/or substitutions
        # This is the minimum number of edits needed to convert string a to string b
        return distance[row][col]
        #return "The strings are {} edits away".format(distance[row][col])

## word_similarity.py
import levenshtein_distance as ld

class Dictionary:
    def __init__(self, words):
        self.words = words

    def find_most_similar(self, term):
        best_word = {"name": "", "similarity": 100}
        for word in self.words:
            res = ld.levenshtein_ratio_and_distance(word, term, ratio_calc=False)
            if res < best_word.get("similarity"):
                best_word["name"] = word
                best_word["similarity"] = res
        return best_word.get("name")

def __init__():
    fruits = Dictionary(['cherry', 'pineapple', 'melon', 'strawberry', 'raspberry'])
    print(fruits.find_most_similar('strawbery'))
    print(fruits.find_most_similar('berry'))

    things = Dictionary(['stars', 'mars', 'wars', 'codec', 'codewars'])
    print(things.find_most_similar('coddwars'))

    languages = Dictionary(['javascript', 'java', 'ruby', 'php', 'python', 'coffeescript'])
    print(languages.find_most_similar('heaven'))
    print(languages.find_most_similar('javascript'))

__init__()
	import numpy as np

	def levenshtein_ratio_and_distance(s, t, ratio_calc = False):
	""" levenshtein_ratio_and_distance:
	Calculates levenshtein distance between two strings.
	If ratio_calc = True, the function computes the
	levenshtein distance ratio of similarity between two strings
	For all i and j, distance[i,j] will contain the Levenshtein
	distance between the first i characters of s and the
	first j characters of t
	"""
	# Initialize matrix of zeros
	rows = len(s)+1
	cols = len(t)+1
	distance = np.zeros((rows,cols),dtype = int)

	# Populate matrix of zeros with the indeces of each character of both strings
	for i in range(1, rows):
	for k in range(1,cols):
	distance[i][0] = i
	distance[0][k] = k

	# Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions
	for col in range(1, cols):
	for row in range(1, rows):
	if s[row-1] == t[col-1]:
	cost = 0 # If the characters are the same in the two strings in a given position [i,j] then the cost is 0
	else:
	# In order to align the results with those of the Python Levenshtein package, if we choose to calculate the ratio
	# the cost of a substitution is 2. If we calculate just distance, then the cost of a substitution is 1.
	if ratio_calc == True:
	cost = 2
	else:
	cost = 1
	distance[row][col] = min(distance[row-1][col] + 1, # Cost of deletions
	distance[row][col-1] + 1, # Cost of insertions
	distance[row-1][col-1] + cost) # Cost of substitutions
	if ratio_calc == True:
	# Computation of the Levenshtein Distance Ratio
	Ratio = ((len(s)+len(t)) - distance[row][col]) / (len(s)+len(t))
	return Ratio
	else:
	# print(distance) # Uncomment if you want to see the matrix showing how the algorithm computes the cost of deletions,
	# insertions and/or substitutions
	# This is the minimum number of edits needed to convert string a to string b
	return distance[row][col]
	#return "The strings are {} edits away".format(distance[row][col])
	import levenshtein_distance as ld

	class Dictionary:
	def __init__(self, words):
	self.words = words

	def find_most_similar(self, term):
	best_word = {"name": "", "similarity": 100}
	for word in self.words:
	res = ld.levenshtein_ratio_and_distance(word, term, ratio_calc=False)
	if res < best_word.get("similarity"):
	best_word["name"] = word
	best_word["similarity"] = res
	return best_word.get("name")

	def __init__():
	fruits = Dictionary(['cherry', 'pineapple', 'melon', 'strawberry', 'raspberry'])
	print(fruits.find_most_similar('strawbery'))
	print(fruits.find_most_similar('berry'))

	things = Dictionary(['stars', 'mars', 'wars', 'codec', 'codewars'])
	print(things.find_most_similar('coddwars'))

	languages = Dictionary(['javascript', 'java', 'ruby', 'php', 'python', 'coffeescript'])
	print(languages.find_most_similar('heaven'))
	print(languages.find_most_similar('javascript'))

	__init__()