Skip to content

Instantly share code, notes, and snippets.

@Adrianval96
Created October 6, 2019 20:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Adrianval96/c931555796b95128401d34e9cf74159a to your computer and use it in GitHub Desktop.
Save Adrianval96/c931555796b95128401d34e9cf74159a to your computer and use it in GitHub Desktop.
Module that implements a levenshtein distance algorithm to check for word similarity
import numpy as np
def levenshtein_ratio_and_distance(s, t, ratio_calc = False):
""" levenshtein_ratio_and_distance:
Calculates levenshtein distance between two strings.
If ratio_calc = True, the function computes the
levenshtein distance ratio of similarity between two strings
For all i and j, distance[i,j] will contain the Levenshtein
distance between the first i characters of s and the
first j characters of t
"""
# Initialize matrix of zeros
rows = len(s)+1
cols = len(t)+1
distance = np.zeros((rows,cols),dtype = int)
# Populate matrix of zeros with the indeces of each character of both strings
for i in range(1, rows):
for k in range(1,cols):
distance[i][0] = i
distance[0][k] = k
# Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions
for col in range(1, cols):
for row in range(1, rows):
if s[row-1] == t[col-1]:
cost = 0 # If the characters are the same in the two strings in a given position [i,j] then the cost is 0
else:
# In order to align the results with those of the Python Levenshtein package, if we choose to calculate the ratio
# the cost of a substitution is 2. If we calculate just distance, then the cost of a substitution is 1.
if ratio_calc == True:
cost = 2
else:
cost = 1
distance[row][col] = min(distance[row-1][col] + 1, # Cost of deletions
distance[row][col-1] + 1, # Cost of insertions
distance[row-1][col-1] + cost) # Cost of substitutions
if ratio_calc == True:
# Computation of the Levenshtein Distance Ratio
Ratio = ((len(s)+len(t)) - distance[row][col]) / (len(s)+len(t))
return Ratio
else:
# print(distance) # Uncomment if you want to see the matrix showing how the algorithm computes the cost of deletions,
# insertions and/or substitutions
# This is the minimum number of edits needed to convert string a to string b
return distance[row][col]
#return "The strings are {} edits away".format(distance[row][col])
import levenshtein_distance as ld
class Dictionary:
def __init__(self, words):
self.words = words
def find_most_similar(self, term):
best_word = {"name": "", "similarity": 100}
for word in self.words:
res = ld.levenshtein_ratio_and_distance(word, term, ratio_calc=False)
if res < best_word.get("similarity"):
best_word["name"] = word
best_word["similarity"] = res
return best_word.get("name")
def __init__():
fruits = Dictionary(['cherry', 'pineapple', 'melon', 'strawberry', 'raspberry'])
print(fruits.find_most_similar('strawbery'))
print(fruits.find_most_similar('berry'))
things = Dictionary(['stars', 'mars', 'wars', 'codec', 'codewars'])
print(things.find_most_similar('coddwars'))
languages = Dictionary(['javascript', 'java', 'ruby', 'php', 'python', 'coffeescript'])
print(languages.find_most_similar('heaven'))
print(languages.find_most_similar('javascript'))
__init__()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment