Created
October 6, 2019 20:22
-
-
Save Adrianval96/c931555796b95128401d34e9cf74159a to your computer and use it in GitHub Desktop.
Module that implements a levenshtein distance algorithm to check for word similarity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
def levenshtein_ratio_and_distance(s, t, ratio_calc = False): | |
""" levenshtein_ratio_and_distance: | |
Calculates levenshtein distance between two strings. | |
If ratio_calc = True, the function computes the | |
levenshtein distance ratio of similarity between two strings | |
For all i and j, distance[i,j] will contain the Levenshtein | |
distance between the first i characters of s and the | |
first j characters of t | |
""" | |
# Initialize matrix of zeros | |
rows = len(s)+1 | |
cols = len(t)+1 | |
distance = np.zeros((rows,cols),dtype = int) | |
# Populate matrix of zeros with the indeces of each character of both strings | |
for i in range(1, rows): | |
for k in range(1,cols): | |
distance[i][0] = i | |
distance[0][k] = k | |
# Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions | |
for col in range(1, cols): | |
for row in range(1, rows): | |
if s[row-1] == t[col-1]: | |
cost = 0 # If the characters are the same in the two strings in a given position [i,j] then the cost is 0 | |
else: | |
# In order to align the results with those of the Python Levenshtein package, if we choose to calculate the ratio | |
# the cost of a substitution is 2. If we calculate just distance, then the cost of a substitution is 1. | |
if ratio_calc == True: | |
cost = 2 | |
else: | |
cost = 1 | |
distance[row][col] = min(distance[row-1][col] + 1, # Cost of deletions | |
distance[row][col-1] + 1, # Cost of insertions | |
distance[row-1][col-1] + cost) # Cost of substitutions | |
if ratio_calc == True: | |
# Computation of the Levenshtein Distance Ratio | |
Ratio = ((len(s)+len(t)) - distance[row][col]) / (len(s)+len(t)) | |
return Ratio | |
else: | |
# print(distance) # Uncomment if you want to see the matrix showing how the algorithm computes the cost of deletions, | |
# insertions and/or substitutions | |
# This is the minimum number of edits needed to convert string a to string b | |
return distance[row][col] | |
#return "The strings are {} edits away".format(distance[row][col]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import levenshtein_distance as ld | |
class Dictionary: | |
def __init__(self, words): | |
self.words = words | |
def find_most_similar(self, term): | |
best_word = {"name": "", "similarity": 100} | |
for word in self.words: | |
res = ld.levenshtein_ratio_and_distance(word, term, ratio_calc=False) | |
if res < best_word.get("similarity"): | |
best_word["name"] = word | |
best_word["similarity"] = res | |
return best_word.get("name") | |
def __init__(): | |
fruits = Dictionary(['cherry', 'pineapple', 'melon', 'strawberry', 'raspberry']) | |
print(fruits.find_most_similar('strawbery')) | |
print(fruits.find_most_similar('berry')) | |
things = Dictionary(['stars', 'mars', 'wars', 'codec', 'codewars']) | |
print(things.find_most_similar('coddwars')) | |
languages = Dictionary(['javascript', 'java', 'ruby', 'php', 'python', 'coffeescript']) | |
print(languages.find_most_similar('heaven')) | |
print(languages.find_most_similar('javascript')) | |
__init__() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment