Created
February 8, 2011 20:01
-
-
Save johnpena/817090 to your computer and use it in GitHub Desktop.
Levenshtein distance gives you a basic idea of how similar two strings are.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def levenshtein_distance(first, second): | |
""" | |
Find the Levenshtein distance between two strings. | |
Levenshtein distance gives you a basic idea of how similar two strings are. | |
""" | |
if len(first) > len(second): | |
first, second = second, first | |
if len(second) == 0: | |
return len(first) | |
first_length = len(first) + 1 | |
second_length = len(second) + 1 | |
distance_matrix = [[0] * second_length for x in xrange(first_length)] | |
for i in xrange(first_length): | |
distance_matrix[i][0] = i | |
for j in xrange(second_length): | |
distance_matrix[0][j]=j | |
for i in xrange(1, first_length): | |
for j in xrange(1, second_length): | |
deletion = distance_matrix[i-1][j] + 1 | |
insertion = distance_matrix[i][j-1] + 1 | |
substitution = distance_matrix[i-1][j-1] | |
if first[i-1] != second[j-1]: | |
substitution += 1 | |
distance_matrix[i][j] = min(insertion, deletion, substitution) | |
return distance_matrix[first_length-1][second_length-1] | |
def lcs_len(s1, s2): | |
"""Finds the length of the longest common substring between s1 and s2""" | |
m = len(s1) | |
n = len(s2) | |
L = [[0] * (n+1) for i in xrange(m+1)] | |
lcs = 0 | |
for i in xrange(m): | |
for j in xrange(n): | |
if s1[i] == s2[j]: | |
L[i+1][j+1] = L[i][j] + 1 | |
lcs = max(lcs, L[i+1][j+1]) | |
return lcs | |
def lcs_set(s1, s2): | |
"""Finds the set of longest common substrings in s1 and s2""" | |
m = len(s1) | |
n = len(s2) | |
L = [[0] * (n+1) for i in xrange(m+1)] | |
LCS = set() | |
longest = 0 | |
for i in xrange(m): | |
for j in xrange(n): | |
if s1[i] == s2[j]: | |
v = L[i][j] + 1 | |
L[i+1][j+1] = v | |
if v > longest: | |
longest = v | |
LCS = set() | |
if v == longest: | |
LCS.add(s1[i-v+1:i+1]) | |
return LCS |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment