Skip to content

Instantly share code, notes, and snippets.

@johnpena
Created February 8, 2011 20:01
Show Gist options
  • Save johnpena/817090 to your computer and use it in GitHub Desktop.
Save johnpena/817090 to your computer and use it in GitHub Desktop.
Levenshtein distance gives you a basic idea of how similar two strings are.
def levenshtein_distance(first, second):
"""
Find the Levenshtein distance between two strings.
Levenshtein distance gives you a basic idea of how similar two strings are.
"""
if len(first) > len(second):
first, second = second, first
if len(second) == 0:
return len(first)
first_length = len(first) + 1
second_length = len(second) + 1
distance_matrix = [[0] * second_length for x in xrange(first_length)]
for i in xrange(first_length):
distance_matrix[i][0] = i
for j in xrange(second_length):
distance_matrix[0][j]=j
for i in xrange(1, first_length):
for j in xrange(1, second_length):
deletion = distance_matrix[i-1][j] + 1
insertion = distance_matrix[i][j-1] + 1
substitution = distance_matrix[i-1][j-1]
if first[i-1] != second[j-1]:
substitution += 1
distance_matrix[i][j] = min(insertion, deletion, substitution)
return distance_matrix[first_length-1][second_length-1]
def lcs_len(s1, s2):
"""Finds the length of the longest common substring between s1 and s2"""
m = len(s1)
n = len(s2)
L = [[0] * (n+1) for i in xrange(m+1)]
lcs = 0
for i in xrange(m):
for j in xrange(n):
if s1[i] == s2[j]:
L[i+1][j+1] = L[i][j] + 1
lcs = max(lcs, L[i+1][j+1])
return lcs
def lcs_set(s1, s2):
"""Finds the set of longest common substrings in s1 and s2"""
m = len(s1)
n = len(s2)
L = [[0] * (n+1) for i in xrange(m+1)]
LCS = set()
longest = 0
for i in xrange(m):
for j in xrange(n):
if s1[i] == s2[j]:
v = L[i][j] + 1
L[i+1][j+1] = v
if v > longest:
longest = v
LCS = set()
if v == longest:
LCS.add(s1[i-v+1:i+1])
return LCS
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment