Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Levenshtein distance between regex expression and target string - Dynamic Programming (Python)
# (Variant #4 for exercise 16.2 on EPI (Elements of Programming Interviews)) (September 2018 edition)
# The core idea is calculate the levenshtein distance, while taking into account the special cases of the regex expression
# *, +, ? and . were taken into account for the regex expression. Expression blocks are not supported
# This algorithm uses dynamic programming, yielding a O(mn) time complexity, O(m) auxiliary space for cache
# (m and n are the lengths of regex and target strings respectively)
#
# Version using recursion with memoization: https://gist.github.com/lopespm/53a215d0b2b0518b52b6bb6687bdaff6
def regex_dist(regex: str, target: str):
current = [None] * (len(regex) + 1)
for t_i in range(len(target) + 1):
prev = [item for item in current]
for r_i in range(len(regex) + 1):
if (r_i == 0 and t_i == 0):
current[r_i] = 0
elif (r_i == 0):
current[r_i] = t_i
elif (t_i == 0):
i, counter = r_i - 1, 0
while i >= 0:
char = regex[i]
i -= 2 if (char == '?' or char == '*' or char == '+') else 1
counter += 1 if (not (char == '?' or char == '*')) else 0
current[r_i] = counter
# Regex special cases
elif (regex[r_i-1] == '.'):
current[r_i] = prev[r_i - 1]
elif (regex[r_i-1] == '+' or regex[r_i-1] == '*' or regex[r_i-1] == '?'):
if (regex[r_i-1-1] == target[t_i-1]):
if (regex[r_i-1] == '?'):
current[r_i] = prev[r_i - 2]
else:
current[r_i] = min(prev[r_i - 2], prev[r_i])
else:
additional_cost = 1 if (regex[r_i-1] == '+') else 0
current[r_i] = min(prev[r_i - 2] + 1,
prev[r_i] + 1,
current[r_i - 2] + additional_cost)
# Other characters
elif (regex[r_i-1] == target[t_i-1]):
current[r_i] = prev[r_i - 1]
else:
current[r_i] = min(prev[r_i - 1] + 1,
prev[r_i] + 1,
current[r_i - 1] + 1)
return current[-1]
# Some examples
print(regex_dist("OrchestraaaQA+a", "CarthorseQAAAA"))
print(regex_dist("A+b", "AAb"))
print(regex_dist("A+b", "AAAAAb"))
print(regex_dist("A+b", "AAAAAb03b"))
print(regex_dist("A..b", "AAAAAb03b"))
print(regex_dist("q+", "A"))
print(regex_dist("q+a?a+", "A"))
print(regex_dist("q+a?a+A+", "A"))
print(regex_dist("q+a?a+A+.", "A"))
print(regex_dist("q+A", "AAAAAb03b"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.