Skip to content

Instantly share code, notes, and snippets.

# lopespm/levenshtein_regex_recursion_memo.py Last active May 1, 2019

Levenshtein distance between regex expression and target string - Recursion with memoization (Python)
 # (Variant #4 for exercise 16.2 on EPI (Elements of Programming Interviews)) (September 2018 edition) # The core idea is calculate the levenshtein distance, while taking into account the special cases of the regex expression # *, +, ? and . were taken into account for the regex expression. Expression blocks are not supported # This algorithm uses recursion with memoization (could be transposed to a DP solution), yielding a O(mn) time complexity, O(mn) auxiliary space for cache and O(max(m,n)) function call stack # (m and n are the lengths of regex and target strings respectively) # # Version using dynamic programming: https://gist.github.com/lopespm/2362a77e7bd230a4622a43709c195826 def regex_dist(regex: str, target: str): def regex_dist_aux(r_i, t_i): if (r_i == -1 and t_i == -1): return 0 if (r_i == -1): return t_i + 1 if (t_i == -1): i, counter = r_i, 0 while i >= 0: char = regex[i] i -= 2 if (char == '?' or char == '*' or char == '+') else 1 counter += 1 if (not (char == '?' or char == '*')) else 0 return counter if (memo[r_i][t_i] is not None): return memo[r_i][t_i] # Regex special cases if (regex[r_i] == '.'): memo[r_i][t_i] = regex_dist_aux(r_i - 1, t_i - 1) return memo[r_i][t_i] if (regex[r_i] == '+' or regex[r_i] == '*' or regex[r_i] == '?'): if (regex[r_i-1] == target[t_i]): if (regex[r_i] == '?'): memo[r_i][t_i] = regex_dist_aux(r_i - 2, t_i - 1) else: memo[r_i][t_i] = min(regex_dist_aux(r_i - 2, t_i - 1), regex_dist_aux(r_i, t_i - 1)) else: additional_cost = 1 if (regex[r_i] == '+') else 0 memo[r_i][t_i] = min(regex_dist_aux(r_i - 2, t_i - 1) + 1, regex_dist_aux(r_i, t_i - 1) + 1, regex_dist_aux(r_i - 2, t_i) + additional_cost) return memo[r_i][t_i] # Other characters if (regex[r_i] == target[t_i]): memo[r_i][t_i] = regex_dist_aux(r_i - 1, t_i - 1) else: memo[r_i][t_i] = min(regex_dist_aux(r_i - 1, t_i - 1) + 1, regex_dist_aux(r_i, t_i - 1) + 1, regex_dist_aux(r_i - 1, t_i) + 1) return memo[r_i][t_i] memo = [[None] * (len(target) + 1) for _ in range(len(regex) + 1)] return regex_dist_aux(len(regex) - 1, len(target) -1) # Some examples print(regex_dist("OrchestraaaQA+a", "CarthorseQAAAA")) print(regex_dist("A+b", "AAb")) print(regex_dist("A+b", "AAAAAb")) print(regex_dist("A+b", "AAAAAb03b")) print(regex_dist("A..b", "AAAAAb03b")) print(regex_dist("q+", "A")) print(regex_dist("q+a?a+", "A")) print(regex_dist("q+a?a+A+", "A")) print(regex_dist("q+a?a+A+.", "A")) print(regex_dist("q+A", "AAAAAb03b"))
to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.