Created
November 15, 2018 20:09
-
-
Save zurk/b37f630fa8232732f1e9fe08ef27b767 to your computer and use it in GitHub Desktop.
Liechtenstein distance calculation with edit sequence. Align 3 sequences.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Action: | |
equal = 0 | |
replace = 1 | |
insert = 2 | |
delete = 3 | |
Action.names = {i: name for name, i in Action.__dict__.items() if name[0] != "_"} | |
def align3(seq1, seq2, seq3): | |
aseq1, aseq2 = align(seq1, seq2) | |
bseq2, bseq3 = align(seq2, seq3) | |
cseq1, cseq3 = align(aseq1, bseq3) | |
dseq2, dseq3 = align(bseq2, cseq3) | |
return cseq1, dseq2, dseq3 | |
def align(seq1, seq2): | |
for start, s in enumerate(zip(seq1, seq2)): | |
if s[0] != s[1]: | |
break | |
for end, s in enumerate(zip(seq1[::-1], seq2[::-1])): | |
if s[0] != s[1]: | |
break | |
seq1_mid = seq1[start:end] | |
seq2_mid = seq2[start:end] | |
op_codes = levenshtein(seq1_mid, seq2_mid, with_equal=True) | |
res1 = [] | |
res2 = [] | |
for action, i, j in op_codes: | |
if action == "equal" or action == "replace": | |
res1.append(seq1_mid[i]) | |
res2.append(seq2_mid[j]) | |
if action == "insert": | |
res1.append(EMPTY) | |
res2.append(seq2_mid[j]) | |
if action == "delete": | |
res1.append(seq1_mid[i]) | |
res2.append(EMPTY) | |
return seq1[:start] + "".join(res1) + seq1[end:], seq2[:start] + "".join(res2) + seq2[end:] | |
def get_opcodes(actions: numpy.ndarray, with_equal): | |
""" | |
Build the best path opcodes for given a actions matrix. | |
""" | |
i = actions.shape[0] - 1 | |
j = actions.shape[1] - 1 | |
opcodes = [] | |
# from matplotlib import pyplot as plt | |
# actions2 = actions / 255.0 | |
# actions2[actions2 == 1] = float("nan") | |
# plt.matshow(actions2) | |
# plt.show() | |
while i != 0 or j != 0: | |
action = actions[i][j] | |
if with_equal or action != Action.equal: | |
opcodes.append((Action.names[action], i-1, j-1)) | |
if action == Action.equal or action == Action.replace: | |
i -= 1 | |
j -= 1 | |
elif action == Action.insert: | |
j -= 1 | |
else: | |
i -= 1 | |
return opcodes[::-1] | |
def levenshtein(seq1, seq2, with_equal=False): | |
""" | |
How to transform seq1 to seq2 | |
:param seq1: | |
:param seq2: | |
:return: | |
""" | |
max_distance = 500 | |
if seq1 == seq2: | |
return [(Action.names[Action.equal], i, i) for i in range(len(seq1))] if with_equal else [] | |
v0 = numpy.arange(start=0, stop=len(seq2) + 1, step=1, dtype=numpy.uint32) | |
v1 = numpy.empty(shape=(len(seq2) + 1,), dtype=numpy.uint32) | |
actions = numpy.zeros(shape=(len(seq1) + 1, len(seq2) + 1), dtype=numpy.uint8) | |
# default action is equal | |
actions[0, :] = Action.insert | |
actions[:, 0] = Action.delete | |
for i, seq1_i in enumerate(seq1): | |
v1[0] = i | |
for j, seq2_j in enumerate(seq2): | |
if abs(i - j) > max_distance: | |
# Optimization | |
v1[j + 1] = max_distance + 1 | |
continue | |
cost = 0 if seq1_i == seq2_j else 1 | |
replace_cost = v0[j] + cost | |
insert_cost = v1[j] + 1 | |
delete_cost = v0[j+1] + 1 | |
if replace_cost <= insert_cost and replace_cost <= delete_cost: | |
v1[j+1] = replace_cost | |
if cost: | |
actions[i+1, j+1] = Action.replace | |
elif insert_cost <= delete_cost: | |
v1[j+1] = insert_cost | |
actions[i+1, j+1] = Action.insert | |
else: | |
v1[j+1] = delete_cost | |
actions[i+1, j+1] = Action.delete | |
v0, v1 = v1, v0 | |
return get_opcodes(actions, with_equal=with_equal) | |
def metrics(init, correct, model_out): | |
init, correct, model_out = align3(init, correct, model_out) | |
detected_bad_change = 0 | |
detected_good_change = 0 | |
misdetection = 0 | |
undetected = 0 | |
for init_c, correct_c, model_out_c in zip(init, correct, model_out): | |
if init_c == correct_c == model_out_c: | |
continue | |
elif init_c == correct_c and init_c != model_out_c: | |
misdetection += 1 | |
elif init_c == model_out_c and init_c != correct_c: | |
undetected += 1 | |
elif correct_c == model_out_c and init_c != correct_c: | |
detected_good_change += 1 | |
else: | |
detected_bad_change += 1 | |
return misdetection, undetected, detected_bad_change, detected_good_change |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment