zurk/levenshtein.py

## levenshtein.py
class Action:
    equal = 0
    replace = 1
    insert = 2
    delete = 3


Action.names = {i: name for name, i in Action.__dict__.items() if name[0] != "_"}


def align3(seq1, seq2, seq3):
    aseq1, aseq2 = align(seq1, seq2)
    bseq2, bseq3 = align(seq2, seq3)
    cseq1, cseq3 = align(aseq1, bseq3)
    dseq2, dseq3 = align(bseq2, cseq3)
    return cseq1, dseq2, dseq3


def align(seq1, seq2):
    for start, s in enumerate(zip(seq1, seq2)):
        if s[0] != s[1]:
            break
    for end, s in enumerate(zip(seq1[::-1], seq2[::-1])):
        if s[0] != s[1]:
            break

    seq1_mid = seq1[start:end]
    seq2_mid = seq2[start:end]

    op_codes = levenshtein(seq1_mid, seq2_mid, with_equal=True)
    res1 = []
    res2 = []
    for action, i, j in op_codes:
        if action == "equal" or action == "replace":
            res1.append(seq1_mid[i])
            res2.append(seq2_mid[j])
        if action == "insert":
            res1.append(EMPTY)
            res2.append(seq2_mid[j])
        if action == "delete":
            res1.append(seq1_mid[i])
            res2.append(EMPTY)

    return seq1[:start] + "".join(res1) + seq1[end:], seq2[:start] + "".join(res2) + seq2[end:]


def get_opcodes(actions: numpy.ndarray, with_equal):
    """
    Build the best path opcodes for given a actions matrix.
    """
    i = actions.shape[0] - 1
    j = actions.shape[1] - 1
    opcodes = []
    # from matplotlib import pyplot as plt
    # actions2 = actions / 255.0
    # actions2[actions2 == 1] = float("nan")
    # plt.matshow(actions2)
    # plt.show()
    while i != 0 or j != 0:
        action = actions[i][j]
        if with_equal or action != Action.equal:
            opcodes.append((Action.names[action], i-1, j-1))
        if action == Action.equal or action == Action.replace:
            i -= 1
            j -= 1
        elif action == Action.insert:
            j -= 1
        else:
            i -= 1
    return opcodes[::-1]


def levenshtein(seq1, seq2, with_equal=False):
    """
    How to transform seq1 to seq2
    :param seq1:
    :param seq2:
    :return:
    """
    max_distance = 500

    if seq1 == seq2:
        return [(Action.names[Action.equal], i, i) for i in range(len(seq1))] if with_equal else []

    v0 = numpy.arange(start=0, stop=len(seq2) + 1, step=1, dtype=numpy.uint32)
    v1 = numpy.empty(shape=(len(seq2) + 1,), dtype=numpy.uint32)
    actions = numpy.zeros(shape=(len(seq1) + 1, len(seq2) + 1), dtype=numpy.uint8)
    # default action is equal

    actions[0, :] = Action.insert
    actions[:, 0] = Action.delete

    for i, seq1_i in enumerate(seq1):
        v1[0] = i
        for j, seq2_j in enumerate(seq2):
            if abs(i - j) > max_distance:
                # Optimization
                v1[j + 1] = max_distance + 1
                continue
            cost = 0 if seq1_i == seq2_j else 1
            replace_cost = v0[j] + cost
            insert_cost = v1[j] + 1
            delete_cost = v0[j+1] + 1

            if replace_cost <= insert_cost and replace_cost <= delete_cost:
                v1[j+1] = replace_cost
                if cost:
                    actions[i+1, j+1] = Action.replace
            elif insert_cost <= delete_cost:
                v1[j+1] = insert_cost
                actions[i+1, j+1] = Action.insert
            else:
                v1[j+1] = delete_cost
                actions[i+1, j+1] = Action.delete
        v0, v1 = v1, v0

    return get_opcodes(actions, with_equal=with_equal)


def metrics(init, correct, model_out):
    init, correct, model_out = align3(init, correct, model_out)
    detected_bad_change = 0
    detected_good_change = 0
    misdetection = 0
    undetected = 0

    for init_c, correct_c, model_out_c in zip(init, correct, model_out):
        if init_c == correct_c == model_out_c:
            continue
        elif init_c == correct_c and init_c != model_out_c:
            misdetection += 1
        elif init_c == model_out_c and init_c != correct_c:
            undetected += 1
        elif correct_c == model_out_c and init_c != correct_c:
            detected_good_change += 1
        else:
            detected_bad_change += 1

    return misdetection, undetected, detected_bad_change, detected_good_change
	class Action:
	equal = 0
	replace = 1
	insert = 2
	delete = 3


	Action.names = {i: name for name, i in Action.__dict__.items() if name[0] != "_"}


	def align3(seq1, seq2, seq3):
	aseq1, aseq2 = align(seq1, seq2)
	bseq2, bseq3 = align(seq2, seq3)
	cseq1, cseq3 = align(aseq1, bseq3)
	dseq2, dseq3 = align(bseq2, cseq3)
	return cseq1, dseq2, dseq3


	def align(seq1, seq2):
	for start, s in enumerate(zip(seq1, seq2)):
	if s[0] != s[1]:
	break
	for end, s in enumerate(zip(seq1[::-1], seq2[::-1])):
	if s[0] != s[1]:
	break

	seq1_mid = seq1[start:end]
	seq2_mid = seq2[start:end]

	op_codes = levenshtein(seq1_mid, seq2_mid, with_equal=True)
	res1 = []
	res2 = []
	for action, i, j in op_codes:
	if action == "equal" or action == "replace":
	res1.append(seq1_mid[i])
	res2.append(seq2_mid[j])
	if action == "insert":
	res1.append(EMPTY)
	res2.append(seq2_mid[j])
	if action == "delete":
	res1.append(seq1_mid[i])
	res2.append(EMPTY)

	return seq1[:start] + "".join(res1) + seq1[end:], seq2[:start] + "".join(res2) + seq2[end:]


	def get_opcodes(actions: numpy.ndarray, with_equal):
	"""
	Build the best path opcodes for given a actions matrix.
	"""
	i = actions.shape[0] - 1
	j = actions.shape[1] - 1
	opcodes = []
	# from matplotlib import pyplot as plt
	# actions2 = actions / 255.0
	# actions2[actions2 == 1] = float("nan")
	# plt.matshow(actions2)
	# plt.show()
	while i != 0 or j != 0:
	action = actions[i][j]
	if with_equal or action != Action.equal:
	opcodes.append((Action.names[action], i-1, j-1))
	if action == Action.equal or action == Action.replace:
	i -= 1
	j -= 1
	elif action == Action.insert:
	j -= 1
	else:
	i -= 1
	return opcodes[::-1]


	def levenshtein(seq1, seq2, with_equal=False):
	"""
	How to transform seq1 to seq2
	:param seq1:
	:param seq2:
	:return:
	"""
	max_distance = 500

	if seq1 == seq2:
	return [(Action.names[Action.equal], i, i) for i in range(len(seq1))] if with_equal else []

	v0 = numpy.arange(start=0, stop=len(seq2) + 1, step=1, dtype=numpy.uint32)
	v1 = numpy.empty(shape=(len(seq2) + 1,), dtype=numpy.uint32)
	actions = numpy.zeros(shape=(len(seq1) + 1, len(seq2) + 1), dtype=numpy.uint8)
	# default action is equal

	actions[0, :] = Action.insert
	actions[:, 0] = Action.delete

	for i, seq1_i in enumerate(seq1):
	v1[0] = i
	for j, seq2_j in enumerate(seq2):
	if abs(i - j) > max_distance:
	# Optimization
	v1[j + 1] = max_distance + 1
	continue
	cost = 0 if seq1_i == seq2_j else 1
	replace_cost = v0[j] + cost
	insert_cost = v1[j] + 1
	delete_cost = v0[j+1] + 1

	if replace_cost <= insert_cost and replace_cost <= delete_cost:
	v1[j+1] = replace_cost
	if cost:
	actions[i+1, j+1] = Action.replace
	elif insert_cost <= delete_cost:
	v1[j+1] = insert_cost
	actions[i+1, j+1] = Action.insert
	else:
	v1[j+1] = delete_cost
	actions[i+1, j+1] = Action.delete
	v0, v1 = v1, v0

	return get_opcodes(actions, with_equal=with_equal)


	def metrics(init, correct, model_out):
	init, correct, model_out = align3(init, correct, model_out)
	detected_bad_change = 0
	detected_good_change = 0
	misdetection = 0
	undetected = 0

	for init_c, correct_c, model_out_c in zip(init, correct, model_out):
	if init_c == correct_c == model_out_c:
	continue
	elif init_c == correct_c and init_c != model_out_c:
	misdetection += 1
	elif init_c == model_out_c and init_c != correct_c:
	undetected += 1
	elif correct_c == model_out_c and init_c != correct_c:
	detected_good_change += 1
	else:
	detected_bad_change += 1

	return misdetection, undetected, detected_bad_change, detected_good_change