Skip to content

Instantly share code, notes, and snippets.

@teh
Created January 18, 2016 12:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save teh/dac63b8260b106f7f18e to your computer and use it in GitHub Desktop.
Save teh/dac63b8260b106f7f18e to your computer and use it in GitHub Desktop.
wagner-fisher in numba
import numpy
import timeit
import numba
@numba.autojit(locals=dict(
best_k=numba.int64,
pos=numba.int64,
ins_del_count=numba.int64,
i=numba.int64,
j=numba.int64,
))
def wagner_fisher(a, b, d):
"""
:param a: needle
:param b: string to search for the needle
:param d: a distance matrix of size at least (len(a), len(b)).
initialization not required.
:returns: start-offset, end-offset, best_k within b
"""
# a == needle
# b == data
len_a = len(a)
len_b = len(b)
for i in range(len_a + 1):
d[i, 0] = i
for j in range(len_b + 1):
d[0, j] = j
d[0, j] = 0
# remember the best match we found
best_k = 2**20
pos = -1
for j in range(1, len_b + 1):
for i in range(1, len_a + 1):
x0 = d[i - 1, j -1] + (0 if a[i - 1] == b[j - 1] else 1)
x1 = d[i -1, j] + 1
x2 = d[i, j - 1] + 1
# optimized version of d[i, j] = min(x0, x1, x2)
if x0 < x1 and x0 < x2:
d[i, j] = x0
elif x1 < x2:
d[i, j] = x1
else:
d[i, j] = x2
# Use <= to find the longest match
if d[len_a, j] < best_k:
best_k = d[len_a, j]
pos = j
# Discover number of insertions/deletions/substitutions by
# backtracking. Need those to determine the correct offset of the
# found needle (e.g. insertions push the end back).
i, j = len_a, pos
ins_del_count = 0
while i > 0:
if d[i - 1, j] < d[i, j]:
i -= 1
ins_del_count -= 1
elif d[i, j - 1] < d[i, j]:
j -= 1
ins_del_count += 1
else:
i -= 1
j -= 1
return pos - len_a - ins_del_count, pos, best_k
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment