|
def we_align( |
|
seqA, |
|
seqB, |
|
scorer, |
|
gap |
|
): |
|
""" |
|
Align two sequences using the Waterman-Eggert algorithm. |
|
|
|
Parameters |
|
---------- |
|
seqA, seqB : list |
|
The input sequences passed as a list. |
|
scorer : dict |
|
A dictionary containing tuples of two segments as key and numbers as |
|
values. |
|
gap : int |
|
The gap penalty. |
|
|
|
Notes |
|
----- |
|
This function is a very straightforward implementation of the |
|
Waterman-Eggert algorithm (:evobib:`Waterman1987`). We recommend to use |
|
the function if you want to test your own scoring dictionaries and profit |
|
from a fast implementation (as we use Cython, the implementation is indeed |
|
faster than pure Python implementations, as long as you use Python 3 and |
|
have Cython installed). If you want to test the WE algorithm without |
|
specifying a scoring dictionary, we recommend to have a look at our wrapper |
|
function with the same name in the :py:class:`~lingpy.align.pairwise` |
|
module. |
|
|
|
Returns |
|
------- |
|
alignments : list |
|
A list consisting of tuples. Each tuple gives the alignment of one of |
|
the subsequences of the input sequences. Each tuple contains the |
|
aligned part of the first, the aligned part of the second sequence, and |
|
the score of the alignment. |
|
|
|
""" |
|
# basic defs |
|
|
|
# get the lengths of the strings |
|
lenA = len(seqA) |
|
lenB = len(seqB) |
|
|
|
# define values for the main loop |
|
null = 0 # constant during the loop |
|
|
|
# define values for the traceback |
|
igap = 0 |
|
jgap = 0 |
|
gap_char = '-' # the gap character |
|
|
|
# create a tracer for positions in the matrix |
|
tracer = [0 for i in range(lenA+1)] |
|
|
|
# create matrix and traceback |
|
matrix = [[0 for i in range(lenA+1)] for j in range(lenB+1)] |
|
traceback = [[0 for i in range(lenA+1)] for j in range(lenB+1)] |
|
|
|
# start the main loop |
|
for i in range(1,lenB+1): |
|
|
|
# add zero to the tracer |
|
tracer.append(0) |
|
|
|
for j in range(1,lenA+1): |
|
|
|
# get the penalty |
|
penalty = scorer[seqA[j-1],seqB[i-1]] |
|
|
|
# get the three scores |
|
gapA = matrix[i-1][j] + gap |
|
gapB = matrix[i][j-1] + gap |
|
match = matrix[i-1][j-1] + penalty |
|
|
|
# evaluate the scores |
|
if gapA >= match and gapA >= gapB and gapA >= null: |
|
matrix[i][j] = gapA |
|
traceback[i][j] = 3 |
|
elif match >= gapB and match >= null: |
|
matrix[i][j] = match |
|
traceback[i][j] = 1 |
|
elif gapB >= null: |
|
matrix[i][j] = gapB |
|
traceback[i][j] = 2 |
|
else: |
|
matrix[i][j] = null |
|
traceback[i][j] = null |
|
|
|
# assign the value to the tracer |
|
tracer.append(matrix[i][j]) |
|
|
|
|
|
# make list of alignments |
|
out, visited = [], [] |
|
|
|
# start the while loop |
|
while True: |
|
|
|
# get the maximal value |
|
max_score = max(tracer) |
|
|
|
# if max_val is zero, break |
|
if max_score == 0: |
|
break |
|
|
|
# get the index of the maximal value of the matrix |
|
idx = max([i for i in range(len(tracer)) if tracer[i] == max_score]) |
|
|
|
# convert to matrix coordinates |
|
i, j = idx // (lenA+1),idx - (idx // (lenA+1)) * (lenA+1) |
|
|
|
# store in imax and jmax |
|
imax, jmax = i, j |
|
|
|
sim = matrix[i][j] |
|
|
|
# start the traceback |
|
igap, jgap = 0,0 |
|
|
|
# make values for almA and almB |
|
almA = [s for s in seqA] |
|
almB = [s for s in seqB] |
|
|
|
while traceback[i][j] != 0: |
|
if traceback[i][j] == 3: |
|
almA.insert(j,gap_char) |
|
#tracer[i * (lenA+1) + j] = 0 # set tracer to zero |
|
i -= 1 |
|
jgap += 1 |
|
elif traceback[i][j] == 1: |
|
#tracer[i * (lenA+1) + j] = 0 # set tracer to zero |
|
i -= 1 |
|
j -= 1 |
|
elif traceback[i][j] == 2: |
|
almB.insert(i,gap_char) |
|
#tracer[i * (lenA+1) + j] = 0 # set tracer to zero |
|
j -= 1 |
|
igap += 1 |
|
else: |
|
break |
|
|
|
# store values |
|
imin, jmin = i, j |
|
|
|
# change values to 0 in the tracer |
|
for i in range(1, lenB+1): |
|
for j in range(1, lenA+1): |
|
if imin < i <= imax or jmin < j <= jmax: |
|
tracer[i * (lenA+1) + j] = 0 |
|
traceback[i][j] = 0 |
|
matrix[i][j] = -1 |
|
visited += [(i, j)] |
|
# recompute the matrix |
|
elif i > imax and j > jmax and (i, j) not in visited: |
|
# get the three scores |
|
gapA = matrix[i-1][j] + gap |
|
gapB = matrix[i][j-1] + gap |
|
match = matrix[i-1][j-1] + penalty |
|
|
|
# evaluate the scores |
|
if gapA >= match and gapA >= gapB and gapA >= null: |
|
matrix[i][j] = gapA |
|
traceback[i][j] = 3 |
|
elif match >= gapB and match >= null: |
|
matrix[i][j] = match |
|
traceback[i][j] = 1 |
|
elif gapB >= null: |
|
matrix[i][j] = gapB |
|
traceback[i][j] = 2 |
|
else: |
|
matrix[i][j] = null |
|
traceback[i][j] = null |
|
tracer[i * (lenA+1) + j] = matrix[i][j] |
|
|
|
# retrieve the aligned parts of the sequences |
|
out.append( |
|
( |
|
almA[jmin:jmax+jgap], |
|
almB[imin:imax+igap], |
|
sim, |
|
(jmin, jmax+jgap), |
|
(imin, imax+igap) |
|
) |
|
) |
|
|
|
# return the alignment as a tuple of prefix, alignment, and suffix |
|
return out |