Skip to content

Instantly share code, notes, and snippets.

@LinguList
Created July 11, 2019 16:40
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save LinguList/8189f06f231909fbf1d1eed30998bd83 to your computer and use it in GitHub Desktop.
Save LinguList/8189f06f231909fbf1d1eed30998bd83 to your computer and use it in GitHub Desktop.
Waterman-Eggert Illustration and Patch for LingPy

Waterman-Eggert algorithm for Sentence Alignment

This is a short patch for LingPy's Waterman-Eggert implementation and an illustration how the algorithm can be used to carry out the alignment of two sentences provided in phonetic transcription in linguistics. To test this script, make sure to install LingPy and run the following in your terminal:

$ python code.py
from lingpy import *
from patch_we_align import we_align
from itertools import product
def salign(
senA,
senB,
gap=-1,
model='sca',
limit=2
):
"""Align and score two sentences"""
# retrieve sound class model
if not hasattr('scorer', model):
model = Model(model)
# convert to sound classes
tokA, tokB = list(map(
lambda x: ipa2tokens(
x.replace(' ', '_')),
[senA, senB]))
# assume segmentation by underscore
clsA, clsB = list(map(
lambda x: tokens2class(
x,
model
),
[tokA, tokB]
))
scorer = {(a, b): -1 if \
(a != b or '_' in (a, b)) \
else 1 for a, b in product(
set(clsA+clsB),
set(clsA+clsB))
}
# make sure boundaries don't score
scorer['_', '_'] = 0
out, scores = [], []
for a, b, score, iA, iB in we_align(
clsA,
clsB,
scorer=scorer,
gap=gap
):
if len(a) < limit:
pass
else:
out += [
(a, b, score)
]
scores += [score]
# compute self-scores
sA = sum([scorer[a, a] for a in clsA])
sB = sum([scorer[a, a] for a in clsB])
sAB = sum(scores)
score = 1 - (2 * sAB / (sA + sB))
return out, score
if __name__ == '__main__':
alms, scores = salign(
'mainəʁuː.ɪsthɪnsainhɛrt͡s',
'hɛrt͡smainʁuː.əsain',
gap=-2
)
for a, b, score in alms:
print(' '.join(a))
print(' '.join(b))
print('{0:.2f}'.format(score))
print('{0:.2f}'.format(scores))
def we_align(
seqA,
seqB,
scorer,
gap
):
"""
Align two sequences using the Waterman-Eggert algorithm.
Parameters
----------
seqA, seqB : list
The input sequences passed as a list.
scorer : dict
A dictionary containing tuples of two segments as key and numbers as
values.
gap : int
The gap penalty.
Notes
-----
This function is a very straightforward implementation of the
Waterman-Eggert algorithm (:evobib:`Waterman1987`). We recommend to use
the function if you want to test your own scoring dictionaries and profit
from a fast implementation (as we use Cython, the implementation is indeed
faster than pure Python implementations, as long as you use Python 3 and
have Cython installed). If you want to test the WE algorithm without
specifying a scoring dictionary, we recommend to have a look at our wrapper
function with the same name in the :py:class:`~lingpy.align.pairwise`
module.
Returns
-------
alignments : list
A list consisting of tuples. Each tuple gives the alignment of one of
the subsequences of the input sequences. Each tuple contains the
aligned part of the first, the aligned part of the second sequence, and
the score of the alignment.
"""
# basic defs
# get the lengths of the strings
lenA = len(seqA)
lenB = len(seqB)
# define values for the main loop
null = 0 # constant during the loop
# define values for the traceback
igap = 0
jgap = 0
gap_char = '-' # the gap character
# create a tracer for positions in the matrix
tracer = [0 for i in range(lenA+1)]
# create matrix and traceback
matrix = [[0 for i in range(lenA+1)] for j in range(lenB+1)]
traceback = [[0 for i in range(lenA+1)] for j in range(lenB+1)]
# start the main loop
for i in range(1,lenB+1):
# add zero to the tracer
tracer.append(0)
for j in range(1,lenA+1):
# get the penalty
penalty = scorer[seqA[j-1],seqB[i-1]]
# get the three scores
gapA = matrix[i-1][j] + gap
gapB = matrix[i][j-1] + gap
match = matrix[i-1][j-1] + penalty
# evaluate the scores
if gapA >= match and gapA >= gapB and gapA >= null:
matrix[i][j] = gapA
traceback[i][j] = 3
elif match >= gapB and match >= null:
matrix[i][j] = match
traceback[i][j] = 1
elif gapB >= null:
matrix[i][j] = gapB
traceback[i][j] = 2
else:
matrix[i][j] = null
traceback[i][j] = null
# assign the value to the tracer
tracer.append(matrix[i][j])
# make list of alignments
out, visited = [], []
# start the while loop
while True:
# get the maximal value
max_score = max(tracer)
# if max_val is zero, break
if max_score == 0:
break
# get the index of the maximal value of the matrix
idx = max([i for i in range(len(tracer)) if tracer[i] == max_score])
# convert to matrix coordinates
i, j = idx // (lenA+1),idx - (idx // (lenA+1)) * (lenA+1)
# store in imax and jmax
imax, jmax = i, j
sim = matrix[i][j]
# start the traceback
igap, jgap = 0,0
# make values for almA and almB
almA = [s for s in seqA]
almB = [s for s in seqB]
while traceback[i][j] != 0:
if traceback[i][j] == 3:
almA.insert(j,gap_char)
#tracer[i * (lenA+1) + j] = 0 # set tracer to zero
i -= 1
jgap += 1
elif traceback[i][j] == 1:
#tracer[i * (lenA+1) + j] = 0 # set tracer to zero
i -= 1
j -= 1
elif traceback[i][j] == 2:
almB.insert(i,gap_char)
#tracer[i * (lenA+1) + j] = 0 # set tracer to zero
j -= 1
igap += 1
else:
break
# store values
imin, jmin = i, j
# change values to 0 in the tracer
for i in range(1, lenB+1):
for j in range(1, lenA+1):
if imin < i <= imax or jmin < j <= jmax:
tracer[i * (lenA+1) + j] = 0
traceback[i][j] = 0
matrix[i][j] = -1
visited += [(i, j)]
# recompute the matrix
elif i > imax and j > jmax and (i, j) not in visited:
# get the three scores
gapA = matrix[i-1][j] + gap
gapB = matrix[i][j-1] + gap
match = matrix[i-1][j-1] + penalty
# evaluate the scores
if gapA >= match and gapA >= gapB and gapA >= null:
matrix[i][j] = gapA
traceback[i][j] = 3
elif match >= gapB and match >= null:
matrix[i][j] = match
traceback[i][j] = 1
elif gapB >= null:
matrix[i][j] = gapB
traceback[i][j] = 2
else:
matrix[i][j] = null
traceback[i][j] = null
tracer[i * (lenA+1) + j] = matrix[i][j]
# retrieve the aligned parts of the sequences
out.append(
(
almA[jmin:jmax+jgap],
almB[imin:imax+igap],
sim,
(jmin, jmax+jgap),
(imin, imax+igap)
)
)
# return the alignment as a tuple of prefix, alignment, and suffix
return out
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment