Skip to content

Instantly share code, notes, and snippets.

@thien
Created April 18, 2018 17:15
Show Gist options
  • Save thien/941444a085266bed3b32f978c27d9fba to your computer and use it in GitHub Desktop.
Save thien/941444a085266bed3b32f978c27d9fba to your computer and use it in GitHub Desktop.
needleman-wunch and smith-waterman algorithms
from termcolor import *
# simplicity values
def sub(a,b):
# substitute
if a == b:
return 0
else:
return 1
def rem(v_j): return 1 # remove
def ins(u_i): return 1 # insert
def generateTable(a,b):
dimension_a = len(a)
dimension_b = len(b)
# create container for the results (everything is 0 for now.)
return [[-1 for _ in range(dimension_b)] for _ in range(dimension_a)]
def printTable(t):
for i in t: print(i)
def smith_waterman(a,b):
# internal functs for this particular scenario
def sub(a,b):
if a == b: return 1
else: return -3
def ins(c): return rem(c)
def rem(d): return -1
a = " " + a
b = " " + b
t = generateTable(a,b)
# the first entry is 0.
t[0][0] = 0
# process the first column
for j in range(len(a)):
t[j][0] = 0
# process the first row
for i in range(len(b)):
t[0][i] = 0
# maximise for each position in coord
for j in range(1,len(a)):
for i in range(1,len(b)):
x = t[j-1][i-1] + sub(a[j], b[i])
y = t[j][i-1] + rem(a[j])
z = t[j-1][i] + ins(b[i])
t[j][i] = max([0,x,y,z])
return t
def readSmithWaterman(a,b,t):
# internal functs for this particular scenario
def sub(a,b):
if a == b: return 1
else: return -3
def ins(c): return rem(c)
def rem(d): return -1
# need to find the node in the matrix with the highest cost.
m = 0
pos = (0,0)
for j in range(len(t)):
for i in range(len(t[0])):
if t[j][i] > m:
m = t[j][i]
pos = (j,i)
print("Max Pos:", pos)
# trace back from this node through the proceeding nodes with
# the greatest costs
net = [pos]
diagonal, up, side = [],[],[]
a_match = []
b_match = []
while len(net) > 0:
pos = net.pop(0)
# print("current position:",pos)
j,i = pos
x = t[j-1][i] + rem(a[j-1]) #up
y = t[j][i-1] + ins(b[i-1]) #side
z = t[j-1][i-1] + sub(a[j-1], b[i-1]) # diagonal
k = t[j][i]
if k == x:
newPos = (j-1,i)
# print("match x")
net.append(newPos)
side.append(newPos)
a_match.append(a[j-1])
b_match.append("-")
elif k == y:
newPos = (j,i-1)
# print("match y")
net.append(newPos)
up.append(newPos)
a_match.append("-")
b_match.append(b[i-1])
elif k == z:
newPos = (j-1, i-1)
# print("match z")
net.append(newPos)
diagonal.append(newPos)
a_match.append(a[j-1])
b_match.append(b[i-1])
a_match, b_match = a_match[::-1], b_match[::-1]
print("A Match:", a_match)
print("B Match:", b_match)
# compile results and print new table
pathways = diagonal + up + side
print("Diagonal:", diagonal)
print("Up:",up)
print("Side:", side)
for j in range(len(t)):
for i in range(len(t[j])):
endline = " "
pos = (j,i)
if i == len(t[j])-1: endline = "\n"
if pos in pathways:
if pos in diagonal:
if pos in side:
cprint(t[j][i], 'blue', end=endline)
else:
cprint(t[j][i], 'red', end=endline)
elif pos in up:
cprint(t[j][i], 'yellow', end=endline)
elif pos in side:
cprint(t[j][i], 'green', end=endline)
else:
print(t[j][i], end=endline)
def needleman_wunch(a,b):
# check if the tings are the same
a,b = " " + a, " " + b
# scrape the dimension otherwise
dimension = len(a)
# create container for the results
t = [[-1 for _ in range(dimension)] for _ in range(dimension)]
"""
Intuition:
T[i,j]
represents the total cost up to
the ith element of a
and jth element of b
Note: You need a cost for each edit operation:
sub(x,y) = sub(y,x)
sub(x,x) = 0
rem(v_j) : delete letter v_j at position j from s2 [also known as del in the slides]
ins(u_i) : insert letter u_i at position i in s1
For simplicity: sub(x,y) = rem(v_j) = ins(u_i) = 1
"""
# the first entry is 0.
t[0][0] = 0
# process the first column
for j in range(dimension):
t[j][0] = t[j-1][0] + rem(a[j-1])
# process the first row
for i in range(dimension):
t[0][i] = t[0][i-1] + ins(b[i-1])
"""
Perform Needleman-Wunch
Intuition:
T[i,j] is the minimum of the following options:
1. T[i-1, j-1] + sub(u_i, v_j)
2. T[i-1, j] + del(u_i)
3. T[i, j-1] + ins(v_j)
"""
for i in range(1,dimension):
for j in range(1,dimension):
x = t[j-1][i-1] + sub(a[j], b[i])
y = t[j-1][i] + rem(a[j])
z = t[j][i-1] + ins(b[i])
t[j][i] = min([x,y,z])
# print("Edit Table:")
# printTable(t)
return t
def readAlignmentStrings(a,b,t):
i,j = len(t)-1, len(t[0])-1
net = [(i,j)]
diagonal = [(0,0),(i,j)]
up = []
side = []
while len(net) > 0:
i,j = net.pop(0)
pos = (i,j)
x = t[i-1][j] + rem(a[i-1]) #side
y = t[i][j-1] + ins(b[j-1]) #up
z = t[i-1][j-1] + sub(a[i-1],b[j-1]) #diag
if t[i][j] == x:
newPos = (j,i-1)
net.append(newPos)
side.append(newPos)
if t[i][j] == y:
newPos = (j-1, i)
net.append(newPos)
up.append(newPos)
if t[i][j] == z:
newPos = (i-1, j-1)
net.append(newPos)
diagonal.append(newPos)
pathways = diagonal + up + side
for j in range(len(t)):
for i in range(len(t[j])):
endline = " "
pos = (i,j)
if i == len(t[j])-1: endline = "\n"
if pos in pathways:
if pos in diagonal:
if pos in side:
cprint(t[j][i], 'blue', end=endline)
else:
cprint(t[j][i], 'red', end=endline)
elif pos in up:
cprint(t[j][i], 'yellow', end=endline)
elif pos in side:
cprint(t[j][i], 'green', end=endline)
else:
print(t[j][i], end=endline)
if __name__ == "__main__":
# a = "ATCCGAT"
# b = "TGCATAT"
# c = needleman_wunch(b,a)
# readAlignmentStrings(a,b,c)
a = "EAWACQGKL"
b = "ERDAWCQPGKWY"
c = smith_waterman(a,b)
readSmithWaterman(a,b,c)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment