Created
April 18, 2018 17:15
-
-
Save thien/941444a085266bed3b32f978c27d9fba to your computer and use it in GitHub Desktop.
needleman-wunch and smith-waterman algorithms
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from termcolor import * | |
# simplicity values | |
def sub(a,b): | |
# substitute | |
if a == b: | |
return 0 | |
else: | |
return 1 | |
def rem(v_j): return 1 # remove | |
def ins(u_i): return 1 # insert | |
def generateTable(a,b): | |
dimension_a = len(a) | |
dimension_b = len(b) | |
# create container for the results (everything is 0 for now.) | |
return [[-1 for _ in range(dimension_b)] for _ in range(dimension_a)] | |
def printTable(t): | |
for i in t: print(i) | |
def smith_waterman(a,b): | |
# internal functs for this particular scenario | |
def sub(a,b): | |
if a == b: return 1 | |
else: return -3 | |
def ins(c): return rem(c) | |
def rem(d): return -1 | |
a = " " + a | |
b = " " + b | |
t = generateTable(a,b) | |
# the first entry is 0. | |
t[0][0] = 0 | |
# process the first column | |
for j in range(len(a)): | |
t[j][0] = 0 | |
# process the first row | |
for i in range(len(b)): | |
t[0][i] = 0 | |
# maximise for each position in coord | |
for j in range(1,len(a)): | |
for i in range(1,len(b)): | |
x = t[j-1][i-1] + sub(a[j], b[i]) | |
y = t[j][i-1] + rem(a[j]) | |
z = t[j-1][i] + ins(b[i]) | |
t[j][i] = max([0,x,y,z]) | |
return t | |
def readSmithWaterman(a,b,t): | |
# internal functs for this particular scenario | |
def sub(a,b): | |
if a == b: return 1 | |
else: return -3 | |
def ins(c): return rem(c) | |
def rem(d): return -1 | |
# need to find the node in the matrix with the highest cost. | |
m = 0 | |
pos = (0,0) | |
for j in range(len(t)): | |
for i in range(len(t[0])): | |
if t[j][i] > m: | |
m = t[j][i] | |
pos = (j,i) | |
print("Max Pos:", pos) | |
# trace back from this node through the proceeding nodes with | |
# the greatest costs | |
net = [pos] | |
diagonal, up, side = [],[],[] | |
a_match = [] | |
b_match = [] | |
while len(net) > 0: | |
pos = net.pop(0) | |
# print("current position:",pos) | |
j,i = pos | |
x = t[j-1][i] + rem(a[j-1]) #up | |
y = t[j][i-1] + ins(b[i-1]) #side | |
z = t[j-1][i-1] + sub(a[j-1], b[i-1]) # diagonal | |
k = t[j][i] | |
if k == x: | |
newPos = (j-1,i) | |
# print("match x") | |
net.append(newPos) | |
side.append(newPos) | |
a_match.append(a[j-1]) | |
b_match.append("-") | |
elif k == y: | |
newPos = (j,i-1) | |
# print("match y") | |
net.append(newPos) | |
up.append(newPos) | |
a_match.append("-") | |
b_match.append(b[i-1]) | |
elif k == z: | |
newPos = (j-1, i-1) | |
# print("match z") | |
net.append(newPos) | |
diagonal.append(newPos) | |
a_match.append(a[j-1]) | |
b_match.append(b[i-1]) | |
a_match, b_match = a_match[::-1], b_match[::-1] | |
print("A Match:", a_match) | |
print("B Match:", b_match) | |
# compile results and print new table | |
pathways = diagonal + up + side | |
print("Diagonal:", diagonal) | |
print("Up:",up) | |
print("Side:", side) | |
for j in range(len(t)): | |
for i in range(len(t[j])): | |
endline = " " | |
pos = (j,i) | |
if i == len(t[j])-1: endline = "\n" | |
if pos in pathways: | |
if pos in diagonal: | |
if pos in side: | |
cprint(t[j][i], 'blue', end=endline) | |
else: | |
cprint(t[j][i], 'red', end=endline) | |
elif pos in up: | |
cprint(t[j][i], 'yellow', end=endline) | |
elif pos in side: | |
cprint(t[j][i], 'green', end=endline) | |
else: | |
print(t[j][i], end=endline) | |
def needleman_wunch(a,b): | |
# check if the tings are the same | |
a,b = " " + a, " " + b | |
# scrape the dimension otherwise | |
dimension = len(a) | |
# create container for the results | |
t = [[-1 for _ in range(dimension)] for _ in range(dimension)] | |
""" | |
Intuition: | |
T[i,j] | |
represents the total cost up to | |
the ith element of a | |
and jth element of b | |
Note: You need a cost for each edit operation: | |
sub(x,y) = sub(y,x) | |
sub(x,x) = 0 | |
rem(v_j) : delete letter v_j at position j from s2 [also known as del in the slides] | |
ins(u_i) : insert letter u_i at position i in s1 | |
For simplicity: sub(x,y) = rem(v_j) = ins(u_i) = 1 | |
""" | |
# the first entry is 0. | |
t[0][0] = 0 | |
# process the first column | |
for j in range(dimension): | |
t[j][0] = t[j-1][0] + rem(a[j-1]) | |
# process the first row | |
for i in range(dimension): | |
t[0][i] = t[0][i-1] + ins(b[i-1]) | |
""" | |
Perform Needleman-Wunch | |
Intuition: | |
T[i,j] is the minimum of the following options: | |
1. T[i-1, j-1] + sub(u_i, v_j) | |
2. T[i-1, j] + del(u_i) | |
3. T[i, j-1] + ins(v_j) | |
""" | |
for i in range(1,dimension): | |
for j in range(1,dimension): | |
x = t[j-1][i-1] + sub(a[j], b[i]) | |
y = t[j-1][i] + rem(a[j]) | |
z = t[j][i-1] + ins(b[i]) | |
t[j][i] = min([x,y,z]) | |
# print("Edit Table:") | |
# printTable(t) | |
return t | |
def readAlignmentStrings(a,b,t): | |
i,j = len(t)-1, len(t[0])-1 | |
net = [(i,j)] | |
diagonal = [(0,0),(i,j)] | |
up = [] | |
side = [] | |
while len(net) > 0: | |
i,j = net.pop(0) | |
pos = (i,j) | |
x = t[i-1][j] + rem(a[i-1]) #side | |
y = t[i][j-1] + ins(b[j-1]) #up | |
z = t[i-1][j-1] + sub(a[i-1],b[j-1]) #diag | |
if t[i][j] == x: | |
newPos = (j,i-1) | |
net.append(newPos) | |
side.append(newPos) | |
if t[i][j] == y: | |
newPos = (j-1, i) | |
net.append(newPos) | |
up.append(newPos) | |
if t[i][j] == z: | |
newPos = (i-1, j-1) | |
net.append(newPos) | |
diagonal.append(newPos) | |
pathways = diagonal + up + side | |
for j in range(len(t)): | |
for i in range(len(t[j])): | |
endline = " " | |
pos = (i,j) | |
if i == len(t[j])-1: endline = "\n" | |
if pos in pathways: | |
if pos in diagonal: | |
if pos in side: | |
cprint(t[j][i], 'blue', end=endline) | |
else: | |
cprint(t[j][i], 'red', end=endline) | |
elif pos in up: | |
cprint(t[j][i], 'yellow', end=endline) | |
elif pos in side: | |
cprint(t[j][i], 'green', end=endline) | |
else: | |
print(t[j][i], end=endline) | |
if __name__ == "__main__": | |
# a = "ATCCGAT" | |
# b = "TGCATAT" | |
# c = needleman_wunch(b,a) | |
# readAlignmentStrings(a,b,c) | |
a = "EAWACQGKL" | |
b = "ERDAWCQPGKWY" | |
c = smith_waterman(a,b) | |
readSmithWaterman(a,b,c) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment