thien/seq_alignment.py

## seq_alignment.py
from termcolor import *


# simplicity values
def sub(a,b):
    # substitute
    if a == b:
        return 0
    else:
        return 1
def rem(v_j): return 1 # remove
def ins(u_i): return 1 # insert


def generateTable(a,b):
    dimension_a = len(a)
    dimension_b = len(b)
    # create container for the results (everything is 0 for now.)
    return [[-1 for _ in range(dimension_b)] for _ in range(dimension_a)]

def printTable(t):
    for i in t: print(i)

def smith_waterman(a,b):
    # internal functs for this particular scenario
    def sub(a,b):
        if a == b: return 1
        else: return -3
    def ins(c): return rem(c)
    def rem(d): return -1
    a = " " + a
    b = " " + b
    t = generateTable(a,b)

    # the first entry is 0.
    t[0][0] = 0
    # process the first column
    for j in range(len(a)):
        t[j][0] = 0
    # process the first row
    for i in range(len(b)):
        t[0][i] = 0

    # maximise for each position in coord
    for j in range(1,len(a)):
        for i in range(1,len(b)):
            x = t[j-1][i-1] + sub(a[j], b[i])
            y = t[j][i-1] + rem(a[j])
            z = t[j-1][i] + ins(b[i])
            t[j][i] = max([0,x,y,z])
    return t

def readSmithWaterman(a,b,t):
    # internal functs for this particular scenario
    def sub(a,b):
        if a == b: return 1
        else: return -3
    def ins(c): return rem(c)
    def rem(d): return -1

    # need to find the node in the matrix with the highest cost.
    m = 0
    pos = (0,0)
    for j in range(len(t)):
        for i in range(len(t[0])):
            if t[j][i] > m:
                m = t[j][i]
                pos = (j,i)
    print("Max Pos:", pos)
    # trace back from this node through the proceeding nodes with
    # the greatest costs
    net = [pos]
    diagonal, up, side = [],[],[]
    a_match = []
    b_match = []
    while len(net) > 0:
        pos = net.pop(0)
        # print("current position:",pos)
        j,i = pos
        x = t[j-1][i] + rem(a[j-1]) #up
        y = t[j][i-1] + ins(b[i-1]) #side
        z = t[j-1][i-1] + sub(a[j-1], b[i-1]) # diagonal
        k = t[j][i]
        if k == x:
            newPos = (j-1,i)
            # print("match x")
            net.append(newPos)
            side.append(newPos)
            a_match.append(a[j-1])
            b_match.append("-")
        elif k == y:
            newPos = (j,i-1)
            # print("match y")
            net.append(newPos)
            up.append(newPos)
            a_match.append("-")
            b_match.append(b[i-1])
        elif k == z:
            newPos = (j-1, i-1)
            # print("match z")
            net.append(newPos)
            diagonal.append(newPos)
            a_match.append(a[j-1])
            b_match.append(b[i-1])

    a_match, b_match = a_match[::-1], b_match[::-1]
    print("A Match:", a_match)
    print("B Match:", b_match)

    # compile results and print new table
    pathways = diagonal + up + side
    print("Diagonal:", diagonal)
    print("Up:",up)
    print("Side:", side)
    for j in range(len(t)):
        for i in range(len(t[j])):
            endline = " "
            pos = (j,i)
            if i == len(t[j])-1: endline = "\n"
            if pos in pathways:
                if pos in diagonal:
                    if pos in side:
                        cprint(t[j][i], 'blue', end=endline)
                    else:
                        cprint(t[j][i], 'red', end=endline)
                elif pos in up:
                    cprint(t[j][i], 'yellow', end=endline)
                elif pos in side:
                    cprint(t[j][i], 'green', end=endline)
            else:
                print(t[j][i], end=endline)


def needleman_wunch(a,b):
    # check if the tings are the same
    a,b = " " + a, " " + b
    # scrape the dimension otherwise
    dimension = len(a)

    # create container for the results
    t = [[-1 for _ in range(dimension)] for _ in range(dimension)]

    """
    Intuition:
    T[i,j]
    represents the total cost up to
        the ith element of a
        and jth element of b

    Note: You need a cost for each edit operation:
    sub(x,y) = sub(y,x)
    sub(x,x) = 0
    rem(v_j) : delete letter v_j at position j from s2 [also known as del in the slides]
    ins(u_i) : insert letter u_i at position i in s1

    For simplicity: sub(x,y) = rem(v_j) = ins(u_i) = 1
    """

    # the first entry is 0.
    t[0][0] = 0
    # process the first column
    for j in range(dimension):
        t[j][0] = t[j-1][0] + rem(a[j-1])
    # process the first row
    for i in range(dimension):
        t[0][i] = t[0][i-1] + ins(b[i-1])

    """
    Perform Needleman-Wunch

    Intuition:
    T[i,j] is the minimum of the following options:
    1. T[i-1, j-1] + sub(u_i, v_j)
    2. T[i-1, j] + del(u_i)
    3. T[i, j-1] + ins(v_j)
    """

    for i in range(1,dimension):
        for j in range(1,dimension):
            x = t[j-1][i-1] + sub(a[j], b[i])
            y = t[j-1][i] + rem(a[j])
            z = t[j][i-1] + ins(b[i])
            t[j][i] = min([x,y,z])


    # print("Edit Table:")
    # printTable(t)
    return t

def readAlignmentStrings(a,b,t):
    i,j = len(t)-1, len(t[0])-1

    net = [(i,j)]
    diagonal = [(0,0),(i,j)]
    up = []
    side = []


    while len(net) > 0:
        i,j = net.pop(0)
        pos = (i,j)
        x = t[i-1][j] + rem(a[i-1]) #side
        y = t[i][j-1] + ins(b[j-1]) #up
        z = t[i-1][j-1] + sub(a[i-1],b[j-1]) #diag

        if t[i][j] == x:
            newPos = (j,i-1)
            net.append(newPos)
            side.append(newPos)
        if t[i][j] == y:
            newPos = (j-1, i)
            net.append(newPos)
            up.append(newPos)
        if t[i][j] == z:
            newPos = (i-1, j-1)
            net.append(newPos)
            diagonal.append(newPos)

    pathways = diagonal + up + side

    for j in range(len(t)):
        for i in range(len(t[j])):
            endline = " "
            pos = (i,j)
            if i == len(t[j])-1: endline = "\n"
            if pos in pathways:
                if pos in diagonal:
                    if pos in side:
                        cprint(t[j][i], 'blue', end=endline)
                    else:
                        cprint(t[j][i], 'red', end=endline)
                elif pos in up:
                    cprint(t[j][i], 'yellow', end=endline)
                elif pos in side:
                    cprint(t[j][i], 'green', end=endline)
            else:
                print(t[j][i], end=endline)


if __name__ == "__main__":
    # a = "ATCCGAT"
    # b = "TGCATAT"
    # c = needleman_wunch(b,a)
    # readAlignmentStrings(a,b,c)
    a = "EAWACQGKL"
    b = "ERDAWCQPGKWY"
    c = smith_waterman(a,b)
    readSmithWaterman(a,b,c)
	from termcolor import *


	# simplicity values
	def sub(a,b):
	# substitute
	if a == b:
	return 0
	else:
	return 1
	def rem(v_j): return 1 # remove
	def ins(u_i): return 1 # insert



	def generateTable(a,b):
	dimension_a = len(a)
	dimension_b = len(b)
	# create container for the results (everything is 0 for now.)
	return [[-1 for _ in range(dimension_b)] for _ in range(dimension_a)]

	def printTable(t):
	for i in t: print(i)

	def smith_waterman(a,b):
	# internal functs for this particular scenario
	def sub(a,b):
	if a == b: return 1
	else: return -3
	def ins(c): return rem(c)
	def rem(d): return -1
	a = " " + a
	b = " " + b
	t = generateTable(a,b)

	# the first entry is 0.
	t[0][0] = 0
	# process the first column
	for j in range(len(a)):
	t[j][0] = 0
	# process the first row
	for i in range(len(b)):
	t[0][i] = 0

	# maximise for each position in coord
	for j in range(1,len(a)):
	for i in range(1,len(b)):
	x = t[j-1][i-1] + sub(a[j], b[i])
	y = t[j][i-1] + rem(a[j])
	z = t[j-1][i] + ins(b[i])
	t[j][i] = max([0,x,y,z])
	return t

	def readSmithWaterman(a,b,t):
	# internal functs for this particular scenario
	def sub(a,b):
	if a == b: return 1
	else: return -3
	def ins(c): return rem(c)
	def rem(d): return -1

	# need to find the node in the matrix with the highest cost.
	m = 0
	pos = (0,0)
	for j in range(len(t)):
	for i in range(len(t[0])):
	if t[j][i] > m:
	m = t[j][i]
	pos = (j,i)
	print("Max Pos:", pos)
	# trace back from this node through the proceeding nodes with
	# the greatest costs
	net = [pos]
	diagonal, up, side = [],[],[]
	a_match = []
	b_match = []
	while len(net) > 0:
	pos = net.pop(0)
	# print("current position:",pos)
	j,i = pos
	x = t[j-1][i] + rem(a[j-1]) #up
	y = t[j][i-1] + ins(b[i-1]) #side
	z = t[j-1][i-1] + sub(a[j-1], b[i-1]) # diagonal
	k = t[j][i]
	if k == x:
	newPos = (j-1,i)
	# print("match x")
	net.append(newPos)
	side.append(newPos)
	a_match.append(a[j-1])
	b_match.append("-")
	elif k == y:
	newPos = (j,i-1)
	# print("match y")
	net.append(newPos)
	up.append(newPos)
	a_match.append("-")
	b_match.append(b[i-1])
	elif k == z:
	newPos = (j-1, i-1)
	# print("match z")
	net.append(newPos)
	diagonal.append(newPos)
	a_match.append(a[j-1])
	b_match.append(b[i-1])

	a_match, b_match = a_match[::-1], b_match[::-1]
	print("A Match:", a_match)
	print("B Match:", b_match)

	# compile results and print new table
	pathways = diagonal + up + side
	print("Diagonal:", diagonal)
	print("Up:",up)
	print("Side:", side)
	for j in range(len(t)):
	for i in range(len(t[j])):
	endline = " "
	pos = (j,i)
	if i == len(t[j])-1: endline = "\n"
	if pos in pathways:
	if pos in diagonal:
	if pos in side:
	cprint(t[j][i], 'blue', end=endline)
	else:
	cprint(t[j][i], 'red', end=endline)
	elif pos in up:
	cprint(t[j][i], 'yellow', end=endline)
	elif pos in side:
	cprint(t[j][i], 'green', end=endline)
	else:
	print(t[j][i], end=endline)


	def needleman_wunch(a,b):
	# check if the tings are the same
	a,b = " " + a, " " + b
	# scrape the dimension otherwise
	dimension = len(a)

	# create container for the results
	t = [[-1 for _ in range(dimension)] for _ in range(dimension)]

	"""
	Intuition:
	T[i,j]
	represents the total cost up to
	the ith element of a
	and jth element of b

	Note: You need a cost for each edit operation:
	sub(x,y) = sub(y,x)
	sub(x,x) = 0
	rem(v_j) : delete letter v_j at position j from s2 [also known as del in the slides]
	ins(u_i) : insert letter u_i at position i in s1

	For simplicity: sub(x,y) = rem(v_j) = ins(u_i) = 1
	"""

	# the first entry is 0.
	t[0][0] = 0
	# process the first column
	for j in range(dimension):
	t[j][0] = t[j-1][0] + rem(a[j-1])
	# process the first row
	for i in range(dimension):
	t[0][i] = t[0][i-1] + ins(b[i-1])

	"""
	Perform Needleman-Wunch

	Intuition:
	T[i,j] is the minimum of the following options:
	1. T[i-1, j-1] + sub(u_i, v_j)
	2. T[i-1, j] + del(u_i)
	3. T[i, j-1] + ins(v_j)
	"""

	for i in range(1,dimension):
	for j in range(1,dimension):
	x = t[j-1][i-1] + sub(a[j], b[i])
	y = t[j-1][i] + rem(a[j])
	z = t[j][i-1] + ins(b[i])
	t[j][i] = min([x,y,z])


	# print("Edit Table:")
	# printTable(t)
	return t

	def readAlignmentStrings(a,b,t):
	i,j = len(t)-1, len(t[0])-1

	net = [(i,j)]
	diagonal = [(0,0),(i,j)]
	up = []
	side = []


	while len(net) > 0:
	i,j = net.pop(0)
	pos = (i,j)
	x = t[i-1][j] + rem(a[i-1]) #side
	y = t[i][j-1] + ins(b[j-1]) #up
	z = t[i-1][j-1] + sub(a[i-1],b[j-1]) #diag

	if t[i][j] == x:
	newPos = (j,i-1)
	net.append(newPos)
	side.append(newPos)
	if t[i][j] == y:
	newPos = (j-1, i)
	net.append(newPos)
	up.append(newPos)
	if t[i][j] == z:
	newPos = (i-1, j-1)
	net.append(newPos)
	diagonal.append(newPos)

	pathways = diagonal + up + side

	for j in range(len(t)):
	for i in range(len(t[j])):
	endline = " "
	pos = (i,j)
	if i == len(t[j])-1: endline = "\n"
	if pos in pathways:
	if pos in diagonal:
	if pos in side:
	cprint(t[j][i], 'blue', end=endline)
	else:
	cprint(t[j][i], 'red', end=endline)
	elif pos in up:
	cprint(t[j][i], 'yellow', end=endline)
	elif pos in side:
	cprint(t[j][i], 'green', end=endline)
	else:
	print(t[j][i], end=endline)


	if __name__ == "__main__":
	# a = "ATCCGAT"
	# b = "TGCATAT"
	# c = needleman_wunch(b,a)
	# readAlignmentStrings(a,b,c)
	a = "EAWACQGKL"
	b = "ERDAWCQPGKWY"
	c = smith_waterman(a,b)
	readSmithWaterman(a,b,c)