GiantDarth/hw3.py

## hw3.py
#!/usr/bin/python3
# Copyright (c) 2016 Christopher Philabaum
# CS599 - HW3 Needleman-Wunsch & Smith-Waterman
import random, datetime

class Alignment:
    def __init__(self, a, b, align):
        self._a = a
        self._b = b
        self._align = align

    def __str__(self):
        return self._a + '\n' + self._align + '\n' + self._b

NO_DIRECTION = '•'
UP_ARROW = '↑'
LEFT_ARROW = '←'
DIAGONAL_ARROW = '↖'

def needleman_wunsch(a, b, sim_table, gap_penalty):
    # Initialize grid
    grid = []
    traceback = []
    num_rows = len(a) + 1
    num_cols = len(b) + 1
    for i in range(num_rows):
        grid.append([0] * num_cols)
        traceback.append([''] * num_cols)
    # Set first row based on gaps
    for i in range(num_rows):
        grid[i][0] = i * gap_penalty
    # Set first col based on gaps
    for j in range(num_cols):
        grid[0][j] = j * gap_penalty

    # Set initial traceback
    traceback[0][0] = NO_DIRECTION
    for i in range(1, num_rows):
        traceback[i][0] = UP_ARROW
    for j in range(1, num_cols):
        traceback[0][j] = LEFT_ARROW

    ARROWS = (DIAGONAL_ARROW, UP_ARROW, LEFT_ARROW)
    for i in range(1, num_rows):
        for j in range(1, num_cols):
            # A faster way of getting the max and the max's index without multiple checks
            # Combines ('↖', '↑', '←') & the previous values, produces:
            # (('↖', val of diag), ('↑', val of above), ('←', val of left)
            # and only maxes by the second items.
            # Doesn't use touch memory unnecessarily, and does no additional checks
            traceback[i][j], grid[i][j] = max(
                    zip(ARROWS, (grid[i-1][j-1] + sim_table[a[i-1] + b[j-1]],
                                 grid[i-1][j] + gap_penalty,
                                 grid[i][j-1] + gap_penalty)),
                    key=lambda p: p[1]
            )

    new_a = ''
    new_b = ''
    align = ''
    x = num_rows - 1
    y = num_cols - 1
    while traceback[x][y] != NO_DIRECTION:
        if x > 0 and y > 0 and traceback[x][y] == DIAGONAL_ARROW:
            new_a = a[x-1] + new_a
            new_b = b[y-1] + new_b
            if a[x-1] == b[y-1]:
                align = '|' + align
            else:
                align = 'X' + align
            x -= 1
            y -= 1
        elif x > 0 and traceback[x][y] == UP_ARROW:
            new_a = a[x-1] + new_a
            new_b = '_' + new_b
            align = ' ' + align
            x -= 1
        else:
            new_b = b[y-1] + new_b
            new_a = '_' + new_a
            align = ' ' + align
            y -= 1

    return Alignment(new_a, new_b, align), grid, traceback


def smith_waterman(a, b, sim_table, gap_penalty):
    # Initialize grid
    grid = []
    traceback = []
    num_rows = len(a) + 1
    num_cols = len(b) + 1
    for i in range(num_rows):
        grid.append([0] * num_cols)
        traceback.append([''] * num_cols)

    # Set initial traceback
    traceback[0][0] = NO_DIRECTION
    for i in range(1, num_rows):
        traceback[i][0] = NO_DIRECTION
    for j in range(1, num_cols):
        traceback[0][j] = NO_DIRECTION

    ARROWS = (NO_DIRECTION, DIAGONAL_ARROW, UP_ARROW, LEFT_ARROW)
    for i in range(1, num_rows):
        for j in range(1, num_cols):
            # A faster way of getting the max and the max's index without multiple checks
            # Combines ('↖', '↑', '←') & the previous values, produces:
            # (('↖', val of diag), ('↑', val of above), ('←', val of left)
            # and only maxes by the second items.
            # Doesn't use touch memory unnecessarily, and does no additional checks
            traceback[i][j], grid[i][j] = max(
                    zip(ARROWS, (0,
                                 grid[i-1][j-1] + sim_table[a[i-1] + b[j-1]],
                                 grid[i-1][j] + gap_penalty,
                                 grid[i][j-1] + gap_penalty)),
                    key=lambda p: p[1]
            )

    new_a = ''
    new_b = ''
    align = ''
    x = num_rows - 1
    y = num_cols - 1
    while traceback[x][y] != NO_DIRECTION:
        if x > 0 and y > 0 and traceback[x][y] == DIAGONAL_ARROW:
            new_a = a[x-1] + new_a
            new_b = b[y-1] + new_b
            if a[x-1] == b[y-1]:
                align = '|' + align
            else:
                align = 'X' + align
            x -= 1
            y -= 1
        elif x > 0 and traceback[x][y] == UP_ARROW:
            new_a = a[x-1] + new_a
            new_b = '_' + new_b
            align = ' ' + align
            x -= 1
        else:
            new_b = b[y-1] + new_b
            new_a = '_' + new_a
            align = ' ' + align
            y -= 1

    return Alignment(new_a, new_b, align), grid, traceback


def generate_sequence(mer):
    seq = ''
    for m in range(mer):
        seq += random.choice('ACGT')
    return seq


def generate_sequences(k, mer):
    seqs = []
    for i in range(k):
        seqs.append(generate_sequence(mer))
    return seqs

if __name__ == '__main__':
    sim_table = dict()
    sim_table["AA"] = 1
    sim_table["CC"] = 1
    sim_table["GG"] = 1
    sim_table["TT"] = 1

    sim_table["AC"] = -1
    sim_table["AG"] = -1
    sim_table["AT"] = -1

    sim_table["CA"] = -1
    sim_table["CG"] = -1
    sim_table["CT"] = -1

    sim_table["GA"] = -1
    sim_table["GC"] = -1
    sim_table["GT"] = -1

    sim_table["TA"] = -1
    sim_table["TC"] = -1
    sim_table["TG"] = -1

    initial_k = 1000
    mer_len = 100

    # Test Run
    print("Needleman-Wunsch Test Run: ")
    test_seq = "GCATGCT"
    test_query = "GATTACA"
    print("Seq: ", test_seq)
    print("Qry: ", test_query)
    print()
    test_res = needleman_wunsch(test_seq, test_query, sim_table, -1)
    print(test_res[1])
    print(test_res[2])
    print(test_res[0])
    print()

    print("Smith-Waterman Test Run: ")
    test_seq = "GCATGCT"
    test_query = "GATTACA"
    print("Seq: ", test_seq)
    print("Qry: ", test_query)
    print()
    test_res = smith_waterman(test_seq, test_query, sim_table, -1)
    print(test_res[1])
    print(test_res[2])
    print(test_res[0])
    print()

    # Problem 1 A/B
    print("Problem 1")
    for i in range(4):
        k = initial_k * 10**i
        print("Running {} {}-mer sequences".format(k, mer_len))
        seqs = generate_sequences(k, mer_len)
        query = generate_sequence(mer_len)
        t = datetime.datetime.now()
        for j in range(k):
            needleman_wunsch(seqs[j], query, sim_table, -3)
        t = datetime.datetime.now() - t
        print("{}s".format(t.seconds))
    print("Done")

    # Problem 2 A/B
    print("Problem 2")
    for i in range(4):
        k = initial_k * 10**i
        print("Running {} {}-mer sequences".format(k, mer_len))
        seqs = generate_sequences(k, mer_len)
        query = generate_sequence(mer_len)
        t = datetime.datetime.now()
        for j in range(k):
            smith_waterman(seqs[j], query, sim_table, -3)
        t = datetime.datetime.now() - t
        print("{}s".format(t.seconds))
    print("Done")

## results.txt
Problem 1 (Needleman-Wunsch):
    1k: 14s
    10k: 146s = 2m26s
    100k: Projected 1460s = 24m20s
    1000k: Projected 14600s = 243m20s

Problem 2 (Smith-Waterman):
    1k: 12s
    10k: 125 = 2m5s
    100k: Projected 1250s = 20m50s
    1000k: Projected 12500s = 208m20s
	#!/usr/bin/python3
	# Copyright (c) 2016 Christopher Philabaum
	# CS599 - HW3 Needleman-Wunsch & Smith-Waterman
	import random, datetime

	class Alignment:
	def __init__(self, a, b, align):
	self._a = a
	self._b = b
	self._align = align

	def __str__(self):
	return self._a + '\n' + self._align + '\n' + self._b

	NO_DIRECTION = '•'
	UP_ARROW = '↑'
	LEFT_ARROW = '←'
	DIAGONAL_ARROW = '↖'

	def needleman_wunsch(a, b, sim_table, gap_penalty):
	# Initialize grid
	grid = []
	traceback = []
	num_rows = len(a) + 1
	num_cols = len(b) + 1
	for i in range(num_rows):
	grid.append([0] * num_cols)
	traceback.append([''] * num_cols)
	# Set first row based on gaps
	for i in range(num_rows):
	grid[i][0] = i * gap_penalty
	# Set first col based on gaps
	for j in range(num_cols):
	grid[0][j] = j * gap_penalty

	# Set initial traceback
	traceback[0][0] = NO_DIRECTION
	for i in range(1, num_rows):
	traceback[i][0] = UP_ARROW
	for j in range(1, num_cols):
	traceback[0][j] = LEFT_ARROW

	ARROWS = (DIAGONAL_ARROW, UP_ARROW, LEFT_ARROW)
	for i in range(1, num_rows):
	for j in range(1, num_cols):
	# A faster way of getting the max and the max's index without multiple checks
	# Combines ('↖', '↑', '←') & the previous values, produces:
	# (('↖', val of diag), ('↑', val of above), ('←', val of left)
	# and only maxes by the second items.
	# Doesn't use touch memory unnecessarily, and does no additional checks
	traceback[i][j], grid[i][j] = max(
	zip(ARROWS, (grid[i-1][j-1] + sim_table[a[i-1] + b[j-1]],
	grid[i-1][j] + gap_penalty,
	grid[i][j-1] + gap_penalty)),
	key=lambda p: p[1]
	)

	new_a = ''
	new_b = ''
	align = ''
	x = num_rows - 1
	y = num_cols - 1
	while traceback[x][y] != NO_DIRECTION:
	if x > 0 and y > 0 and traceback[x][y] == DIAGONAL_ARROW:
	new_a = a[x-1] + new_a
	new_b = b[y-1] + new_b
	if a[x-1] == b[y-1]:
	align = '\|' + align
	else:
	align = 'X' + align
	x -= 1
	y -= 1
	elif x > 0 and traceback[x][y] == UP_ARROW:
	new_a = a[x-1] + new_a
	new_b = '_' + new_b
	align = ' ' + align
	x -= 1
	else:
	new_b = b[y-1] + new_b
	new_a = '_' + new_a
	align = ' ' + align
	y -= 1

	return Alignment(new_a, new_b, align), grid, traceback


	def smith_waterman(a, b, sim_table, gap_penalty):
	# Initialize grid
	grid = []
	traceback = []
	num_rows = len(a) + 1
	num_cols = len(b) + 1
	for i in range(num_rows):
	grid.append([0] * num_cols)
	traceback.append([''] * num_cols)

	# Set initial traceback
	traceback[0][0] = NO_DIRECTION
	for i in range(1, num_rows):
	traceback[i][0] = NO_DIRECTION
	for j in range(1, num_cols):
	traceback[0][j] = NO_DIRECTION

	ARROWS = (NO_DIRECTION, DIAGONAL_ARROW, UP_ARROW, LEFT_ARROW)
	for i in range(1, num_rows):
	for j in range(1, num_cols):
	# A faster way of getting the max and the max's index without multiple checks
	# Combines ('↖', '↑', '←') & the previous values, produces:
	# (('↖', val of diag), ('↑', val of above), ('←', val of left)
	# and only maxes by the second items.
	# Doesn't use touch memory unnecessarily, and does no additional checks
	traceback[i][j], grid[i][j] = max(
	zip(ARROWS, (0,
	grid[i-1][j-1] + sim_table[a[i-1] + b[j-1]],
	grid[i-1][j] + gap_penalty,
	grid[i][j-1] + gap_penalty)),
	key=lambda p: p[1]
	)

	new_a = ''
	new_b = ''
	align = ''
	x = num_rows - 1
	y = num_cols - 1
	while traceback[x][y] != NO_DIRECTION:
	if x > 0 and y > 0 and traceback[x][y] == DIAGONAL_ARROW:
	new_a = a[x-1] + new_a
	new_b = b[y-1] + new_b
	if a[x-1] == b[y-1]:
	align = '\|' + align
	else:
	align = 'X' + align
	x -= 1
	y -= 1
	elif x > 0 and traceback[x][y] == UP_ARROW:
	new_a = a[x-1] + new_a
	new_b = '_' + new_b
	align = ' ' + align
	x -= 1
	else:
	new_b = b[y-1] + new_b
	new_a = '_' + new_a
	align = ' ' + align
	y -= 1

	return Alignment(new_a, new_b, align), grid, traceback


	def generate_sequence(mer):
	seq = ''
	for m in range(mer):
	seq += random.choice('ACGT')
	return seq


	def generate_sequences(k, mer):
	seqs = []
	for i in range(k):
	seqs.append(generate_sequence(mer))
	return seqs

	if __name__ == '__main__':
	sim_table = dict()
	sim_table["AA"] = 1
	sim_table["CC"] = 1
	sim_table["GG"] = 1
	sim_table["TT"] = 1

	sim_table["AC"] = -1
	sim_table["AG"] = -1
	sim_table["AT"] = -1

	sim_table["CA"] = -1
	sim_table["CG"] = -1
	sim_table["CT"] = -1

	sim_table["GA"] = -1
	sim_table["GC"] = -1
	sim_table["GT"] = -1

	sim_table["TA"] = -1
	sim_table["TC"] = -1
	sim_table["TG"] = -1

	initial_k = 1000
	mer_len = 100

	# Test Run
	print("Needleman-Wunsch Test Run: ")
	test_seq = "GCATGCT"
	test_query = "GATTACA"
	print("Seq: ", test_seq)
	print("Qry: ", test_query)
	print()
	test_res = needleman_wunsch(test_seq, test_query, sim_table, -1)
	print(test_res[1])
	print(test_res[2])
	print(test_res[0])
	print()

	print("Smith-Waterman Test Run: ")
	test_seq = "GCATGCT"
	test_query = "GATTACA"
	print("Seq: ", test_seq)
	print("Qry: ", test_query)
	print()
	test_res = smith_waterman(test_seq, test_query, sim_table, -1)
	print(test_res[1])
	print(test_res[2])
	print(test_res[0])
	print()

	# Problem 1 A/B
	print("Problem 1")
	for i in range(4):
	k = initial_k * 10**i
	print("Running {} {}-mer sequences".format(k, mer_len))
	seqs = generate_sequences(k, mer_len)
	query = generate_sequence(mer_len)
	t = datetime.datetime.now()
	for j in range(k):
	needleman_wunsch(seqs[j], query, sim_table, -3)
	t = datetime.datetime.now() - t
	print("{}s".format(t.seconds))
	print("Done")

	# Problem 2 A/B
	print("Problem 2")
	for i in range(4):
	k = initial_k * 10**i
	print("Running {} {}-mer sequences".format(k, mer_len))
	seqs = generate_sequences(k, mer_len)
	query = generate_sequence(mer_len)
	t = datetime.datetime.now()
	for j in range(k):
	smith_waterman(seqs[j], query, sim_table, -3)
	t = datetime.datetime.now() - t
	print("{}s".format(t.seconds))
	print("Done")
	Problem 1 (Needleman-Wunsch):
	1k: 14s
	10k: 146s = 2m26s
	100k: Projected 1460s = 24m20s
	1000k: Projected 14600s = 243m20s

	Problem 2 (Smith-Waterman):
	1k: 12s
	10k: 125 = 2m5s
	100k: Projected 1250s = 20m50s
	1000k: Projected 12500s = 208m20s