Skip to content

Instantly share code, notes, and snippets.

@GiantDarth
Last active February 27, 2016 05:45
Show Gist options
  • Save GiantDarth/dcc011ac762544a33a2b to your computer and use it in GitHub Desktop.
Save GiantDarth/dcc011ac762544a33a2b to your computer and use it in GitHub Desktop.
Needleman-Wunsch vs Smith-Waterman
#!/usr/bin/python3
# Copyright (c) 2016 Christopher Philabaum
# CS599 - HW3 Needleman-Wunsch & Smith-Waterman
import random, datetime
class Alignment:
def __init__(self, a, b, align):
self._a = a
self._b = b
self._align = align
def __str__(self):
return self._a + '\n' + self._align + '\n' + self._b
NO_DIRECTION = '•'
UP_ARROW = '↑'
LEFT_ARROW = '←'
DIAGONAL_ARROW = '↖'
def needleman_wunsch(a, b, sim_table, gap_penalty):
# Initialize grid
grid = []
traceback = []
num_rows = len(a) + 1
num_cols = len(b) + 1
for i in range(num_rows):
grid.append([0] * num_cols)
traceback.append([''] * num_cols)
# Set first row based on gaps
for i in range(num_rows):
grid[i][0] = i * gap_penalty
# Set first col based on gaps
for j in range(num_cols):
grid[0][j] = j * gap_penalty
# Set initial traceback
traceback[0][0] = NO_DIRECTION
for i in range(1, num_rows):
traceback[i][0] = UP_ARROW
for j in range(1, num_cols):
traceback[0][j] = LEFT_ARROW
ARROWS = (DIAGONAL_ARROW, UP_ARROW, LEFT_ARROW)
for i in range(1, num_rows):
for j in range(1, num_cols):
# A faster way of getting the max and the max's index without multiple checks
# Combines ('↖', '↑', '←') & the previous values, produces:
# (('↖', val of diag), ('↑', val of above), ('←', val of left)
# and only maxes by the second items.
# Doesn't use touch memory unnecessarily, and does no additional checks
traceback[i][j], grid[i][j] = max(
zip(ARROWS, (grid[i-1][j-1] + sim_table[a[i-1] + b[j-1]],
grid[i-1][j] + gap_penalty,
grid[i][j-1] + gap_penalty)),
key=lambda p: p[1]
)
new_a = ''
new_b = ''
align = ''
x = num_rows - 1
y = num_cols - 1
while traceback[x][y] != NO_DIRECTION:
if x > 0 and y > 0 and traceback[x][y] == DIAGONAL_ARROW:
new_a = a[x-1] + new_a
new_b = b[y-1] + new_b
if a[x-1] == b[y-1]:
align = '|' + align
else:
align = 'X' + align
x -= 1
y -= 1
elif x > 0 and traceback[x][y] == UP_ARROW:
new_a = a[x-1] + new_a
new_b = '_' + new_b
align = ' ' + align
x -= 1
else:
new_b = b[y-1] + new_b
new_a = '_' + new_a
align = ' ' + align
y -= 1
return Alignment(new_a, new_b, align), grid, traceback
def smith_waterman(a, b, sim_table, gap_penalty):
# Initialize grid
grid = []
traceback = []
num_rows = len(a) + 1
num_cols = len(b) + 1
for i in range(num_rows):
grid.append([0] * num_cols)
traceback.append([''] * num_cols)
# Set initial traceback
traceback[0][0] = NO_DIRECTION
for i in range(1, num_rows):
traceback[i][0] = NO_DIRECTION
for j in range(1, num_cols):
traceback[0][j] = NO_DIRECTION
ARROWS = (NO_DIRECTION, DIAGONAL_ARROW, UP_ARROW, LEFT_ARROW)
for i in range(1, num_rows):
for j in range(1, num_cols):
# A faster way of getting the max and the max's index without multiple checks
# Combines ('↖', '↑', '←') & the previous values, produces:
# (('↖', val of diag), ('↑', val of above), ('←', val of left)
# and only maxes by the second items.
# Doesn't use touch memory unnecessarily, and does no additional checks
traceback[i][j], grid[i][j] = max(
zip(ARROWS, (0,
grid[i-1][j-1] + sim_table[a[i-1] + b[j-1]],
grid[i-1][j] + gap_penalty,
grid[i][j-1] + gap_penalty)),
key=lambda p: p[1]
)
new_a = ''
new_b = ''
align = ''
x = num_rows - 1
y = num_cols - 1
while traceback[x][y] != NO_DIRECTION:
if x > 0 and y > 0 and traceback[x][y] == DIAGONAL_ARROW:
new_a = a[x-1] + new_a
new_b = b[y-1] + new_b
if a[x-1] == b[y-1]:
align = '|' + align
else:
align = 'X' + align
x -= 1
y -= 1
elif x > 0 and traceback[x][y] == UP_ARROW:
new_a = a[x-1] + new_a
new_b = '_' + new_b
align = ' ' + align
x -= 1
else:
new_b = b[y-1] + new_b
new_a = '_' + new_a
align = ' ' + align
y -= 1
return Alignment(new_a, new_b, align), grid, traceback
def generate_sequence(mer):
seq = ''
for m in range(mer):
seq += random.choice('ACGT')
return seq
def generate_sequences(k, mer):
seqs = []
for i in range(k):
seqs.append(generate_sequence(mer))
return seqs
if __name__ == '__main__':
sim_table = dict()
sim_table["AA"] = 1
sim_table["CC"] = 1
sim_table["GG"] = 1
sim_table["TT"] = 1
sim_table["AC"] = -1
sim_table["AG"] = -1
sim_table["AT"] = -1
sim_table["CA"] = -1
sim_table["CG"] = -1
sim_table["CT"] = -1
sim_table["GA"] = -1
sim_table["GC"] = -1
sim_table["GT"] = -1
sim_table["TA"] = -1
sim_table["TC"] = -1
sim_table["TG"] = -1
initial_k = 1000
mer_len = 100
# Test Run
print("Needleman-Wunsch Test Run: ")
test_seq = "GCATGCT"
test_query = "GATTACA"
print("Seq: ", test_seq)
print("Qry: ", test_query)
print()
test_res = needleman_wunsch(test_seq, test_query, sim_table, -1)
print(test_res[1])
print(test_res[2])
print(test_res[0])
print()
print("Smith-Waterman Test Run: ")
test_seq = "GCATGCT"
test_query = "GATTACA"
print("Seq: ", test_seq)
print("Qry: ", test_query)
print()
test_res = smith_waterman(test_seq, test_query, sim_table, -1)
print(test_res[1])
print(test_res[2])
print(test_res[0])
print()
# Problem 1 A/B
print("Problem 1")
for i in range(4):
k = initial_k * 10**i
print("Running {} {}-mer sequences".format(k, mer_len))
seqs = generate_sequences(k, mer_len)
query = generate_sequence(mer_len)
t = datetime.datetime.now()
for j in range(k):
needleman_wunsch(seqs[j], query, sim_table, -3)
t = datetime.datetime.now() - t
print("{}s".format(t.seconds))
print("Done")
# Problem 2 A/B
print("Problem 2")
for i in range(4):
k = initial_k * 10**i
print("Running {} {}-mer sequences".format(k, mer_len))
seqs = generate_sequences(k, mer_len)
query = generate_sequence(mer_len)
t = datetime.datetime.now()
for j in range(k):
smith_waterman(seqs[j], query, sim_table, -3)
t = datetime.datetime.now() - t
print("{}s".format(t.seconds))
print("Done")
Problem 1 (Needleman-Wunsch):
1k: 14s
10k: 146s = 2m26s
100k: Projected 1460s = 24m20s
1000k: Projected 14600s = 243m20s
Problem 2 (Smith-Waterman):
1k: 12s
10k: 125 = 2m5s
100k: Projected 1250s = 20m50s
1000k: Projected 12500s = 208m20s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment