-
-
Save ktym/7a4799fb055436dc2139308dcab802f0 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby | |
=begin | |
# Pairwise sequence alignment algorithms | |
Currently implements Needleman-Wunsch, Smith-Waterman | |
and Needleman-Wunsch-Gotoh algorithms based on | |
the Dynamic Programming (DP). | |
## Coordinates | |
To treat (i,j) as 1-based coordinates (instead of Ruby's 0-based), | |
use the following accessors: | |
* DP::Matrix#get(i,j), DP::Matrix#set(i,j) | |
* DP::Sequence#[i], DP::Sequence#[j] | |
``` | |
M(i,j) | Target A C C A G T | |
-------+-------------------------------------------------- | |
Query | (0,0) i=1 i=2 i=3 i=4 i=5 i=6 | |
A | j=1 (1,1) (2,1) (3,1) (4,1) (5,1) (6,1) | |
C | j=2 (1,2) (2,2) (3,2) (4,2) (5,2) (6,2) | |
A | j=3 (1,3) (2,3) (3,3) (4,3) (5,3) (6,3) | |
G | j=4 (1,4) (2,4) (3,4) (4,4) (5,4) (6,4) | |
C | j=5 (1,5) (2,5) (3,5) (4,5) (5,5) (6,5) | |
``` | |
## Accessors | |
Each sub-class has the following accessors to modify parameters | |
before the calculation of DP: | |
``` | |
aligner = DP.new(target, query) | |
aligner.gap_penalty = 2 # gap open penalty | |
aligner.ext_penalty = 1 # gap extention penalty | |
aligner.match = 1 | |
aligner.mis_match = -1 | |
``` | |
## Calculation | |
In the DP#dp method, init_matrix, calc_matrix and trace_back functions | |
are called which are required to be implemented in each sub-class. | |
``` | |
aligner.dp | |
# => call init_matrix(), calc_matrix() and trace_back() then print alignment | |
``` | |
## References | |
* Needleman SB, Wunsch CD: A general method applicable to the search for similarities in the amino acid sequence of two proteins. J Mol Biol 1970, 48:443-53. | |
* Smith TF, Waterman MS: Identification of common molecular subsequences. J Mol Biol 1981, 147:195-7. | |
* Gotoh O: An improved algorithm for matching biological sequences. J Mol Biol 1982, 162:705-8. | |
## Notes | |
* [Lecture notes by Terai](http://asailab.cb.k.u-tokyo.ac.jp/wp-content/uploads/2014/06/20181002_terai_pairwisealignment.mod2_.pdf) | |
* [Lecture notes by Asai](http://asailab.cb.k.u-tokyo.ac.jp/asai/lecture/H30Genome3_Alignment.pdf) | |
* [Lecture notes by Yokoyama](https://paper.dropbox.com/doc/--Af886MpyNXIKPsDWulpka5t1Ag-dqPm9SbT1DoLXnSyzu6lo) | |
* [DP basics in Ruby](https://www.jabba.cloud/20161020172918/) | |
## Copyright | |
MIT license, Toshiaki Katayama <ktym@dbcls.jp> | |
=end | |
# Generic interface | |
class DP | |
attr_accessor :gap_penalty, :ext_penalty, :match, :mis_match | |
def initialize(target, query) | |
@gap_penalty = 2 | |
@ext_penalty = 1 | |
@match = 1 | |
@mis_match = -1 | |
@target = Sequence.new(target) | |
@query = Sequence.new(query) | |
end | |
def s(x, y) | |
return x == y ? @match : @mis_match | |
end | |
def init_matrix | |
raise NotImplementedError.new("#{self.class}##{__method__}: Create @matrix = Matrix.new(tlen, qlen)") | |
end | |
def calc_matrix | |
raise NotImplementedError.new("#{self.class}##{__method__}: Update @matrix with .get() and .set()") | |
end | |
def trace_back | |
raise NotImplementedError.new("#{self.class}##{__method__}: Generate @target_align, @query_align") | |
end | |
class Sequence | |
def initialize(seq) | |
@seq = seq | |
end | |
def to_s | |
@seq | |
end | |
def [](one_base_position) | |
zero_base_position = one_base_position - 1 | |
@seq[zero_base_position] | |
end | |
def length | |
@seq.length | |
end | |
end | |
class Matrix | |
def initialize(x, y, default = 0) | |
@matrix = Array.new(x+1) { Array.new(y+1, default) } | |
end | |
def get(i, j) | |
@matrix[j][i] | |
end | |
def set(i, j, v) | |
@matrix[j][i] = v | |
end | |
end | |
def dp | |
puts "Target: #{@target}" | |
puts "Query: #{@query}" | |
init_matrix | |
pp @matrix if $DEBUG | |
calc_matrix | |
pp @matrix | |
trace_back | |
puts "Target: #{@target_align}" | |
puts "Query: #{@query_align}" | |
end | |
end | |
# Global alignment | |
class NeedlemanWunsch < DP | |
def init_matrix | |
@matrix = Matrix.new(@query.length, @target.length, 0) | |
1.upto(@target.length) do |i| | |
value = - (i * @gap_penalty) | |
@matrix.set(i, 0, value) | |
end | |
1.upto(@query.length) do |j| | |
value = - (j * @gap_penalty) | |
@matrix.set(0, j, value) | |
end | |
end | |
def calc_matrix | |
1.upto(@target.length) do |i| | |
1.upto(@query.length) do |j| | |
value = [ | |
@matrix.get(i-1, j-1) + s(@target[i], @query[j]), | |
@matrix.get(i, j-1) - @gap_penalty, | |
@matrix.get(i-1, j ) - @gap_penalty, | |
].max | |
@matrix.set(i, j, value) | |
end | |
end | |
end | |
def trace_back | |
i = @target.length | |
j = @query.length | |
target_trace = "" | |
query_trace = "" | |
score = @matrix.get(i, j) | |
while i > 0 or j > 0 | |
case score | |
when @matrix.get(i-1, j) - @gap_penalty | |
score = @matrix.get(i-1, j) | |
target_trace += @target[i] | |
query_trace += '-' | |
i -= 1 | |
when @matrix.get(i, j-1) - @gap_penalty | |
score = @matrix.get(i, j-1) | |
target_trace += '-' | |
query_trace += @query[j] | |
j -= 1 | |
else | |
score = @matrix.get(i-1, j-1) | |
target_trace += @target[i] | |
query_trace += @query[j] | |
i -= 1 | |
j -= 1 | |
end | |
end | |
@target_align = target_trace.reverse | |
@query_align = query_trace.reverse | |
end | |
end | |
# Local alignment | |
class SmithWaterman < DP | |
def init_matrix | |
@matrix = Matrix.new(@query.length, @target.length, 0) | |
end | |
def calc_matrix | |
@best = {:score => 0, :i => 0, :j => 0} | |
1.upto(@target.length) do |i| | |
1.upto(@query.length) do |j| | |
value = [ | |
@matrix.get(i-1, j-1) + s(@target[i], @query[j]), | |
@matrix.get(i, j-1) - @gap_penalty, | |
@matrix.get(i-1, j ) - @gap_penalty, | |
0, | |
].max | |
@matrix.set(i, j, value) | |
if @matrix.get(i, j) > @best[:score] | |
@best[:score] = @matrix.get(i, j) | |
@best[:i] = i | |
@best[:j] = j | |
end | |
end | |
end | |
end | |
def trace_back | |
i = @best[:i] | |
j = @best[:j] | |
target_trace = "" | |
query_trace = "" | |
score = @matrix.get(i, j) | |
while i > 0 or j > 0 | |
case score | |
when @matrix.get(i-1, j) - @gap_penalty | |
score = @matrix.get(i-1, j) | |
target_trace += @target[i] | |
query_trace += '-' | |
i -= 1 | |
when @matrix.get(i-1, j) - @gap_penalty | |
score = @matrix.get(i, j-1) | |
target_trace += '-' | |
query_trace += @query[j] | |
j -= 1 | |
else | |
score = @matrix.get(i-1, j-1) | |
target_trace += @target[i] | |
query_trace += @query[j] | |
break if score == 0 | |
i -= 1 | |
j -= 1 | |
end | |
end | |
@target_align = target_trace.reverse | |
@query_align = query_trace.reverse | |
end | |
end | |
# Global alignment with affine gap penalty | |
class NeedlemanWunschGotoh < DP | |
Inf = Float::INFINITY | |
def init_matrix | |
@matrix = { | |
:m => Matrix.new(@query.length, @target.length), | |
:x => Matrix.new(@query.length, @target.length), | |
:y => Matrix.new(@query.length, @target.length), | |
} | |
@matrix[:m].set(0, 0, 0) | |
@matrix[:x].set(0, 0, -Inf) | |
@matrix[:y].set(0, 0, -Inf) | |
1.upto(@target.length) do |i| | |
@matrix[:m].set(i, 0, -Inf) | |
@matrix[:y].set(i, 0, -Inf) | |
value = - @gap_penalty - @ext_penalty * (i-1) | |
@matrix[:x].set(i, 0, value) | |
end | |
1.upto(@query.length) do |j| | |
@matrix[:m].set(0, j, -Inf) | |
@matrix[:x].set(0, j, -Inf) | |
value = - @gap_penalty - @ext_penalty * (j-1) | |
@matrix[:y].set(0, j, value) | |
end | |
end | |
def calc_matrix | |
1.upto(@target.length) do |i| | |
1.upto(@query.length) do |j| | |
s_value = s(@target[i], @query[j]) | |
value_m = [ | |
@matrix[:m].get(i-1, j-1) + s_value, | |
@matrix[:x].get(i-1, j-1) + s_value, | |
@matrix[:y].get(i-1, j-1) + s_value, | |
].max | |
@matrix[:m].set(i, j, value_m) | |
value_x = [ | |
@matrix[:m].get(i-1, j) - @gap_penalty, | |
@matrix[:x].get(i-1, j) - @ext_penalty, | |
].max | |
@matrix[:x].set(i, j, value_x) | |
value_y = [ | |
@matrix[:m].get(i, j-1) - @gap_penalty, | |
@matrix[:y].get(i, j-1) - @ext_penalty, | |
].max | |
@matrix[:y].set(i, j, value_y) | |
end | |
end | |
end | |
def trace_back | |
i = @target.length | |
j = @query.length | |
score = { | |
:m => @matrix[:m].get(i, j), | |
:x => @matrix[:x].get(i, j), | |
:y => @matrix[:y].get(i, j), | |
} | |
cur = score.max{|a,b| a[1] <=> b[1]}.first | |
target_trace = "" | |
query_trace = "" | |
while i > 0 or j > 0 | |
score = @matrix[cur].get(i, j) | |
s_value = s(@target[i], @query[j]) | |
case cur | |
when :m | |
case score | |
when @matrix[:m].get(i-1, j-1) + s_value | |
score = @matrix[:m].get(i-1, j-1) | |
cur = :m | |
when @matrix[:x].get(i-1, j-1) + s_value | |
score = @matrix[:x].get(i-1, j-1) | |
cur = :x | |
when @matrix[:y].get(i-1, j-1) + s_value | |
score = @matrix[:y].get(i-1, j-1) | |
cur = :y | |
end | |
target_trace += @target[i] | |
query_trace += @query[j] | |
i -= 1 | |
j -= 1 | |
when :x | |
case score | |
when @matrix[:m].get(i-1, j) - @gap_penalty | |
score = @matrix[:m].get(i-1, j) | |
cur = :m | |
when @matrix[:x].get(i-1, j) - @ext_penalty | |
score = @matrix[:x].get(i-1, j) | |
cur = :x | |
end | |
target_trace += @target[i] | |
query_trace += '-' | |
i -= 1 | |
when :y | |
case score | |
when @matrix[:m].get(i, j-1) - @gap_penalty | |
score = @matrix[:m].get(i, j-1) | |
cur = :m | |
when @matrix[:y].get(i, j-1) - @ext_penalty | |
score = @matrix[:y].get(i, j-1) | |
cur = :y | |
end | |
target_trace += '-' | |
query_trace += @query[j] | |
j -= 1 | |
end | |
end | |
@target_align = target_trace.reverse | |
@query_align = query_trace.reverse | |
end | |
end | |
if __FILE__ == $0 | |
target = ARGV.shift || 'ACCAGT' | |
query = ARGV.shift || 'ACAGC' | |
puts "# Needleman-Wunsch" | |
nw = NeedlemanWunsch.new(target, query) | |
nw.dp | |
puts "# Smith-Waterman" | |
sw = SmithWaterman.new(target, query) | |
sw.dp | |
puts "# Needleman-Wunsch-Gotoh" | |
nwg = NeedlemanWunschGotoh.new(target, query) | |
nwg.dp | |
end | |
ktym
commented
Jun 30, 2019
Bug fixed:)
Needleman-Wunsch
Target: ACCAGT
Query: ACAGC
#<DP::Matrix:0x00007fb6cd885ff0
@matrix=
[[0, -2, -4, -6, -8, -10, -12],
[-2, 1, -1, -3, -5, -7, -9],
[-4, -1, 2, 0, -2, -4, -6],
[-6, -3, 0, 1, 1, -1, -3],
[-8, -5, -2, -1, 0, 2, 0],
[-10, -7, -4, -1, -2, 0, 1]]>
Target: ACCAGT
Query: AC-AGC
Smith-Waterman
Target: ACCAGT
Query: ACAGC
#<DP::Matrix:0x00007fb6cd110d10
@matrix=
[[0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 1, 0, 0],
[0, 0, 2, 1, 0, 0, 0],
[0, 1, 0, 1, 2, 0, 0],
[0, 0, 0, 0, 0, 3, 1],
[0, 0, 1, 1, 0, 1, 2]]>
Target: CAG
Query: CAG
Needleman-Wunsch-Gotoh
Target: ACCAGT
Query: ACAGC
{:m=>
#<DP::Matrix:0x00007fb6cd1366a0
@matrix=
[[0, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity],
[-Infinity, 1, -3, -4, -3, -6, -7],
[-Infinity, -3, 2, 0, -3, -4, -5],
[-Infinity, -2, -2, 1, 1, -2, -3],
[-Infinity, -5, -3, -1, 0, 2, -2],
[-Infinity, -6, -2, 0, -2, -1, 1]]>,
:x=>
#<DP::Matrix:0x00007fb6cd1361a0
@matrix=
[[-Infinity, -2, -3, -4, -5, -6, -7],
[-Infinity, -Infinity, -1, -2, -3, -4, -5],
[-Infinity, -Infinity, -5, 0, -1, -2, -3],
[-Infinity, -Infinity, -4, -4, -1, -1, -2],
[-Infinity, -Infinity, -7, -5, -3, -2, 0],
[-Infinity, -Infinity, -8, -4, -2, -3, -3]]>,
:y=>
#<DP::Matrix:0x00007fb6cd135cc8
@matrix=
[[-Infinity,
-Infinity,
-Infinity,
-Infinity,
-Infinity,
-Infinity,
-Infinity],
[-2, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity],
[-3, -1, -5, -6, -5, -8, -9],
[-4, -2, 0, -2, -5, -6, -7],
[-5, -3, -1, -1, -1, -4, -5],
[-6, -4, -2, -2, -2, 0, -4]]>}
Target: ACCAGT
Query: A-CAGC
Moved to https://github.com/ktym/dp