Last active
July 14, 2022 08:13
-
-
Save ktym/7a4799fb055436dc2139308dcab802f0 to your computer and use it in GitHub Desktop.
See https://github.com/ktym/dp for newer version.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
=begin | |
# Pairwise sequence alignment algorithms | |
Currently implements Needleman-Wunsch, Smith-Waterman | |
and Needleman-Wunsch-Gotoh algorithms based on | |
the Dynamic Programming (DP). | |
## Coordinates | |
To treat (i,j) as 1-based coordinates (instead of Ruby's 0-based), | |
use the following accessors: | |
* DP::Matrix#get(i,j), DP::Matrix#set(i,j) | |
* DP::Sequence#[i], DP::Sequence#[j] | |
``` | |
M(i,j) | Target A C C A G T | |
-------+-------------------------------------------------- | |
Query | (0,0) i=1 i=2 i=3 i=4 i=5 i=6 | |
A | j=1 (1,1) (2,1) (3,1) (4,1) (5,1) (6,1) | |
C | j=2 (1,2) (2,2) (3,2) (4,2) (5,2) (6,2) | |
A | j=3 (1,3) (2,3) (3,3) (4,3) (5,3) (6,3) | |
G | j=4 (1,4) (2,4) (3,4) (4,4) (5,4) (6,4) | |
C | j=5 (1,5) (2,5) (3,5) (4,5) (5,5) (6,5) | |
``` | |
## Accessors | |
Each sub-class has the following accessors to modify parameters | |
before the calculation of DP: | |
``` | |
aligner = DP.new(target, query) | |
aligner.gap_penalty = 2 # gap open penalty | |
aligner.ext_penalty = 1 # gap extention penalty | |
aligner.match = 1 | |
aligner.mis_match = -1 | |
``` | |
## Calculation | |
In the DP#dp method, init_matrix, calc_matrix and trace_back functions | |
are called which are required to be implemented in each sub-class. | |
``` | |
aligner.dp | |
# => call init_matrix(), calc_matrix() and trace_back() then print alignment | |
``` | |
## References | |
* Needleman SB, Wunsch CD: A general method applicable to the search for similarities in the amino acid sequence of two proteins. J Mol Biol 1970, 48:443-53. | |
* Smith TF, Waterman MS: Identification of common molecular subsequences. J Mol Biol 1981, 147:195-7. | |
* Gotoh O: An improved algorithm for matching biological sequences. J Mol Biol 1982, 162:705-8. | |
## Notes | |
* [Lecture notes by Terai](http://asailab.cb.k.u-tokyo.ac.jp/wp-content/uploads/2014/06/20181002_terai_pairwisealignment.mod2_.pdf) | |
* [Lecture notes by Asai](http://asailab.cb.k.u-tokyo.ac.jp/asai/lecture/H30Genome3_Alignment.pdf) | |
* [Lecture notes by Yokoyama](https://paper.dropbox.com/doc/--Af886MpyNXIKPsDWulpka5t1Ag-dqPm9SbT1DoLXnSyzu6lo) | |
* [DP basics in Ruby](https://www.jabba.cloud/20161020172918/) | |
## Copyright | |
MIT license, Toshiaki Katayama <ktym@dbcls.jp> | |
=end | |
# Generic interface | |
class DP | |
attr_accessor :gap_penalty, :ext_penalty, :match, :mis_match | |
def initialize(target, query) | |
@gap_penalty = 2 | |
@ext_penalty = 1 | |
@match = 1 | |
@mis_match = -1 | |
@target = Sequence.new(target) | |
@query = Sequence.new(query) | |
end | |
def s(x, y) | |
return x == y ? @match : @mis_match | |
end | |
def init_matrix | |
raise NotImplementedError.new("#{self.class}##{__method__}: Create @matrix = Matrix.new(tlen, qlen)") | |
end | |
def calc_matrix | |
raise NotImplementedError.new("#{self.class}##{__method__}: Update @matrix with .get() and .set()") | |
end | |
def trace_back | |
raise NotImplementedError.new("#{self.class}##{__method__}: Generate @target_align, @query_align") | |
end | |
class Sequence | |
def initialize(seq) | |
@seq = seq | |
end | |
def to_s | |
@seq | |
end | |
def [](one_base_position) | |
zero_base_position = one_base_position - 1 | |
@seq[zero_base_position] | |
end | |
def length | |
@seq.length | |
end | |
end | |
class Matrix | |
def initialize(x, y, default = 0) | |
@matrix = Array.new(x+1) { Array.new(y+1, default) } | |
end | |
def get(i, j) | |
@matrix[j][i] | |
end | |
def set(i, j, v) | |
@matrix[j][i] = v | |
end | |
end | |
def dp | |
puts "Target: #{@target}" | |
puts "Query: #{@query}" | |
init_matrix | |
pp @matrix if $DEBUG | |
calc_matrix | |
pp @matrix | |
trace_back | |
puts "Target: #{@target_align}" | |
puts "Query: #{@query_align}" | |
end | |
end | |
# Global alignment | |
class NeedlemanWunsch < DP | |
def init_matrix | |
@matrix = Matrix.new(@query.length, @target.length, 0) | |
1.upto(@target.length) do |i| | |
value = - (i * @gap_penalty) | |
@matrix.set(i, 0, value) | |
end | |
1.upto(@query.length) do |j| | |
value = - (j * @gap_penalty) | |
@matrix.set(0, j, value) | |
end | |
end | |
def calc_matrix | |
1.upto(@target.length) do |i| | |
1.upto(@query.length) do |j| | |
value = [ | |
@matrix.get(i-1, j-1) + s(@target[i], @query[j]), | |
@matrix.get(i, j-1) - @gap_penalty, | |
@matrix.get(i-1, j ) - @gap_penalty, | |
].max | |
@matrix.set(i, j, value) | |
end | |
end | |
end | |
def trace_back | |
i = @target.length | |
j = @query.length | |
target_trace = "" | |
query_trace = "" | |
score = @matrix.get(i, j) | |
while i > 0 or j > 0 | |
case score | |
when @matrix.get(i-1, j) - @gap_penalty | |
score = @matrix.get(i-1, j) | |
target_trace += @target[i] | |
query_trace += '-' | |
i -= 1 | |
when @matrix.get(i, j-1) - @gap_penalty | |
score = @matrix.get(i, j-1) | |
target_trace += '-' | |
query_trace += @query[j] | |
j -= 1 | |
else | |
score = @matrix.get(i-1, j-1) | |
target_trace += @target[i] | |
query_trace += @query[j] | |
i -= 1 | |
j -= 1 | |
end | |
end | |
@target_align = target_trace.reverse | |
@query_align = query_trace.reverse | |
end | |
end | |
# Local alignment | |
class SmithWaterman < DP | |
def init_matrix | |
@matrix = Matrix.new(@query.length, @target.length, 0) | |
end | |
def calc_matrix | |
@best = {:score => 0, :i => 0, :j => 0} | |
1.upto(@target.length) do |i| | |
1.upto(@query.length) do |j| | |
value = [ | |
@matrix.get(i-1, j-1) + s(@target[i], @query[j]), | |
@matrix.get(i, j-1) - @gap_penalty, | |
@matrix.get(i-1, j ) - @gap_penalty, | |
0, | |
].max | |
@matrix.set(i, j, value) | |
if @matrix.get(i, j) > @best[:score] | |
@best[:score] = @matrix.get(i, j) | |
@best[:i] = i | |
@best[:j] = j | |
end | |
end | |
end | |
end | |
def trace_back | |
i = @best[:i] | |
j = @best[:j] | |
target_trace = "" | |
query_trace = "" | |
score = @matrix.get(i, j) | |
while i > 0 or j > 0 | |
case score | |
when @matrix.get(i-1, j) - @gap_penalty | |
score = @matrix.get(i-1, j) | |
target_trace += @target[i] | |
query_trace += '-' | |
i -= 1 | |
when @matrix.get(i-1, j) - @gap_penalty | |
score = @matrix.get(i, j-1) | |
target_trace += '-' | |
query_trace += @query[j] | |
j -= 1 | |
else | |
score = @matrix.get(i-1, j-1) | |
target_trace += @target[i] | |
query_trace += @query[j] | |
break if score == 0 | |
i -= 1 | |
j -= 1 | |
end | |
end | |
@target_align = target_trace.reverse | |
@query_align = query_trace.reverse | |
end | |
end | |
# Global alignment with affine gap penalty | |
class NeedlemanWunschGotoh < DP | |
Inf = Float::INFINITY | |
def init_matrix | |
@matrix = { | |
:m => Matrix.new(@query.length, @target.length), | |
:x => Matrix.new(@query.length, @target.length), | |
:y => Matrix.new(@query.length, @target.length), | |
} | |
@matrix[:m].set(0, 0, 0) | |
@matrix[:x].set(0, 0, -Inf) | |
@matrix[:y].set(0, 0, -Inf) | |
1.upto(@target.length) do |i| | |
@matrix[:m].set(i, 0, -Inf) | |
@matrix[:y].set(i, 0, -Inf) | |
value = - @gap_penalty - @ext_penalty * (i-1) | |
@matrix[:x].set(i, 0, value) | |
end | |
1.upto(@query.length) do |j| | |
@matrix[:m].set(0, j, -Inf) | |
@matrix[:x].set(0, j, -Inf) | |
value = - @gap_penalty - @ext_penalty * (j-1) | |
@matrix[:y].set(0, j, value) | |
end | |
end | |
def calc_matrix | |
1.upto(@target.length) do |i| | |
1.upto(@query.length) do |j| | |
s_value = s(@target[i], @query[j]) | |
value_m = [ | |
@matrix[:m].get(i-1, j-1) + s_value, | |
@matrix[:x].get(i-1, j-1) + s_value, | |
@matrix[:y].get(i-1, j-1) + s_value, | |
].max | |
@matrix[:m].set(i, j, value_m) | |
value_x = [ | |
@matrix[:m].get(i-1, j) - @gap_penalty, | |
@matrix[:x].get(i-1, j) - @ext_penalty, | |
].max | |
@matrix[:x].set(i, j, value_x) | |
value_y = [ | |
@matrix[:m].get(i, j-1) - @gap_penalty, | |
@matrix[:y].get(i, j-1) - @ext_penalty, | |
].max | |
@matrix[:y].set(i, j, value_y) | |
end | |
end | |
end | |
def trace_back | |
i = @target.length | |
j = @query.length | |
score = { | |
:m => @matrix[:m].get(i, j), | |
:x => @matrix[:x].get(i, j), | |
:y => @matrix[:y].get(i, j), | |
} | |
cur = score.max{|a,b| a[1] <=> b[1]}.first | |
target_trace = "" | |
query_trace = "" | |
while i > 0 or j > 0 | |
score = @matrix[cur].get(i, j) | |
s_value = s(@target[i], @query[j]) | |
case cur | |
when :m | |
case score | |
when @matrix[:m].get(i-1, j-1) + s_value | |
score = @matrix[:m].get(i-1, j-1) | |
cur = :m | |
when @matrix[:x].get(i-1, j-1) + s_value | |
score = @matrix[:x].get(i-1, j-1) | |
cur = :x | |
when @matrix[:y].get(i-1, j-1) + s_value | |
score = @matrix[:y].get(i-1, j-1) | |
cur = :y | |
end | |
target_trace += @target[i] | |
query_trace += @query[j] | |
i -= 1 | |
j -= 1 | |
when :x | |
case score | |
when @matrix[:m].get(i-1, j) - @gap_penalty | |
score = @matrix[:m].get(i-1, j) | |
cur = :m | |
when @matrix[:x].get(i-1, j) - @ext_penalty | |
score = @matrix[:x].get(i-1, j) | |
cur = :x | |
end | |
target_trace += @target[i] | |
query_trace += '-' | |
i -= 1 | |
when :y | |
case score | |
when @matrix[:m].get(i, j-1) - @gap_penalty | |
score = @matrix[:m].get(i, j-1) | |
cur = :m | |
when @matrix[:y].get(i, j-1) - @ext_penalty | |
score = @matrix[:y].get(i, j-1) | |
cur = :y | |
end | |
target_trace += '-' | |
query_trace += @query[j] | |
j -= 1 | |
end | |
end | |
@target_align = target_trace.reverse | |
@query_align = query_trace.reverse | |
end | |
end | |
if __FILE__ == $0 | |
target = ARGV.shift || 'ACCAGT' | |
query = ARGV.shift || 'ACAGC' | |
puts "# Needleman-Wunsch" | |
nw = NeedlemanWunsch.new(target, query) | |
nw.dp | |
puts "# Smith-Waterman" | |
sw = SmithWaterman.new(target, query) | |
sw.dp | |
puts "# Needleman-Wunsch-Gotoh" | |
nwg = NeedlemanWunschGotoh.new(target, query) | |
nwg.dp | |
end | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Bug fixed:)
Needleman-Wunsch
Target: ACCAGT
Query: ACAGC
#<DP::Matrix:0x00007fb6cd885ff0
@matrix=
[[0, -2, -4, -6, -8, -10, -12],
[-2, 1, -1, -3, -5, -7, -9],
[-4, -1, 2, 0, -2, -4, -6],
[-6, -3, 0, 1, 1, -1, -3],
[-8, -5, -2, -1, 0, 2, 0],
[-10, -7, -4, -1, -2, 0, 1]]>
Target: ACCAGT
Query: AC-AGC
Smith-Waterman
Target: ACCAGT
Query: ACAGC
#<DP::Matrix:0x00007fb6cd110d10
@matrix=
[[0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 1, 0, 0],
[0, 0, 2, 1, 0, 0, 0],
[0, 1, 0, 1, 2, 0, 0],
[0, 0, 0, 0, 0, 3, 1],
[0, 0, 1, 1, 0, 1, 2]]>
Target: CAG
Query: CAG
Needleman-Wunsch-Gotoh
Target: ACCAGT
Query: ACAGC
{:m=>
#<DP::Matrix:0x00007fb6cd1366a0
@matrix=
[[0, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity],
[-Infinity, 1, -3, -4, -3, -6, -7],
[-Infinity, -3, 2, 0, -3, -4, -5],
[-Infinity, -2, -2, 1, 1, -2, -3],
[-Infinity, -5, -3, -1, 0, 2, -2],
[-Infinity, -6, -2, 0, -2, -1, 1]]>,
:x=>
#<DP::Matrix:0x00007fb6cd1361a0
@matrix=
[[-Infinity, -2, -3, -4, -5, -6, -7],
[-Infinity, -Infinity, -1, -2, -3, -4, -5],
[-Infinity, -Infinity, -5, 0, -1, -2, -3],
[-Infinity, -Infinity, -4, -4, -1, -1, -2],
[-Infinity, -Infinity, -7, -5, -3, -2, 0],
[-Infinity, -Infinity, -8, -4, -2, -3, -3]]>,
:y=>
#<DP::Matrix:0x00007fb6cd135cc8
@matrix=
[[-Infinity,
-Infinity,
-Infinity,
-Infinity,
-Infinity,
-Infinity,
-Infinity],
[-2, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity],
[-3, -1, -5, -6, -5, -8, -9],
[-4, -2, 0, -2, -5, -6, -7],
[-5, -3, -1, -1, -1, -4, -5],
[-6, -4, -2, -2, -2, 0, -4]]>}
Target: ACCAGT
Query: A-CAGC