{{ message }}

Instantly share code, notes, and snippets.

# ktym/dp.rb

Last active Jul 11, 2019
 #!/usr/bin/env ruby =begin # Pairwise sequence alignment algorithms Currently implements Needleman-Wunsch, Smith-Waterman and Needleman-Wunsch-Gotoh algorithms based on the Dynamic Programming (DP). ## Coordinates To treat (i,j) as 1-based coordinates (instead of Ruby's 0-based), use the following accessors: * DP::Matrix#get(i,j), DP::Matrix#set(i,j) * DP::Sequence#[i], DP::Sequence#[j] ``` M(i,j) | Target A C C A G T -------+-------------------------------------------------- Query | (0,0) i=1 i=2 i=3 i=4 i=5 i=6 A | j=1 (1,1) (2,1) (3,1) (4,1) (5,1) (6,1) C | j=2 (1,2) (2,2) (3,2) (4,2) (5,2) (6,2) A | j=3 (1,3) (2,3) (3,3) (4,3) (5,3) (6,3) G | j=4 (1,4) (2,4) (3,4) (4,4) (5,4) (6,4) C | j=5 (1,5) (2,5) (3,5) (4,5) (5,5) (6,5) ``` ## Accessors Each sub-class has the following accessors to modify parameters before the calculation of DP: ``` aligner = DP.new(target, query) aligner.gap_penalty = 2 # gap open penalty aligner.ext_penalty = 1 # gap extention penalty aligner.match = 1 aligner.mis_match = -1 ``` ## Calculation In the DP#dp method, init_matrix, calc_matrix and trace_back functions are called which are required to be implemented in each sub-class. ``` aligner.dp # => call init_matrix(), calc_matrix() and trace_back() then print alignment ``` ## References * Needleman SB, Wunsch CD: A general method applicable to the search for similarities in the amino acid sequence of two proteins. J Mol Biol 1970, 48:443-53. * Smith TF, Waterman MS: Identification of common molecular subsequences. J Mol Biol 1981, 147:195-7. * Gotoh O: An improved algorithm for matching biological sequences. J Mol Biol 1982, 162:705-8. ## Notes * [Lecture notes by Terai](http://asailab.cb.k.u-tokyo.ac.jp/wp-content/uploads/2014/06/20181002_terai_pairwisealignment.mod2_.pdf) * [Lecture notes by Asai](http://asailab.cb.k.u-tokyo.ac.jp/asai/lecture/H30Genome3_Alignment.pdf) * [Lecture notes by Yokoyama](https://paper.dropbox.com/doc/--Af886MpyNXIKPsDWulpka5t1Ag-dqPm9SbT1DoLXnSyzu6lo) * [DP basics in Ruby](https://www.jabba.cloud/20161020172918/) ## Copyright MIT license, Toshiaki Katayama =end # Generic interface class DP attr_accessor :gap_penalty, :ext_penalty, :match, :mis_match def initialize(target, query) @gap_penalty = 2 @ext_penalty = 1 @match = 1 @mis_match = -1 @target = Sequence.new(target) @query = Sequence.new(query) end def s(x, y) return x == y ? @match : @mis_match end def init_matrix raise NotImplementedError.new("#{self.class}##{__method__}: Create @matrix = Matrix.new(tlen, qlen)") end def calc_matrix raise NotImplementedError.new("#{self.class}##{__method__}: Update @matrix with .get() and .set()") end def trace_back raise NotImplementedError.new("#{self.class}##{__method__}: Generate @target_align, @query_align") end class Sequence def initialize(seq) @seq = seq end def to_s @seq end def [](one_base_position) zero_base_position = one_base_position - 1 @seq[zero_base_position] end def length @seq.length end end class Matrix def initialize(x, y, default = 0) @matrix = Array.new(x+1) { Array.new(y+1, default) } end def get(i, j) @matrix[j][i] end def set(i, j, v) @matrix[j][i] = v end end def dp puts "Target: #{@target}" puts "Query: #{@query}" init_matrix pp @matrix if \$DEBUG calc_matrix pp @matrix trace_back puts "Target: #{@target_align}" puts "Query: #{@query_align}" end end # Global alignment class NeedlemanWunsch < DP def init_matrix @matrix = Matrix.new(@query.length, @target.length, 0) 1.upto(@target.length) do |i| value = - (i * @gap_penalty) @matrix.set(i, 0, value) end 1.upto(@query.length) do |j| value = - (j * @gap_penalty) @matrix.set(0, j, value) end end def calc_matrix 1.upto(@target.length) do |i| 1.upto(@query.length) do |j| value = [ @matrix.get(i-1, j-1) + s(@target[i], @query[j]), @matrix.get(i, j-1) - @gap_penalty, @matrix.get(i-1, j ) - @gap_penalty, ].max @matrix.set(i, j, value) end end end def trace_back i = @target.length j = @query.length target_trace = "" query_trace = "" score = @matrix.get(i, j) while i > 0 or j > 0 case score when @matrix.get(i-1, j) - @gap_penalty score = @matrix.get(i-1, j) target_trace += @target[i] query_trace += '-' i -= 1 when @matrix.get(i, j-1) - @gap_penalty score = @matrix.get(i, j-1) target_trace += '-' query_trace += @query[j] j -= 1 else score = @matrix.get(i-1, j-1) target_trace += @target[i] query_trace += @query[j] i -= 1 j -= 1 end end @target_align = target_trace.reverse @query_align = query_trace.reverse end end # Local alignment class SmithWaterman < DP def init_matrix @matrix = Matrix.new(@query.length, @target.length, 0) end def calc_matrix @best = {:score => 0, :i => 0, :j => 0} 1.upto(@target.length) do |i| 1.upto(@query.length) do |j| value = [ @matrix.get(i-1, j-1) + s(@target[i], @query[j]), @matrix.get(i, j-1) - @gap_penalty, @matrix.get(i-1, j ) - @gap_penalty, 0, ].max @matrix.set(i, j, value) if @matrix.get(i, j) > @best[:score] @best[:score] = @matrix.get(i, j) @best[:i] = i @best[:j] = j end end end end def trace_back i = @best[:i] j = @best[:j] target_trace = "" query_trace = "" score = @matrix.get(i, j) while i > 0 or j > 0 case score when @matrix.get(i-1, j) - @gap_penalty score = @matrix.get(i-1, j) target_trace += @target[i] query_trace += '-' i -= 1 when @matrix.get(i-1, j) - @gap_penalty score = @matrix.get(i, j-1) target_trace += '-' query_trace += @query[j] j -= 1 else score = @matrix.get(i-1, j-1) target_trace += @target[i] query_trace += @query[j] break if score == 0 i -= 1 j -= 1 end end @target_align = target_trace.reverse @query_align = query_trace.reverse end end # Global alignment with affine gap penalty class NeedlemanWunschGotoh < DP Inf = Float::INFINITY def init_matrix @matrix = { :m => Matrix.new(@query.length, @target.length), :x => Matrix.new(@query.length, @target.length), :y => Matrix.new(@query.length, @target.length), } @matrix[:m].set(0, 0, 0) @matrix[:x].set(0, 0, -Inf) @matrix[:y].set(0, 0, -Inf) 1.upto(@target.length) do |i| @matrix[:m].set(i, 0, -Inf) @matrix[:y].set(i, 0, -Inf) value = - @gap_penalty - @ext_penalty * (i-1) @matrix[:x].set(i, 0, value) end 1.upto(@query.length) do |j| @matrix[:m].set(0, j, -Inf) @matrix[:x].set(0, j, -Inf) value = - @gap_penalty - @ext_penalty * (j-1) @matrix[:y].set(0, j, value) end end def calc_matrix 1.upto(@target.length) do |i| 1.upto(@query.length) do |j| s_value = s(@target[i], @query[j]) value_m = [ @matrix[:m].get(i-1, j-1) + s_value, @matrix[:x].get(i-1, j-1) + s_value, @matrix[:y].get(i-1, j-1) + s_value, ].max @matrix[:m].set(i, j, value_m) value_x = [ @matrix[:m].get(i-1, j) - @gap_penalty, @matrix[:x].get(i-1, j) - @ext_penalty, ].max @matrix[:x].set(i, j, value_x) value_y = [ @matrix[:m].get(i, j-1) - @gap_penalty, @matrix[:y].get(i, j-1) - @ext_penalty, ].max @matrix[:y].set(i, j, value_y) end end end def trace_back i = @target.length j = @query.length score = { :m => @matrix[:m].get(i, j), :x => @matrix[:x].get(i, j), :y => @matrix[:y].get(i, j), } cur = score.max{|a,b| a[1] <=> b[1]}.first target_trace = "" query_trace = "" while i > 0 or j > 0 score = @matrix[cur].get(i, j) s_value = s(@target[i], @query[j]) case cur when :m case score when @matrix[:m].get(i-1, j-1) + s_value score = @matrix[:m].get(i-1, j-1) cur = :m when @matrix[:x].get(i-1, j-1) + s_value score = @matrix[:x].get(i-1, j-1) cur = :x when @matrix[:y].get(i-1, j-1) + s_value score = @matrix[:y].get(i-1, j-1) cur = :y end target_trace += @target[i] query_trace += @query[j] i -= 1 j -= 1 when :x case score when @matrix[:m].get(i-1, j) - @gap_penalty score = @matrix[:m].get(i-1, j) cur = :m when @matrix[:x].get(i-1, j) - @ext_penalty score = @matrix[:x].get(i-1, j) cur = :x end target_trace += @target[i] query_trace += '-' i -= 1 when :y case score when @matrix[:m].get(i, j-1) - @gap_penalty score = @matrix[:m].get(i, j-1) cur = :m when @matrix[:y].get(i, j-1) - @ext_penalty score = @matrix[:y].get(i, j-1) cur = :y end target_trace += '-' query_trace += @query[j] j -= 1 end end @target_align = target_trace.reverse @query_align = query_trace.reverse end end if __FILE__ == \$0 target = ARGV.shift || 'ACCAGT' query = ARGV.shift || 'ACAGC' puts "# Needleman-Wunsch" nw = NeedlemanWunsch.new(target, query) nw.dp puts "# Smith-Waterman" sw = SmithWaterman.new(target, query) sw.dp puts "# Needleman-Wunsch-Gotoh" nwg = NeedlemanWunschGotoh.new(target, query) nwg.dp end

### ktym commented Jun 30, 2019

 ``````# Needleman-Wunsch Target: ACCAGT Query: ACAGC # # Target: ACCAGT Query: A-CAGC # Smith-Waterman Target: ACCAGT Query: ACAGC # # Target: CAG Query: CAG # Needleman-Wunsch-Gotoh Target: ACCAGT Query: ACAGC {:m=> #, :x=> #, :y=> #} {:m=> #, :x=> #, :y=> #} Target: ACCAGT Query: -ACAGC ``````

Bug fixed:)

# Needleman-Wunsch

Target: ACCAGT
Query: ACAGC
#<DP::Matrix:0x00007fb6cd885ff0
@matrix=
[[0, -2, -4, -6, -8, -10, -12],
[-2, 1, -1, -3, -5, -7, -9],
[-4, -1, 2, 0, -2, -4, -6],
[-6, -3, 0, 1, 1, -1, -3],
[-8, -5, -2, -1, 0, 2, 0],
[-10, -7, -4, -1, -2, 0, 1]]>
Target: ACCAGT
Query: AC-AGC

# Smith-Waterman

Target: ACCAGT
Query: ACAGC
#<DP::Matrix:0x00007fb6cd110d10
@matrix=
[[0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 1, 0, 0],
[0, 0, 2, 1, 0, 0, 0],
[0, 1, 0, 1, 2, 0, 0],
[0, 0, 0, 0, 0, 3, 1],
[0, 0, 1, 1, 0, 1, 2]]>
Target: CAG
Query: CAG

# Needleman-Wunsch-Gotoh

Target: ACCAGT
Query: ACAGC
{:m=>
#<DP::Matrix:0x00007fb6cd1366a0
@matrix=
[[0, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity],
[-Infinity, 1, -3, -4, -3, -6, -7],
[-Infinity, -3, 2, 0, -3, -4, -5],
[-Infinity, -2, -2, 1, 1, -2, -3],
[-Infinity, -5, -3, -1, 0, 2, -2],
[-Infinity, -6, -2, 0, -2, -1, 1]]>,
:x=>
#<DP::Matrix:0x00007fb6cd1361a0
@matrix=
[[-Infinity, -2, -3, -4, -5, -6, -7],
[-Infinity, -Infinity, -1, -2, -3, -4, -5],
[-Infinity, -Infinity, -5, 0, -1, -2, -3],
[-Infinity, -Infinity, -4, -4, -1, -1, -2],
[-Infinity, -Infinity, -7, -5, -3, -2, 0],
[-Infinity, -Infinity, -8, -4, -2, -3, -3]]>,
:y=>
#<DP::Matrix:0x00007fb6cd135cc8
@matrix=
[[-Infinity,
-Infinity,
-Infinity,
-Infinity,
-Infinity,
-Infinity,
-Infinity],
[-2, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity, -Infinity],
[-3, -1, -5, -6, -5, -8, -9],
[-4, -2, 0, -2, -5, -6, -7],
[-5, -3, -1, -1, -1, -4, -5],
[-6, -4, -2, -2, -2, 0, -4]]>}
Target: ACCAGT
Query: A-CAGC

### ktym commented Jul 11, 2019

 Moved to https://github.com/ktym/dp