Created
November 20, 2012 21:52
-
-
Save avrilcoghlan/4121463 to your computer and use it in GitHub Desktop.
Perl script to get overlap alignment
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/perl -w | |
# Avril Coghlan | |
# 20-Nov-2012 | |
# Algorithm on page 204 of 'Computational Genome Analysis' by Deonier et al | |
# define the two sequences | |
$seqA = "AGGCTAAA"; | |
$seqB = "CAAACGTCT"; | |
print "Sequences:\n"; | |
print "A: $seqA\n"; | |
print "B: $seqB\n\n"; | |
# define the scores for match, mismatch and indel: | |
$match_score = 2; | |
$mismatch_score = -1; | |
$gapopen_score = 0; | |
$gapextend_score= -2; | |
# find the overlap alignment between $seqA and $seqB: | |
&find_overlap_alignment($seqA,$seqB); | |
sub find_overlap_alignment | |
{ | |
my $seqA = $_[0]; # first sequence | |
my $seqB = $_[1]; # second sequence | |
my $len_seqA; # length of $seqA | |
my $len_seqB; # length of $seqB | |
my @array; # array for storing the dynamic programming matrix | |
my @traceback; # array for storing the traceback | |
# find the length of the two sequences: | |
$len_seqA = length($seqA); | |
$len_seqB = length($seqB); | |
# define a two-dimensional matrix @array with $len_seqA + 1 rows, and $len_seqB + 1 | |
# columns, and initialise it with zeroes | |
for $x (1 .. ($len_seqA + 1)) | |
{ # For each row... | |
for $y (1 .. ($len_seqB + 1)) | |
{ # For each column... | |
$array[$x-1][$y-1] = 0; # we need to use indices x-1, y-1 as perl indices arrays from 0,1,2... | |
} | |
} | |
# define a two-dimensional matrix @traceback with $len_seqA + 1 rows, and $len_seqB + 1 | |
# columns, and initialise it with zeroes: | |
for $x (1 .. $len_seqA + 1) | |
{ # For each row... | |
for $y (1 .. $len_seqB + 1) | |
{ # For each column... | |
$traceback[$x-1][$y-1] = 0; # we need to use indices x-1, y-1 as perl indices arrays from 0,1,2... | |
} | |
} | |
# put "."s in the first row of matrix @traceback: | |
for $y (1 .. $len_seqB + 1) | |
{ # For each column... | |
$traceback[0][$y-1] = "."; | |
} | |
# put "."s in the first column of matrix @traceback: | |
for $x (1 .. $len_seqA + 1) | |
{ # For each row... | |
$traceback[$x-1][0] = "."; | |
} | |
# do the dynamic programming recursion | |
for $x (2 .. $len_seqA + 1) | |
{ # For each row... | |
# find the letter at position $x-1 in $seqA (seqA goes down the first column): | |
$seqA_letter = substr($seqA,($x-2),1); | |
for $y (2 .. $len_seqB + 1) | |
{ # For each column... | |
# find the letter at position $y-1 in $seqB (seqB goes across the first row): | |
$seqB_letter = substr($seqB,($y-2),1); | |
# find the value to put into $array[$x-1][$y-1]: | |
# (i) the first possibility is to take the diagonal element plus a match/mismatch score: | |
if ($seqA_letter eq $seqB_letter) { $score = $match_score; } | |
else { $score = $mismatch_score;} | |
$diag = $array[$x-2][$y-2] + $score; | |
# (ii) the second possibility is to take the element above plus a gap score: | |
if ($traceback[$x-2][$y-1] eq "|") { $up = $array[$x-2][$y-1] + $gapextend_score; } | |
else { $up = $array[$x-2][$y-1] + $gapextend_score + $gapopen_score; } | |
# (iii) the third possibility is to take the element on the left plus a gap score: | |
if ($traceback[$x-1][$y-2] eq "-") { $left = $array[$x-1][$y-2] + $gapextend_score; } | |
else { $left = $array[$x-1][$y-2] + $gapextend_score + $gapopen_score;} | |
# record which of the three possibilities was highest, in @traceback: | |
if ($diag > $up && $diag > $left) { $traceback[$x-1][$y-1] = ">"; $max = $diag;} | |
elsif ($up > $diag && $up > $left) { $traceback[$x-1][$y-1] = "|"; $max = $up; } | |
elsif ($left > $diag && $left > $up) { $traceback[$x-1][$y-1] = "-"; $max = $left;} | |
elsif ($left == $diag && $up == $diag) { $traceback[$x-1][$y-1] = "*"; $max = $diag;} | |
elsif ($up == $left && $up > $diag && $left > $diag) { $traceback[$x-1][$y-1] = "L"; $max = $up; } | |
elsif ($up == $diag && $up > $left && $diag > $left) { $traceback[$x-1][$y-1] = "V"; $max = $up; } | |
elsif ($diag == $left && $diag > $up && $left > $up) { $traceback[$x-1][$y-1] = "Z"; $max = $diag;} | |
# record the highest score in @array: | |
$array[$x-1][$y-1] = $max; | |
} | |
} | |
# the best overlap is the given by the largest number in the right-most column or bottom-most | |
# row of the matrix @matrix: | |
$best_x = "-1"; | |
$best_y = "-1"; | |
$best_value = "-10000000000000"; | |
for $x (1 .. $len_seqA + 1) | |
{ # For each row... | |
$value = $array[$x-1][$len_seqB]; | |
if ($value > $best_value) { $best_x = $x-1; $best_y = $len_seqB; $best_value = $value;} | |
} | |
for $y (1 .. $len_seqB + 1) | |
{ # For each column... | |
$value = $array[$len_seqA][$y-1]; | |
if ($value > $best_value) { $best_x = $len_seqA; $best_y = $y-1; $best_value = $value;} | |
} | |
# record the traceback as '#': | |
$seqA_aln = ""; | |
$seqB_aln = ""; | |
$tracebackvalue = $traceback[$best_x][$best_y]; | |
while($tracebackvalue ne ".") | |
{ | |
$traceback[$best_x][$best_y] = "#"; | |
if ($tracebackvalue eq '>') | |
{ | |
$seqA_letter = substr($seqA,$best_x-1,1); | |
$seqB_letter = substr($seqB,$best_y-1,1); | |
$best_x = $best_x - 1; | |
$best_y = $best_y - 1; | |
} | |
elsif ($tracebackvalue eq '-') | |
{ | |
$seqA_letter = "-"; | |
$seqB_letter = substr($seqB,$best_y-1,1); | |
$best_y = $best_y - 1; | |
} | |
elsif ($tracebackvalue eq '|') | |
{ | |
$seqA_letter = substr($seqA,$best_x-1,1); | |
$seqB_letter = "-"; | |
$best_x = $best_x - 1; | |
} | |
else { print STDERR "ERROR: tracebackvalue $tracebackvalue: cannot deal with branches in traceback\n"; exit; } | |
$seqA_aln = $seqA_letter.$seqA_aln; | |
$seqB_aln = $seqB_letter.$seqB_aln; | |
# find the new traceback value: | |
$tracebackvalue = $traceback[$best_x][$best_y]; | |
} | |
# put $seqB in the first row of matrix @matrix: | |
for $y (1 .. $len_seqB + 1) | |
{ # For each column... | |
# find the letter at position $y-1 in $seqB: | |
if ($y > 1) | |
{ | |
$seqB_letter = substr($seqB,($y-2),1); | |
$array[0][$y-1] = $seqB_letter; | |
} | |
} | |
# put $seqA in the first column of matrix @matrix: | |
for $x (1 .. $len_seqA + 1) | |
{ # For each row... | |
# find the letter at position $x-1 in $seqA: | |
if ($x > 1) | |
{ | |
$seqA_letter = substr($seqA,($x-2),1); | |
$array[$x-1][0] = $seqA_letter; | |
} | |
} | |
# print out the result: | |
print "Overlap matrix, with traceback shown as #:\n"; | |
for $x (1 .. $len_seqA + 1) | |
{ # For each row... | |
for $y (1 .. $len_seqB + 1) | |
{ # For each column... | |
$arrayvalue = $array[$x-1][$y-1]; | |
$tracebackvalue = $traceback[$x-1][$y-1]; | |
print "$arrayvalue $tracebackvalue\t"; | |
} | |
print "\n"; | |
} | |
print "\n"; | |
# print out the overlap alignment: | |
print "Overlap alignment (score = $best_value):\n"; | |
print "A: $seqA_aln\n"; | |
print "B: $seqB_aln\n"; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment