Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created November 20, 2012 21:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/4121463 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/4121463 to your computer and use it in GitHub Desktop.
Perl script to get overlap alignment
#! /usr/bin/perl -w
# Avril Coghlan
# 20-Nov-2012
# Algorithm on page 204 of 'Computational Genome Analysis' by Deonier et al
# define the two sequences
$seqA = "AGGCTAAA";
$seqB = "CAAACGTCT";
print "Sequences:\n";
print "A: $seqA\n";
print "B: $seqB\n\n";
# define the scores for match, mismatch and indel:
$match_score = 2;
$mismatch_score = -1;
$gapopen_score = 0;
$gapextend_score= -2;
# find the overlap alignment between $seqA and $seqB:
&find_overlap_alignment($seqA,$seqB);
sub find_overlap_alignment
{
my $seqA = $_[0]; # first sequence
my $seqB = $_[1]; # second sequence
my $len_seqA; # length of $seqA
my $len_seqB; # length of $seqB
my @array; # array for storing the dynamic programming matrix
my @traceback; # array for storing the traceback
# find the length of the two sequences:
$len_seqA = length($seqA);
$len_seqB = length($seqB);
# define a two-dimensional matrix @array with $len_seqA + 1 rows, and $len_seqB + 1
# columns, and initialise it with zeroes
for $x (1 .. ($len_seqA + 1))
{ # For each row...
for $y (1 .. ($len_seqB + 1))
{ # For each column...
$array[$x-1][$y-1] = 0; # we need to use indices x-1, y-1 as perl indices arrays from 0,1,2...
}
}
# define a two-dimensional matrix @traceback with $len_seqA + 1 rows, and $len_seqB + 1
# columns, and initialise it with zeroes:
for $x (1 .. $len_seqA + 1)
{ # For each row...
for $y (1 .. $len_seqB + 1)
{ # For each column...
$traceback[$x-1][$y-1] = 0; # we need to use indices x-1, y-1 as perl indices arrays from 0,1,2...
}
}
# put "."s in the first row of matrix @traceback:
for $y (1 .. $len_seqB + 1)
{ # For each column...
$traceback[0][$y-1] = ".";
}
# put "."s in the first column of matrix @traceback:
for $x (1 .. $len_seqA + 1)
{ # For each row...
$traceback[$x-1][0] = ".";
}
# do the dynamic programming recursion
for $x (2 .. $len_seqA + 1)
{ # For each row...
# find the letter at position $x-1 in $seqA (seqA goes down the first column):
$seqA_letter = substr($seqA,($x-2),1);
for $y (2 .. $len_seqB + 1)
{ # For each column...
# find the letter at position $y-1 in $seqB (seqB goes across the first row):
$seqB_letter = substr($seqB,($y-2),1);
# find the value to put into $array[$x-1][$y-1]:
# (i) the first possibility is to take the diagonal element plus a match/mismatch score:
if ($seqA_letter eq $seqB_letter) { $score = $match_score; }
else { $score = $mismatch_score;}
$diag = $array[$x-2][$y-2] + $score;
# (ii) the second possibility is to take the element above plus a gap score:
if ($traceback[$x-2][$y-1] eq "|") { $up = $array[$x-2][$y-1] + $gapextend_score; }
else { $up = $array[$x-2][$y-1] + $gapextend_score + $gapopen_score; }
# (iii) the third possibility is to take the element on the left plus a gap score:
if ($traceback[$x-1][$y-2] eq "-") { $left = $array[$x-1][$y-2] + $gapextend_score; }
else { $left = $array[$x-1][$y-2] + $gapextend_score + $gapopen_score;}
# record which of the three possibilities was highest, in @traceback:
if ($diag > $up && $diag > $left) { $traceback[$x-1][$y-1] = ">"; $max = $diag;}
elsif ($up > $diag && $up > $left) { $traceback[$x-1][$y-1] = "|"; $max = $up; }
elsif ($left > $diag && $left > $up) { $traceback[$x-1][$y-1] = "-"; $max = $left;}
elsif ($left == $diag && $up == $diag) { $traceback[$x-1][$y-1] = "*"; $max = $diag;}
elsif ($up == $left && $up > $diag && $left > $diag) { $traceback[$x-1][$y-1] = "L"; $max = $up; }
elsif ($up == $diag && $up > $left && $diag > $left) { $traceback[$x-1][$y-1] = "V"; $max = $up; }
elsif ($diag == $left && $diag > $up && $left > $up) { $traceback[$x-1][$y-1] = "Z"; $max = $diag;}
# record the highest score in @array:
$array[$x-1][$y-1] = $max;
}
}
# the best overlap is the given by the largest number in the right-most column or bottom-most
# row of the matrix @matrix:
$best_x = "-1";
$best_y = "-1";
$best_value = "-10000000000000";
for $x (1 .. $len_seqA + 1)
{ # For each row...
$value = $array[$x-1][$len_seqB];
if ($value > $best_value) { $best_x = $x-1; $best_y = $len_seqB; $best_value = $value;}
}
for $y (1 .. $len_seqB + 1)
{ # For each column...
$value = $array[$len_seqA][$y-1];
if ($value > $best_value) { $best_x = $len_seqA; $best_y = $y-1; $best_value = $value;}
}
# record the traceback as '#':
$seqA_aln = "";
$seqB_aln = "";
$tracebackvalue = $traceback[$best_x][$best_y];
while($tracebackvalue ne ".")
{
$traceback[$best_x][$best_y] = "#";
if ($tracebackvalue eq '>')
{
$seqA_letter = substr($seqA,$best_x-1,1);
$seqB_letter = substr($seqB,$best_y-1,1);
$best_x = $best_x - 1;
$best_y = $best_y - 1;
}
elsif ($tracebackvalue eq '-')
{
$seqA_letter = "-";
$seqB_letter = substr($seqB,$best_y-1,1);
$best_y = $best_y - 1;
}
elsif ($tracebackvalue eq '|')
{
$seqA_letter = substr($seqA,$best_x-1,1);
$seqB_letter = "-";
$best_x = $best_x - 1;
}
else { print STDERR "ERROR: tracebackvalue $tracebackvalue: cannot deal with branches in traceback\n"; exit; }
$seqA_aln = $seqA_letter.$seqA_aln;
$seqB_aln = $seqB_letter.$seqB_aln;
# find the new traceback value:
$tracebackvalue = $traceback[$best_x][$best_y];
}
# put $seqB in the first row of matrix @matrix:
for $y (1 .. $len_seqB + 1)
{ # For each column...
# find the letter at position $y-1 in $seqB:
if ($y > 1)
{
$seqB_letter = substr($seqB,($y-2),1);
$array[0][$y-1] = $seqB_letter;
}
}
# put $seqA in the first column of matrix @matrix:
for $x (1 .. $len_seqA + 1)
{ # For each row...
# find the letter at position $x-1 in $seqA:
if ($x > 1)
{
$seqA_letter = substr($seqA,($x-2),1);
$array[$x-1][0] = $seqA_letter;
}
}
# print out the result:
print "Overlap matrix, with traceback shown as #:\n";
for $x (1 .. $len_seqA + 1)
{ # For each row...
for $y (1 .. $len_seqB + 1)
{ # For each column...
$arrayvalue = $array[$x-1][$y-1];
$tracebackvalue = $traceback[$x-1][$y-1];
print "$arrayvalue $tracebackvalue\t";
}
print "\n";
}
print "\n";
# print out the overlap alignment:
print "Overlap alignment (score = $best_value):\n";
print "A: $seqA_aln\n";
print "B: $seqB_aln\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment