avrilcoghlan/map_introns_to_HMM.pl

## map_introns_to_HMM.pl
#!/usr/local/bin/perl

#
# Perl script map_introns_to_HMM.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 26-Apr-07.
#
# For the TreeFam project.
#
# This perl script reads in an alignment file corresponding
# to a HMM:
# (i) gets the positions of introns in each gene, by
#     reading them from the TreeFam mysql database,
# (ii) figures the positions of introns in the alignment,
# (iii) figures out the positions of introns in the HMM,
#     by using the mapping file produced by hmmbuild.
#
# The command-line format is:
# % perl <map_introns_to_HMM.pl> aln map output
# where aln is the input alignment file,
#       map is the mapping file produced by hmmbuild,
#       output is the output file name.
#
#------------------------------------------------------------------#

# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

$num_args                  = $#ARGV + 1;
if ($num_args != 3)
{
   print "Usage of map_introns_to_HMM.pl\n\n";
   print "perl map_introns_to_HMM.pl <aln> <map> <output>\n";
   print "where <aln> is the input alignment file,\n";
   print "      <map> is the mapping file produced by hmmbuild,\n";
   print "      <output> is the output file name.\n";
   print "For example, >perl map_introns_to_HMM.pl new_TF101002_fam.aln.pep\n";
   print "new_TF101002_fam.aln.pep.map new_TF101002_fam.introns\n";
   exit;
}

# READ IN MY PERL MODULES:

BEGIN {
    unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}

use Avril_modules;
use DBI;

# FIND THE NAME OF THE ALIGNMENT FILE:

$aln                       = $ARGV[0];

# FIND THE NAME OF THE MAPPING FILE:

$map                       = $ARGV[1];

# FIND THE OUTPUT FILE NAME:

$output                    = $ARGV[2];
open(OUTPUT,">$output") || die "ERROR: cannot open $output.\n";

$VERBOSE                   = 0;

#------------------------------------------------------------------#

# READ IN THE INPUT ALIGNMENT FILE, TO GET THE NAMES OF THE PROTEINS:

($genes,$ALN_POS,$no_genes)= &read_aln($aln);

# READ IN THE MAPPING OF THE ALIGNMENT TO THE HMM:

$HMM_POS                   = &read_map($map);

# GET THE TREEFAM IDs FOR THESE GENES:

$TREEFAM_ID                = &get_treefam_ids($genes);

# GET THE POSITIONS OF INTRONS IN EACH OF THE GENES:

$all_introns               = &get_intron_positions($genes,$TREEFAM_ID,$ALN_POS,$HMM_POS);

# PRINT OUT THE INTRON POSITION FILE:

&print_intron_file($all_introns,$no_genes);
close(OUTPUT);

#------------------------------------------------------------------#

print STDERR "FINISHED.\n";

#------------------------------------------------------------------#

# SORT INTRONS BY POSITION IN THE HMM:

sub by_pos
{
   $POS{$a} <=> $POS{$b}
      or
   $a cmp $b;
}

#------------------------------------------------------------------#

# PRINT OUT THE INTRON POSITION FILE:

sub print_intron_file
{
   my $all_introns         = $_[0];
   my $no_genes            = $_[1];
   my @introns;
   my $intron;
   my $i;
   my @temp;
   my $pos;
   my $phase;
   %POS                    = ();
   my %SEEN                = ();
   my %COUNT               = ();
   my $count;
   my @introns2;
   my $fraction;
   my $last_intron;
   my $last_pos;
   my $intron0;
   my $intron1;
   my $intron2;
   my $count0;
   my $count1;
   my $count2;
   my $frac0;
   my $frac1;
   my $frac2;

   if ($no_genes <= 0) { print STDERR "ERROR: no_genes $no_genes.\n"; exit;}

   print OUTPUT "# intron position file\n";
   print OUTPUT "# columns: position phase 0 phase 1 phase 2\n";
   if ($all_introns ne '')
   {
      $all_introns         = substr($all_introns,1,length($all_introns)-1);
      @introns             = split(/\,/,$all_introns);
      for ($i = 0; $i <= $#introns; $i++)
      {
         $intron           = $introns[$i];
         @temp             = split(/_/,$intron);
         $pos              = $temp[0]; # POSITION OF INTRON WRT HMMER HMM.
         $POS{$intron}     = $pos;
         if (!($SEEN{$intron}))
         {
            $SEEN{$intron} = 1;
            $COUNT{$intron} = 1;
            @introns2      = (@introns2,$intron);
         }
         else
         {
            $COUNT{$intron}++;
         }
      }
      @introns2            = sort by_pos @introns2;
      $last_intron         = $introns2[$#introns2];
      @temp                = split(/_/,$last_intron);
      $last_pos            = $temp[0];
      # PRINT OUT THE INTRONS AT EACH POSITION IN THE HMM:
      for ($i = 1; $i <= $last_pos; $i++)
      {
         $intron0          = $i."_0";
         $intron1          = $i."_1";
         $intron2          = $i."_2";
         if ($COUNT{$intron0} || $COUNT{$intron1} || $COUNT{$intron2})
         {
            if ($COUNT{$intron0}) { $count0 = $COUNT{$intron0};} else { $count0 = 0;}
            if ($COUNT{$intron1}) { $count1 = $COUNT{$intron1};} else { $count1 = 0;}
            if ($COUNT{$intron2}) { $count2 = $COUNT{$intron2};} else { $count2 = 0;}
            $frac0         = $count0/$no_genes;
            $frac1         = $count1/$no_genes;
            $frac2         = $count2/$no_genes;
            print OUTPUT "$i $frac0 $frac1 $frac2\n";
         }
      }
   }
   else
   {
      print OUTPUT "no introns\n";
   }
}

#------------------------------------------------------------------#


# READ IN THE MAPPING OF THE ALIGNMENT TO THE HMM:

sub read_map
{
   my $map                 = $_[0];
   my $line;
   my @temp;
   my $aln;
   my $length;
   my $i;
   my $char;
   my $aln_pos             = 0;
   my $hmm_pos             = 0;
   my %HMM_POS             = ();

   open(MAP,"$map") || die "ERROR: cannot open $map.\n";
   while(<MAP>)
   {
      $line                = $_;
      chomp $line;
      if ($line =~ /#=GC RF/)
      {
         @temp             = split(/\s+/,$line);
         $aln              = $temp[2];
         $length           = length($aln);
         for ($i = 0; $i < $length; $i++)
         {
            $char          = substr($aln,$i,1);
            $aln_pos++;
            if ($char eq 'x')
            {
               $hmm_pos++;
            }
            # POSITION $aln_pos IN THE ALIGNMENT MAPS TO POSITION $hmm_pos IN THE HMM:
            if ($HMM_POS{$aln_pos}) { print STDERR "ERROR: already have hmm_pos for $aln_pos.\n"; exit;}
            $HMM_POS{$aln_pos} = $hmm_pos;
         }
      }
   }
   close(MAP);

   return(\%HMM_POS);
}

#------------------------------------------------------------------#

# GET THE TREEFAM IDs FOR THE GENES:

sub get_treefam_ids
{
   my $genes               = $_[0];
   my %ID                  = ();
   my $i;
   my $gene;
   my $dbh;
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my $id;
   my @array;
   my $rc;

   # FIND THE ID OF EACH GENE IN THE TREEFAM MYSQL DATABASE:
   $dbh                    = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return;
   # SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS:
   $table_w                = 'genes';
   for ($i = 0; $i < @$genes; $i++)
   {
      $gene                = $genes->[$i];
      $st                  = "SELECT ID from $table_w WHERE GID=?";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)
         {
            $id            = $array[0];  # eg., ENSMUST00000049178.2
            $ID{$gene}     = $id;
         }
      }
      if (!($ID{$gene}))
      {
         $st               = "SELECT ID from $table_w WHERE TID=?";
         $sth              = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
         $rv               = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr";
         if ($rv >= 1)
         {
            while ((@array) = $sth->fetchrow_array)
            {
               $id         = $array[0];  # eg., ENSMUST00000049178.2
               $ID{$gene}  = $id;
            }
         }
      }
   }

   $rc                     = $dbh->disconnect();
   $rc                     = "";

   return(\%ID);
}

#------------------------------------------------------------------#

# GET THE POSITIONS OF INTRONS IN EACH OF THE GENES:

sub get_intron_positions
{
   my $genes               = $_[0];
   my $TREEFAM_ID          = $_[1];
   my $ALN_POS             = $_[2];
   my $HMM_POS             = $_[3];
   my $i;
   my $gene;
   my $dbh;
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my $id;
   my @array;
   my $rc;
   my $strand;
   my $start;
   my $stop;
   my $introns;
   my $all_introns         = "";

   # FIND THE POSITION OF EACH GENE IN THE TREEFAM MYSQL DATABASE:
   $dbh                    = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return;
   # SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS:
   $table_w                = 'map';
   for ($i = 0; $i < @$genes; $i++)
   {
      $gene                = $genes->[$i];
      if (!($TREEFAM_ID->{$gene})) { print STDERR "ERROR: do not know id for gene $gene.\n"; exit;}
      $id                  = $TREEFAM_ID->{$gene};
      $st                  = "SELECT STRAND,START,STOP from $table_w WHERE ID=?";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute($id) or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)
         {
            $strand           = $array[0];  # eg., +
            $start            = $array[1];  # eg., 144225,144806,145261,145497,
            $stop             = $array[2];  # eg., 144318,145216,145444,145804,
            # FIND THE INTRONS IN THIS GENE:
            $introns          = &infer_intron_positions($strand,$start,$stop,$gene,$ALN_POS,$HMM_POS);
            if ($introns ne "") { $all_introns = $all_introns.",".$introns;}
         }
      }
   }
   $rc                     = $dbh->disconnect();
   $rc                     = "";

   return($all_introns);
}

#------------------------------------------------------------------#

# INFER THE INTRON POSITIONS IN A GENE:

sub infer_intron_positions
{
   my $strand              = $_[0];
   my $start               = $_[1]; # 144225,144806,145261,145497,
   my $stop                = $_[2]; # 144318,145216,145444,145804,
   my $gene                = $_[3];
   my $ALN_POS             = $_[4];
   my $HMM_POS             = $_[5];
   my @start               = split(/\,/,$start);
   my @stop                = split(/\,/,$stop);
   my $i;
   my $exon_start;
   my $exon_stop;
   my $coding_length;
   my $hangover;
   my $intron_phase;
   my $prev_aa_pos         = 0;
   my @temp;
   my $aln_pos;
   my $hmm_pos;
   my $intron;
   my $introns             = "";

   if ($#start != $#stop) { print STDERR "ERROR: start $#start stop $#stop.\n"; exit;}

   if ($VERBOSE == 1) { print "$gene-------------------------------------------------------\n";}
   if    ($strand eq '+')
   {
      $coding_length       = 0;
      for ($i = 0; $i <= $#start; $i++)
      {
         # FIND THE COORDINATES OF EXON $i:
         $exon_start       = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB.
         $exon_stop        = $stop[$i];
         # CALCULATE THE CODING LENGTH SO FAR:
         $coding_length    = $coding_length + ($exon_stop - $exon_start + 1);
         # WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID:
         $prev_aa_pos      = $coding_length/3;
         @temp             = split(/\./,$prev_aa_pos);
         $prev_aa_pos      = $temp[0];
         # LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS:
         $hangover         = $coding_length % 3;
         if    ($hangover == 0) { $intron_phase = 0;}
         elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1).
         elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2).
         if ($i != $#start)
         {
            # FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT:
            if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;}
            $aln_pos       = $ALN_POS->{$gene."_".$prev_aa_pos};
            if ($HMM_POS->{$aln_pos}) # NOTE THAT THERE WILL NOT BE A HMM POS IF THE INTRON IS BEFORE THE HMM STARTS.
            {
               $hmm_pos       = $HMM_POS->{$aln_pos};
               $intron        = $hmm_pos."_".$intron_phase;
               $introns       = $introns.",".$intron;
               if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos hmm_pos $hmm_pos intron_phase $intron_phase\n"; }
            }
         }
      }
   }
   elsif ($strand eq '-')
   {
      $coding_length       = 0;
      for ($i = $#start; $i >= 0; $i--)
      {
         # FIND THE COORDINATES OF EXON $i:
         $exon_start       = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB.
         $exon_stop        = $stop[$i];
         # CALCULATE THE CODING LENGTH SO FAR:
         $coding_length    = $coding_length + ($exon_stop - $exon_start + 1);
         # WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID:
         $prev_aa_pos      = $coding_length/3;
         @temp             = split(/\./,$prev_aa_pos);
         $prev_aa_pos      = $temp[0];
         # LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS:
         $hangover         = $coding_length % 3;
         if    ($hangover == 0) { $intron_phase = 0;}
         elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1).
         elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2).
         if ($i != 0)
         {
            # FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT:
            if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;}
            $aln_pos       = $ALN_POS->{$gene."_".$prev_aa_pos};
            if ($HMM_POS->{$aln_pos}) # NOTE THAT THERE WILL NOT BE A HMM POS IF THE INTRON IS BEFORE THE HMM STARTS.
            {
               $hmm_pos       = $HMM_POS->{$aln_pos};
               $intron        = $hmm_pos."_".$intron_phase;
               $introns       = $introns.",".$intron;
               if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos hmm_pos $hmm_pos intron_phase $intron_phase\n";}
            }
         }
      }
   }
   else
   {
      print STDERR "ERROR: strand $strand.\n"; exit;
   }

   if ($introns ne '') { $introns = substr($introns,1,length($introns)-1);}

   return($introns);
}

#------------------------------------------------------------------#

# READ IN THE INPUT ALIGNMENT FILE, TO GET THE NAMES OF THE PROTEINS:

sub read_aln
{
   my $aln                 = $_[0];
   my $line;
   my @temp;
   my $gene;
   my @genes;
   my $length;
   my $i;
   my $char;
   my $prot_pos;
   my $aln_pos;
   my %ALN_POS             = ();
   my $no_genes            = 0;

   open(ALN,"$aln") || die "ERROR: cannot open $aln.\n";
   while(<ALN>)
   {
      $line                = $_;
      chomp $line;
      if (substr($line,0,1) eq '>')
      {
         @temp             = split(/\s+/,$line);
         $gene             = $temp[0];
         $gene             = substr($gene,1,length($gene)-1);
         @genes            = (@genes,$gene);
         $no_genes++;
         $prot_pos         = 0;
         $aln_pos          = 0;
         print "xxx gene $gene\n";
      }
      else
      {
         $length           = length($line);
         for ($i = 0; $i < $length; $i++)
         {
            $char          = substr($line,$i,1);
            $aln_pos++;
            if ($char ne '-')
            {
               $prot_pos++;
               if ($ALN_POS{$gene."_".$prot_pos}) { print STDERR "ERROR: already have aln pos for $gene $prot_pos.\n"; exit;}
               $ALN_POS{$gene."_".$prot_pos} = $aln_pos;
            }
            print "xxx gene $gene aln_pos $aln_pos prot_pos $prot_pos\n";
         }
      }
   }
   close(ALN);

   return(\@genes,\%ALN_POS,$no_genes);
}

#------------------------------------------------------------------#
	#!/usr/local/bin/perl

	#
	# Perl script map_introns_to_HMM.pl
	# Written by Avril Coghlan (alc@sanger.ac.uk)
	# 26-Apr-07.
	#
	# For the TreeFam project.
	#
	# This perl script reads in an alignment file corresponding
	# to a HMM:
	# (i) gets the positions of introns in each gene, by
	# reading them from the TreeFam mysql database,
	# (ii) figures the positions of introns in the alignment,
	# (iii) figures out the positions of introns in the HMM,
	# by using the mapping file produced by hmmbuild.
	#
	# The command-line format is:
	# % perl <map_introns_to_HMM.pl> aln map output
	# where aln is the input alignment file,
	# map is the mapping file produced by hmmbuild,
	# output is the output file name.
	#
	#------------------------------------------------------------------#

	# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

	$num_args = $#ARGV + 1;
	if ($num_args != 3)
	{
	print "Usage of map_introns_to_HMM.pl\n\n";
	print "perl map_introns_to_HMM.pl <aln> <map> <output>\n";
	print "where <aln> is the input alignment file,\n";
	print " <map> is the mapping file produced by hmmbuild,\n";
	print " <output> is the output file name.\n";
	print "For example, >perl map_introns_to_HMM.pl new_TF101002_fam.aln.pep\n";
	print "new_TF101002_fam.aln.pep.map new_TF101002_fam.introns\n";
	exit;
	}

	# READ IN MY PERL MODULES:

	BEGIN {
	unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
	}

	use Avril_modules;
	use DBI;

	# FIND THE NAME OF THE ALIGNMENT FILE:

	$aln = $ARGV[0];

	# FIND THE NAME OF THE MAPPING FILE:

	$map = $ARGV[1];

	# FIND THE OUTPUT FILE NAME:

	$output = $ARGV[2];
	open(OUTPUT,">$output") \|\| die "ERROR: cannot open $output.\n";

	$VERBOSE = 0;

	#------------------------------------------------------------------#

	# READ IN THE INPUT ALIGNMENT FILE, TO GET THE NAMES OF THE PROTEINS:

	($genes,$ALN_POS,$no_genes)= &read_aln($aln);

	# READ IN THE MAPPING OF THE ALIGNMENT TO THE HMM:

	$HMM_POS = &read_map($map);

	# GET THE TREEFAM IDs FOR THESE GENES:

	$TREEFAM_ID = &get_treefam_ids($genes);

	# GET THE POSITIONS OF INTRONS IN EACH OF THE GENES:

	$all_introns = &get_intron_positions($genes,$TREEFAM_ID,$ALN_POS,$HMM_POS);

	# PRINT OUT THE INTRON POSITION FILE:

	&print_intron_file($all_introns,$no_genes);
	close(OUTPUT);

	#------------------------------------------------------------------#

	print STDERR "FINISHED.\n";

	#------------------------------------------------------------------#

	# SORT INTRONS BY POSITION IN THE HMM:

	sub by_pos
	{
	$POS{$a} <=> $POS{$b}
	or
	$a cmp $b;
	}

	#------------------------------------------------------------------#

	# PRINT OUT THE INTRON POSITION FILE:

	sub print_intron_file
	{
	my $all_introns = $_[0];
	my $no_genes = $_[1];
	my @introns;
	my $intron;
	my $i;
	my @temp;
	my $pos;
	my $phase;
	%POS = ();
	my %SEEN = ();
	my %COUNT = ();
	my $count;
	my @introns2;
	my $fraction;
	my $last_intron;
	my $last_pos;
	my $intron0;
	my $intron1;
	my $intron2;
	my $count0;
	my $count1;
	my $count2;
	my $frac0;
	my $frac1;
	my $frac2;

	if ($no_genes <= 0) { print STDERR "ERROR: no_genes $no_genes.\n"; exit;}

	print OUTPUT "# intron position file\n";
	print OUTPUT "# columns: position phase 0 phase 1 phase 2\n";
	if ($all_introns ne '')
	{
	$all_introns = substr($all_introns,1,length($all_introns)-1);
	@introns = split(/\,/,$all_introns);
	for ($i = 0; $i <= $#introns; $i++)
	{
	$intron = $introns[$i];
	@temp = split(/_/,$intron);
	$pos = $temp[0]; # POSITION OF INTRON WRT HMMER HMM.
	$POS{$intron} = $pos;
	if (!($SEEN{$intron}))
	{
	$SEEN{$intron} = 1;
	$COUNT{$intron} = 1;
	@introns2 = (@introns2,$intron);
	}
	else
	{
	$COUNT{$intron}++;
	}
	}
	@introns2 = sort by_pos @introns2;
	$last_intron = $introns2[$#introns2];
	@temp = split(/_/,$last_intron);
	$last_pos = $temp[0];
	# PRINT OUT THE INTRONS AT EACH POSITION IN THE HMM:
	for ($i = 1; $i <= $last_pos; $i++)
	{
	$intron0 = $i."_0";
	$intron1 = $i."_1";
	$intron2 = $i."_2";
	if ($COUNT{$intron0} \|\| $COUNT{$intron1} \|\| $COUNT{$intron2})
	{
	if ($COUNT{$intron0}) { $count0 = $COUNT{$intron0};} else { $count0 = 0;}
	if ($COUNT{$intron1}) { $count1 = $COUNT{$intron1};} else { $count1 = 0;}
	if ($COUNT{$intron2}) { $count2 = $COUNT{$intron2};} else { $count2 = 0;}
	$frac0 = $count0/$no_genes;
	$frac1 = $count1/$no_genes;
	$frac2 = $count2/$no_genes;
	print OUTPUT "$i $frac0 $frac1 $frac2\n";
	}
	}
	}
	else
	{
	print OUTPUT "no introns\n";
	}
	}

	#------------------------------------------------------------------#


	# READ IN THE MAPPING OF THE ALIGNMENT TO THE HMM:

	sub read_map
	{
	my $map = $_[0];
	my $line;
	my @temp;
	my $aln;
	my $length;
	my $i;
	my $char;
	my $aln_pos = 0;
	my $hmm_pos = 0;
	my %HMM_POS = ();

	open(MAP,"$map") \|\| die "ERROR: cannot open $map.\n";
	while(<MAP>)
	{
	$line = $_;
	chomp $line;
	if ($line =~ /#=GC RF/)
	{
	@temp = split(/\s+/,$line);
	$aln = $temp[2];
	$length = length($aln);
	for ($i = 0; $i < $length; $i++)
	{
	$char = substr($aln,$i,1);
	$aln_pos++;
	if ($char eq 'x')
	{
	$hmm_pos++;
	}
	# POSITION $aln_pos IN THE ALIGNMENT MAPS TO POSITION $hmm_pos IN THE HMM:
	if ($HMM_POS{$aln_pos}) { print STDERR "ERROR: already have hmm_pos for $aln_pos.\n"; exit;}
	$HMM_POS{$aln_pos} = $hmm_pos;
	}
	}
	}
	close(MAP);

	return(\%HMM_POS);
	}

	#------------------------------------------------------------------#

	# GET THE TREEFAM IDs FOR THE GENES:

	sub get_treefam_ids
	{
	my $genes = $_[0];
	my %ID = ();
	my $i;
	my $gene;
	my $dbh;
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my $id;
	my @array;
	my $rc;

	# FIND THE ID OF EACH GENE IN THE TREEFAM MYSQL DATABASE:
	$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') \|\| return;
	# SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS:
	$table_w = 'genes';
	for ($i = 0; $i < @$genes; $i++)
	{
	$gene = $genes->[$i];
	$st = "SELECT ID from $table_w WHERE GID=?";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$id = $array[0]; # eg., ENSMUST00000049178.2
	$ID{$gene} = $id;
	}
	}
	if (!($ID{$gene}))
	{
	$st = "SELECT ID from $table_w WHERE TID=?";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$id = $array[0]; # eg., ENSMUST00000049178.2
	$ID{$gene} = $id;
	}
	}
	}
	}

	$rc = $dbh->disconnect();
	$rc = "";

	return(\%ID);
	}

	#------------------------------------------------------------------#

	# GET THE POSITIONS OF INTRONS IN EACH OF THE GENES:

	sub get_intron_positions
	{
	my $genes = $_[0];
	my $TREEFAM_ID = $_[1];
	my $ALN_POS = $_[2];
	my $HMM_POS = $_[3];
	my $i;
	my $gene;
	my $dbh;
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my $id;
	my @array;
	my $rc;
	my $strand;
	my $start;
	my $stop;
	my $introns;
	my $all_introns = "";

	# FIND THE POSITION OF EACH GENE IN THE TREEFAM MYSQL DATABASE:
	$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') \|\| return;
	# SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS:
	$table_w = 'map';
	for ($i = 0; $i < @$genes; $i++)
	{
	$gene = $genes->[$i];
	if (!($TREEFAM_ID->{$gene})) { print STDERR "ERROR: do not know id for gene $gene.\n"; exit;}
	$id = $TREEFAM_ID->{$gene};
	$st = "SELECT STRAND,START,STOP from $table_w WHERE ID=?";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute($id) or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$strand = $array[0]; # eg., +
	$start = $array[1]; # eg., 144225,144806,145261,145497,
	$stop = $array[2]; # eg., 144318,145216,145444,145804,
	# FIND THE INTRONS IN THIS GENE:
	$introns = &infer_intron_positions($strand,$start,$stop,$gene,$ALN_POS,$HMM_POS);
	if ($introns ne "") { $all_introns = $all_introns.",".$introns;}
	}
	}
	}
	$rc = $dbh->disconnect();
	$rc = "";

	return($all_introns);
	}

	#------------------------------------------------------------------#

	# INFER THE INTRON POSITIONS IN A GENE:

	sub infer_intron_positions
	{
	my $strand = $_[0];
	my $start = $_[1]; # 144225,144806,145261,145497,
	my $stop = $_[2]; # 144318,145216,145444,145804,
	my $gene = $_[3];
	my $ALN_POS = $_[4];
	my $HMM_POS = $_[5];
	my @start = split(/\,/,$start);
	my @stop = split(/\,/,$stop);
	my $i;
	my $exon_start;
	my $exon_stop;
	my $coding_length;
	my $hangover;
	my $intron_phase;
	my $prev_aa_pos = 0;
	my @temp;
	my $aln_pos;
	my $hmm_pos;
	my $intron;
	my $introns = "";

	if ($#start != $#stop) { print STDERR "ERROR: start $#start stop $#stop.\n"; exit;}

	if ($VERBOSE == 1) { print "$gene-------------------------------------------------------\n";}
	if ($strand eq '+')
	{
	$coding_length = 0;
	for ($i = 0; $i <= $#start; $i++)
	{
	# FIND THE COORDINATES OF EXON $i:
	$exon_start = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB.
	$exon_stop = $stop[$i];
	# CALCULATE THE CODING LENGTH SO FAR:
	$coding_length = $coding_length + ($exon_stop - $exon_start + 1);
	# WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID:
	$prev_aa_pos = $coding_length/3;
	@temp = split(/\./,$prev_aa_pos);
	$prev_aa_pos = $temp[0];
	# LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS:
	$hangover = $coding_length % 3;
	if ($hangover == 0) { $intron_phase = 0;}
	elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1).
	elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2).
	if ($i != $#start)
	{
	# FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT:
	if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;}
	$aln_pos = $ALN_POS->{$gene."_".$prev_aa_pos};
	if ($HMM_POS->{$aln_pos}) # NOTE THAT THERE WILL NOT BE A HMM POS IF THE INTRON IS BEFORE THE HMM STARTS.
	{
	$hmm_pos = $HMM_POS->{$aln_pos};
	$intron = $hmm_pos."_".$intron_phase;
	$introns = $introns.",".$intron;
	if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos hmm_pos $hmm_pos intron_phase $intron_phase\n"; }
	}
	}
	}
	}
	elsif ($strand eq '-')
	{
	$coding_length = 0;
	for ($i = $#start; $i >= 0; $i--)
	{
	# FIND THE COORDINATES OF EXON $i:
	$exon_start = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB.
	$exon_stop = $stop[$i];
	# CALCULATE THE CODING LENGTH SO FAR:
	$coding_length = $coding_length + ($exon_stop - $exon_start + 1);
	# WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID:
	$prev_aa_pos = $coding_length/3;
	@temp = split(/\./,$prev_aa_pos);
	$prev_aa_pos = $temp[0];
	# LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS:
	$hangover = $coding_length % 3;
	if ($hangover == 0) { $intron_phase = 0;}
	elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1).
	elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2).
	if ($i != 0)
	{
	# FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT:
	if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;}
	$aln_pos = $ALN_POS->{$gene."_".$prev_aa_pos};
	if ($HMM_POS->{$aln_pos}) # NOTE THAT THERE WILL NOT BE A HMM POS IF THE INTRON IS BEFORE THE HMM STARTS.
	{
	$hmm_pos = $HMM_POS->{$aln_pos};
	$intron = $hmm_pos."_".$intron_phase;
	$introns = $introns.",".$intron;
	if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos hmm_pos $hmm_pos intron_phase $intron_phase\n";}
	}
	}
	}
	}
	else
	{
	print STDERR "ERROR: strand $strand.\n"; exit;
	}

	if ($introns ne '') { $introns = substr($introns,1,length($introns)-1);}

	return($introns);
	}

	#------------------------------------------------------------------#

	# READ IN THE INPUT ALIGNMENT FILE, TO GET THE NAMES OF THE PROTEINS:

	sub read_aln
	{
	my $aln = $_[0];
	my $line;
	my @temp;
	my $gene;
	my @genes;
	my $length;
	my $i;
	my $char;
	my $prot_pos;
	my $aln_pos;
	my %ALN_POS = ();
	my $no_genes = 0;

	open(ALN,"$aln") \|\| die "ERROR: cannot open $aln.\n";
	while(<ALN>)
	{
	$line = $_;
	chomp $line;
	if (substr($line,0,1) eq '>')
	{
	@temp = split(/\s+/,$line);
	$gene = $temp[0];
	$gene = substr($gene,1,length($gene)-1);
	@genes = (@genes,$gene);
	$no_genes++;
	$prot_pos = 0;
	$aln_pos = 0;
	print "xxx gene $gene\n";
	}
	else
	{
	$length = length($line);
	for ($i = 0; $i < $length; $i++)
	{
	$char = substr($line,$i,1);
	$aln_pos++;
	if ($char ne '-')
	{
	$prot_pos++;
	if ($ALN_POS{$gene."_".$prot_pos}) { print STDERR "ERROR: already have aln pos for $gene $prot_pos.\n"; exit;}
	$ALN_POS{$gene."_".$prot_pos} = $aln_pos;
	}
	print "xxx gene $gene aln_pos $aln_pos prot_pos $prot_pos\n";
	}
	}
	}
	close(ALN);

	return(\@genes,\%ALN_POS,$no_genes);
	}

	#------------------------------------------------------------------#