avrilcoghlan/find_intron_cons_treefam_ortholog.pl

## find_intron_cons_treefam_ortholog.pl
#!/usr/local/bin/perl

#
# Perl script find_intron_cons_treefam_ortholog.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 7-Dec-07.
#
# This perls scripts finds the fraction of introns in a C. elegans
# gene that are shared in position in the TreeFam ortholog in C. briggsae,
# human, yeast for each C. elegans gene in the ngasp benchmark gene set.
#
# The command-line format is:
# % perl <find_intron_cons_treefam_ortholog.pl> gff
# where gff is the input benchmark gff file.
#
#------------------------------------------------------------------#

# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

$num_args                  = $#ARGV + 1;
if ($num_args != 1)
{
   print "Usage of find_intron_cons_treefam_ortholog.pl\n\n";
   print "perl find_intron_cons_treefam_ortholog.pl <gff>\n";
   print "where <gff> is the input benchmark gff file.\n";
   print "For example, >perl find_intron_cons_treefam_ortholog.pl phase1_confirmed_isoforms_A.gff.1\n";
   exit;
}

# READ IN MY PERL MODULES:

BEGIN {
    unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}

use Avril_modules;
use DBI;

# FIND THE NAME OF THE INPUT GFF FILE:

$gff                       = $ARGV[0];

$VERBOSE                   = 0;

#------------------------------------------------------------------#

# READ IN THE INPUT GFF FILE:

%TAKE                      = ();
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n";
while(<GFF>)
{
   $line                   = $_;
   chomp $line;
   @temp                   = split(/\t+/,$line);
   $exon                   = $temp[8];

   # FIND THE GENE NAME:
   # eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
   @temp                   = split(/\;/,$exon);
   $gene                   = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
   @temp                   = split(/\./,$gene);
   $gene                   = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
   @temp                   = split(/=/,$gene);
   $gene                   = $temp[1]; # eg. C49A1.4b
   $gene                   =~ tr/[a-z]/[A-Z]/;
   if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.
   $TAKE{$gene}            = 1;
}
close(GFF);

#------------------------------------------------------------------#

# FIND THE TREEFAM ID FOR ALL THE GENES FROM C. ELEGANS THAT I AM INTERESTED IN:

# HASH TABLES TO REMEMBER THE SPECIES OF GENES:
$dbh                       = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') || return;

# SPECIFY THE TABLE:
$table_w                   = 'genes';

# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX),
# ANOTHER COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165

$st                        = "SELECT IDX, TAX_ID, GID, ID from $table_w";
$sth                       = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv                        = $sth->execute or die "Cannot execute the query: $sth->errstr";
%IDX                       = ();
%ID                        = ();
%YEAST                     = ();
%CB                        = ();
%CR                        = ();
%HUMAN                     = ();
if ($rv >= 1)
{
   while ((@array) = $sth->fetchrow_array)
   {
      $IDX                 = $array[0];  # eg., 1
      $TAXID               = $array[1];  # eg 7165
      $GID                 = $array[2];
      $ID                  = $array[3];

      # CHECK IF THIS IS A C. ELEGANS GENE THAT WE WANT TO TAKE:
      if ($TAKE{$GID})
      {
         if (!($IDX{$GID})) { $IDX{$GID} = $IDX;}
         else {$IDX{$GID} = $IDX{$GID}.",".$IDX;}
         if (!($ID{$GID})) { $ID{$GID} = $ID;}
         else {$ID{$GID} = $ID{$GID}.",".$ID;}
      }
      # RECORD IF IT IS A YEAST GENE:
      if ($TAXID eq '4932')
      {
         $YEAST{$IDX}      = $GID;
         if (!($ID{$GID})) { $ID{$GID} = $ID;}
         else {$ID{$GID} = $ID{$GID}.",".$ID;}
      }
      # RECORD IF IT IS A C. BRIGGSAE GENE:
      if ($TAXID eq '6238')
      {
         $CB{$IDX}         = $GID;
         if (!($ID{$GID})) { $ID{$GID} = $ID;}
         else {$ID{$GID} = $ID{$GID}.",".$ID;}
      }
      # RECORD IF IT IS A C. REMANEI GENE:
      if ($TAXID eq '31234')
      {
         $CR{$IDX}         = $GID;
         if (!($ID{$GID})) { $ID{$GID} = $ID;}
         else {$ID{$GID} = $ID{$GID}.",".$ID;}
      }
      # RECORD IF IT IS A HUMAN GENE:
      if ($TAXID eq '9606')
      {
         $HUMAN{$IDX}      = $GID;
         if (!($ID{$GID})) { $ID{$GID} = $ID;}
         else {$ID{$GID} = $ID{$GID}.",".$ID;}
      }
   }
}

# NOW FIND THE FAMILIES FOR THE C. ELEGANS GENES OF INTEREST:
%FAM                       = ();
%SEEN                      = ();
foreach $GID (keys %TAKE)
{
   $IDS                    = $ID{$GID};
   @IDS                    = split(/\,/,$IDS);
   for ($i = 0; $i <= $#IDS; $i++)
   {
      $ID                  = $IDS[$i];
      # GET THE FAMILY FOR THE ID $ID:
      $table_w             = 'fam_genes';
      $st                  = "SELECT ID,AC,FLAG,FAM_TYPE from $table_w WHERE ID=?";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute($ID) or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)
         {
            $AC            = $array[1];
            $FLAG          = $array[2];
            $FAM_TYPE      = $array[3];
            if ($FLAG eq 'FULL' & ($FAM_TYPE eq 'A' || $FAM_TYPE eq 'B'))
            {
               if (!($SEEN{$GID."_".$AC}))
               {
                  if (!($FAM{$GID})) { $FAM{$GID} = $AC;                      }
                  else { print STDERR "$GID is in $FAM{$GID} and $AC\n"; exit;}
               }
            }
         }
      }
   }
}

# NOW FIND ALL THE ORTHOLOGS FOR THE C. ELEGANS GENES OF INTEREST:
%YEAST_ORTH                = ();
%CB_ORTH                   = ();
%CR_ORTH                   = ();
%HUMAN_ORTH                = ();
foreach $GID (keys %IDX)
{
   $IDXS                   = $IDX{$GID};
   @IDXS                   = split(/\,/,$IDXS);
   for ($i = 0; $i <= $#IDXS; $i++)
   {
      $IDX                 = $IDXS[$i];
      # GET ALL THE ORTHOLOGS FOR $IDX:
      $table_w             = 'ortholog';
      $st                  = "SELECT idx1,idx2 from $table_w WHERE idx1=?";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute($IDX) or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)
         {
            $idx1          = $array[0];
            $idx2          = $array[1];
            if ($YEAST{$idx2})
            {
               if (!($YEAST_ORTH{$GID})) { $YEAST_ORTH{$GID} = $YEAST{$idx2};}
               else {$YEAST_ORTH{$GID} = $YEAST_ORTH{$GID}.",".$YEAST{$idx2};}
            }
            if ($CB{$idx2})
            {
               if (!($CB_ORTH{$GID})) { $CB_ORTH{$GID} = $CB{$idx2};}
               else {$CB_ORTH{$GID} = $CB_ORTH{$GID}.",".$CB{$idx2};}
            }
            if ($CR{$idx2})
            {
               if (!($CR_ORTH{$GID})) { $CR_ORTH{$GID} = $CR{$idx2};}
               else {$CR_ORTH{$GID} = $CR_ORTH{$GID}.",".$CR{$idx2};}
            }
            if ($HUMAN{$idx2})
            {
               if (!($HUMAN_ORTH{$GID})) { $HUMAN_ORTH{$GID} = $HUMAN{$idx2};}
               else {$HUMAN_ORTH{$GID} = $HUMAN_ORTH{$GID}.",".$HUMAN{$idx2};}
            }
         }
      }
      $st                  = "SELECT idx1,idx2 from $table_w WHERE idx2=?";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute($IDX) or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)
         {
            $idx1          = $array[0];
            $idx2          = $array[1];
            if ($YEAST{$idx1})
            {
               if (!($YEAST_ORTH{$GID})) { $YEAST_ORTH{$GID} = $YEAST{$idx1};}
               else {$YEAST_ORTH{$GID} = $YEAST_ORTH{$GID}.",".$YEAST{$idx1};}
            }
            if ($CB{$idx1})
            {
               if (!($CB_ORTH{$GID})) { $CB_ORTH{$GID} = $CB{$idx1};}
               else {$CB_ORTH{$GID} = $CB_ORTH{$GID}.",".$CB{$idx1};}
            }
            if ($CR{$idx1})
            {
               if (!($CR_ORTH{$GID})) { $CR_ORTH{$GID} = $CR{$idx1};}
               else {$CR_ORTH{$GID} = $CR_ORTH{$GID}.",".$CR{$idx1};}
            }
            if ($HUMAN{$idx1})
            {
               if (!($HUMAN_ORTH{$GID})) { $HUMAN_ORTH{$GID} = $HUMAN{$idx1};}
               else {$HUMAN_ORTH{$GID} = $HUMAN_ORTH{$GID}.",".$HUMAN{$idx1};}
            }

         }
      }
   }
}

#------------------------------------------------------------------#

# READ IN THE INPUT GFF FILE:

print "GENE INTRONS CBORTH CBORTHID CBORTHINTRONS CRORTH CRORTHID CRORTHINTRONS HUMORTH HUMORTHID HUMORTHINTRONS YEASTORTH YEASTORTHID YEASTORTHINTRONS\n";
%SEEN                      = ();
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n";
while(<GFF>)
{
   $line                   = $_;
   chomp $line;
   @temp                   = split(/\t+/,$line);
   $exon                   = $temp[8];

   # FIND THE GENE NAME:
   # eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
   @temp                   = split(/\;/,$exon);
   $gene                   = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
   @temp                   = split(/\./,$gene);
   $gene                   = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
   @temp                   = split(/=/,$gene);
   $gene                   = $temp[1]; # eg. C49A1.4b
   $gene                   =~ tr/[a-z]/[A-Z]/;
   if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.

   # SEE IF THIS GENE HAS C. BRIGGSAE, C. REMANEI, HUMAN AND YEAST ORTHOLOGS:
   if ($CB_ORTH{$gene}) { $cb_orth = $CB_ORTH{$gene};}
   else                 { $cb_orth = "none";         }
   if ($CR_ORTH{$gene}) { $cr_orth = $CR_ORTH{$gene};}
   else                 { $cr_orth = "none";         }
   if ($HUMAN_ORTH{$gene}) { $human_orth = $HUMAN_ORTH{$gene};}
   else                    { $human_orth = "none";            }
   if ($YEAST_ORTH{$gene}) { $yeast_orth = $YEAST_ORTH{$gene};}
   else                    { $yeast_orth = "none";            }
   if ($VERBOSE == 1) { print "gene $gene cb_orth $cb_orth cr_orth $cr_orth human_orth $human_orth yeast_orth $yeast_orth\n";}
   if (!($SEEN{$gene}))
   {
      $SEEN{$gene}         = 1;
      # GET THE FAMILY THAT THIS GENE IS FOUND IN:
      if ($FAM{$gene})
      {
         $family           = $FAM{$gene};
         if ($VERBOSE == 1) { print "C. elegans gene $gene is in family $family\n";}
         # GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. BRIGGSAE ORTHOLOGS IN THIS FAMILY:
         # ALSO GET THE FRACTION OF INTRONS CONSERVED.
         if ($cb_orth ne 'none')
         {
            ($cb_orth_id,$cb_orth_introns) = &get_pc_id($gene,$cb_orth,$family);
         }
         else { $cb_orth_id = 0; $cb_orth_introns = 0;}
         # GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. REMANEI ORTHOLOGS IN THIS FAMILY:
         if ($cr_orth ne 'none')
         {
            ($cr_orth_id,$cr_orth_introns) = &get_pc_id($gene,$cr_orth,$family);
         }
         else { $cr_orth_id = 0; $cr_orth_introns = 0;}
         # GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND HUMAN ORTHOLOGS IN THIS FAMILY:
         if ($human_orth ne 'none')
         {
            ($human_orth_id,$human_orth_introns) = &get_pc_id($gene,$human_orth,$family);
         }
         else { $human_orth_id = 0; $human_orth_introns = 0;}
         # GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND YEAST ORTHOLOGS IN THIS FAMILY:
         if ($yeast_orth ne 'none')
         {
            ($yeast_orth_id,$yeast_orth_introns) = &get_pc_id($gene,$yeast_orth,$family);
         }
         else { $yeast_orth_id = 0; $yeast_orth_introns = 0;}
      }
      else # THE C. ELEGANS GENE IS NOT IN A FAMILY.
      {
         if ($cb_orth ne 'none' || $cr_orth ne 'none' || $human_orth ne 'none' || $yeast_orth ne 'none')
         {
            # xxx this is an error in treefam-5 mysql I think.
            print STDERR "WARNING: $gene has orthologs but is not in a family\n";
         }
         $cr_orth_id       = 0;
         $cr_orth_introns  = 0;
         $cb_orth_id       = 0;
         $cb_orth_introns  = 0;
         $human_orth_id    = 0;
         $human_orth_introns = 0;
         $yeast_orth_id    = 0;
         $yeast_orth_introns = 0;
      }
      $introns             = &get_num_introns($gene);
      if ($gene eq '')    { print STDERR "ERROR: gene $gene\n"; exit;}
      if ($introns eq '') { print STDERR "ERROR: gene $gene introns $introns\n"; exit;}
      if ($cb_orth eq '') { print STDERR "ERROR: gene $gene cb_orth $cb_orth\n"; exit;}
      if ($cb_orth_id eq '') { print STDERR "ERROR: gene $gene cb_orth_id $cb_orth_id\n"; exit;}
      if ($cb_orth_introns eq '') { print STDERR "ERROR: gene $gene cb_orth_introns $cb_orth_introns cb_orth $cb_orth cb_orth_id $cb_orth_id\n"; exit;}
      if ($cr_orth eq '') { print STDERR "ERROR: gene $gene cr_orth $cr_orth\n"; exit;}
      if ($cr_orth_id eq '') { print STDERR "ERROR: gene $gene cr_orth_id $cr_orth_id\n"; exit;}
      if ($cr_orth_introns eq '') { print STDERR "ERROR: gene $gene cr_orth_introns $cr_orth_introns\n"; exit;}
      if ($human_orth eq '') { print STDERR "ERROR: gene $gene human_orth $human_orth\n"; exit;}
      if ($human_orth_id eq '') { print STDERR "ERROR: gene $gene human_orth_id $human_orth_id\n"; exit;}
      if ($human_orth_introns eq '') { print STDERR "ERROR: gene $gene human_orth_introns $human_orth_introns\n"; exit;}
      if ($yeast_orth eq '') { print STDERR "ERROR: gene $gene yeast_orth $yeast_orth\n"; exit;}
      if ($yeast_orth_id eq '') { print STDERR "ERROR: gene $gene yeast_orth_id $yeast_orth_id\n"; exit;}
      if ($yeast_orth_introns eq '') { print STDERR "ERROR: gene $gene yeast_orth_introns $yeast_orth_introns\n"; exit;}

      print "$gene $introns $cb_orth $cb_orth_id $cb_orth_introns $cr_orth $cr_orth_id $cr_orth_introns $human_orth $human_orth_id $human_orth_introns $yeast_orth $yeast_orth_id $yeast_orth_introns\n";
   }
}
close(GFF);

#------------------------------------------------------------------#

print STDERR "FINISHED.\n";

#------------------------------------------------------------------#

# GET THE NUMBER OF INTRONS IN A C. ELEGANS GENE:

sub get_num_introns
{
   my $gene                = $_[0];
   my $dbh;
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $start               = "none";
   my $IDS;
   my @IDS;
   my $i;
   my $ID;
   my $num_exons;
   my @temp;
   my $num_introns;
   my $max_num_introns     = 0;

   # FIND THE POSITION OF EACH GENE IN THE TREEFAM MYSQL DATABASE:
   $dbh                    = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') || return;

   # GET THE NUMBER OF INTRONS IN EACH TRANSCRIPT OF $gene:
   if (!($ID{$gene})) { print STDERR "ERROR: do not know ids for $gene\n"; exit;}
   $IDS                    = $ID{$gene};
   @IDS                    = split(/\,/,$IDS);
   for ($i = 0; $i <= $#IDS; $i++)
   {
      $start               = "none";
      $ID                  = $IDS[$i];
      # SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS:
      $table_w             = 'map';
      $st                  = "SELECT START from $table_w WHERE ID=?";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute($ID) or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)
         {
            $start         = $array[0];  # eg., 144225,144806,145261,145497,
            chop($start);                # 144225,144806,145261,145497
         }
      }
      if ($start eq 'none') { print STDERR "ERROR: did not find position of introns in $gene\n"; exit;}
      @temp                = split(/\,/,$start);
      $num_exons           = $#temp + 1;
      $num_introns         = $num_exons - 1;
      if ($num_introns > $max_num_introns) { $max_num_introns = $num_introns;}
   }

   return($max_num_introns);
}

#------------------------------------------------------------------#

# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. BRIGGSAE ORTHOLOGS IN THIS FAMILY:

sub get_pc_id
{
   my $gene                = $_[0];
   my $orths               = $_[1];
   my $family              = $_[2];
   my $IDS;
   my @IDS;
   my $i;
   my $ID;
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $CIGAR1              = "none";
   my $CIGAR2              = "none";
   my @orths;
   my $orth;
   my $j;
   my $the_id1;
   my $the_id2;
   my $seq1;
   my $seq2;
   my $pc_id;
   my $max_pc_id           = 0;
   my $intron_cons         = 0;
   my $max_intron_cons     = 0;
   my $introns1;
   my $introns2;
   my %ALN_POS             = ();

   # GET THE ALIGNMENT FOR GENE $gene IN FAMILY $family:
   if (!($ID{$gene})) { print STDERR "ERROR: do not know ids for $gene\n"; exit;}
   $IDS                    = $ID{$gene};
   @IDS                    = split(/\,/,$IDS);
   if ($VERBOSE == 1) { print "C. elegans gene $gene has ids $IDS\n";}
   for ($i = 0; $i <= $#IDS; $i++)
   {
      $ID                  = $IDS[$i];
      # GET THE ALIGNMENT FOR THE ID $ID IN FAMILY $family:
      $table_w             = 'aa_full_align';
      $st                  = "SELECT ID,CIGAR from $table_w WHERE AC=?";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute($family) or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)
         {
            if ($array[0] eq $ID)
            {
               if ($CIGAR1 ne 'none') { print STDERR "ERROR: already have cigar for $ID $gene $family\n"; exit;}
               $CIGAR1     = $array[1];
               $the_id1    = $ID;
            }
         }
      }
   }
   if ($CIGAR1 eq 'none')
   {
      print STDERR "WARNING: did not find cigar for IDS $IDS gene $gene $family\n";
      return($max_pc_id,$max_intron_cons); #xxx probelm with treefam5
      #xxx exit;
   }

   # GET THE SEQUENCE FOR $the_id1:
   $table_w                = 'aa_seq';
   $st                     = "SELECT SEQ from $table_w WHERE ID=?";
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute($the_id1) or die "Cannot execute the query: $sth->errstr";
   $seq1                   = "none";
   if ($rv >= 1)
   {
      while ((@array)= $sth->fetchrow_array)
      {
         $seq1             = $array[0];
      }
   }
   if ($seq1 eq 'none') { print STDERR "ERROR: seq1 $seq1 for $the_id1 $gene\n"; exit;}

   # GET THE POSITIONS OF EACH AMINO ACID IN THE ALIGNMENT:
   &read_aln(\%ALN_POS,$the_id1,$CIGAR1,$seq1);

   # GET THE POSITIONS OF INTRONS IN $the_id1:
   $introns1               = &get_intron_positions($the_id1,\%ALN_POS);

   # GET THE ALIGNMENT FOR THE ORTHOLOGS $orths IN FAMILY $family:
   @orths                  = split(/\,/,$orths);
   for ($i = 0; $i <= $#orths; $i++)
   {
      $orth                = $orths[$i];
      if (!($ID{$orth})) { print STDERR "ERROR: do not know ids for $orth\n"; exit;}
      $IDS                 = $ID{$orth};
      @IDS                 = split(/\,/,$IDS);
      $CIGAR2              = "none";
      if ($VERBOSE == 1) { print "Ortholog $orth has ids $IDS\n";}
      for ($j = 0; $j <= $#IDS; $j++)
      {
         $ID               = $IDS[$j];
         if ($VERBOSE == 1) { print "Looking for cigar for $ID (orth $orth)\n";}
         # GET THE ALIGNMENT FOR THE ID $ID IN FAMILY $family:
         $table_w          = 'aa_full_align';
         $st               = "SELECT ID,CIGAR from $table_w WHERE AC=?";
         $sth              = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
         $rv               = $sth->execute($family) or die "Cannot execute the query: $sth->errstr";
         if ($rv >= 1)
         {
            while ((@array)= $sth->fetchrow_array)
            {
               if ($array[0] eq $ID)
               {
                  if ($CIGAR2 ne 'none') { print STDERR "ERROR: already have cigar for $ID $gene $family\n"; exit;}
                  $CIGAR2  = $array[1];
                  $the_id2 = $ID;
                  if ($VERBOSE == 1) { print "Got aln for $the_id2 family $family\n";}
               }
            }
         }
      }
      if ($CIGAR2 eq 'none')
      {
         print STDERR "WARNING: did not find cigar for IDS $IDS orth $orth $family (gene $gene)\n";
         #xxx exit; #xxx problem with treefam5
         return($max_pc_id,$max_intron_cons);
      }

      # GET THE SEQUENCE FOR $the_id2:
      $table_w             = 'aa_seq';
      $st                  = "SELECT SEQ from $table_w WHERE ID=?";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute($the_id2) or die "Cannot execute the query: $sth->errstr";
      $seq2                = "none";
      if ($rv >= 1)
      {
         while ((@array)= $sth->fetchrow_array)
         {
            $seq2          = $array[0];
         }
      }
      if ($seq2 eq 'none') { print STDERR "ERROR: seq2 $seq2 for $the_id2 $orth\n"; exit;}

      # GET THE POSITIONS OF EACH AMINO ACID IN THE ALIGNMENT:
      &read_aln(\%ALN_POS,$the_id2,$CIGAR2,$seq2);

      # GET THE POSITIONS OF INTRONS IN $the_id2:
      $introns2            = &get_intron_positions($the_id2,\%ALN_POS);

      # GET THE PERCENT IDENTITY BETWEEN $gene AND $orth:
      $pc_id               = &get_pc_id_for_pair($CIGAR1,$CIGAR2,$seq1,$seq2);
      if ($pc_id > $max_pc_id) { $max_pc_id = $pc_id;}

      # GET THE FRACTION OF INTRONS THAT ARE CONSERVED BETWEEN $gene AND $orth:
      $intron_cons         = &get_fraction_cons_introns($CIGAR1,$CIGAR2,$seq1,$seq2,$introns1,$introns2);
      if ($intron_cons > $max_intron_cons) { $max_intron_cons = $intron_cons;}
   }

   return($max_pc_id,$max_intron_cons);
}

#------------------------------------------------------------------#

# GET THE FRACTION OF INTRONS THAT ARE CONSERVED BETWEEN $gene AND $orth:

sub get_fraction_cons_introns
{
   my $CIGAR1              = $_[0];
   my $CIGAR2              = $_[1];
   my $seq1                = $_[2];
   my $seq2                = $_[3];
   my $introns1            = $_[4];
   my $introns2            = $_[5];
   my $intron_cons         = 0;
   my $num_introns1;
   my @temp;
   my @introns1;
   my @introns2;
   my %INTRON1             = ();
   my $i;
   my $pos;
   my $num_cons;

   # FIND OUT HOW MANY INTRONS ARE IN $introns1:
   @introns1               = split(/\,/,$introns1);
   $num_introns1           = $#introns1 + 1;
   @introns2               = split(/\,/,$introns2);

   # SEE HOW MANY OF THE INTRONS IN $introns1 ARE IN $introns2:
   for ($i = 0; $i <= $#introns1; $i++)
   {
      @temp                = split(/_/,$introns1[$i]); # eg. C37A5.9.1_200_0
      $pos                 = $temp[1];
      $INTRON1{$pos}       = 1;
   }
   $num_cons               = 0;
   for ($i = 0; $i <= $#introns2; $i++)
   {
      @temp                = split(/_/,$introns2[$i]);
      $pos                 = $temp[1];
      if ($INTRON1{$pos}) { $num_cons++;}
   }

   if ($num_introns1 == 0)
   {
      if ($num_cons > 0) { print STDERR "ERROR: num_introns1 $num_introns1 num_cons $num_cons\n"; exit;}
      $intron_cons         = 0;
   }
   else
   {
      $intron_cons         = ($num_cons*100)/($num_introns1);
   }

   return($intron_cons);
}

#------------------------------------------------------------------#

# GET THE POSITIONS OF EACH AMINO ACID IN THE ALIGNMENT:

sub read_aln
{
   my $ALN_POS             = $_[0];
   my $id                  = $_[1];
   my $CIGAR               = $_[2];
   my $seq                 = $_[3];
   my $num;
   my $aln;
   my $offset;
   my $i;
   my $char;
   my $j;
   my $char2;
   my $prot_pos;
   my $aln_pos;
   my $length;

   # MAKE THE ALIGNMENT FOR $id:
   $num                    = "";
   $aln                    = "";
   $offset                 = 0;
   for ($i = 1; $i <= length($CIGAR); $i++)
   {
      $char                = substr($CIGAR,$i-1,1);
      if ($char =~ /[0-9]/) { $num = $num.$char;}
      else
      {
         if ($char ne 'M' & $char ne 'D') { print STDERR "ERROR: char $char\n"; exit;}
         if ($char eq 'D')
         {
            for ($j = 1; $j <= $num; $j++) { $aln = $aln."-";}
         }
         elsif ($char eq 'M')
         {
            for ($j = 1; $j <= $num; $j++)
            {
               $offset++;
               if ($offset > length($seq))
               {
                  print STDERR "ERROR: offset $offset seq $seq\n";
                  exit;
               }
               $char2      = substr($seq,$offset-1,1);
               $aln       = $aln.$char2;
            }
         }
         $num              = "";
      }
   }

   # NOW RECORD THE POSITIONS IN THE ALIGNMENT:
   $prot_pos               = 0;
   $aln_pos                = 0;
   $length                 = length($aln);
   for ($i = 0; $i < $length; $i++)
   {
      $char                = substr($aln,$i,1);
      $aln_pos++;
      if ($char ne '-')
      {
         $prot_pos++;
         if ($ALN_POS->{$id."_".$prot_pos}) { print STDERR "ERROR: already have aln pos for $id $prot_pos.\n"; exit;}
         $ALN_POS->{$id."_".$prot_pos} = $aln_pos;
      }
   }
}

#------------------------------------------------------------------#

# GET THE POSITIONS OF INTRONS IN A GENE:

sub get_intron_positions
{
   my $id                  = $_[0];
   my $ALN_POS             = $_[1];
   my $dbh;
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $strand;
   my $start;
   my $stop;
   my $introns;
   my $rc;
   my $cstart;
   my $cstop;

   # FIND THE POSITION OF EACH GENE IN THE TREEFAM MYSQL DATABASE:
   $dbh                    = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') || return;
   # SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS:
   $table_w                = 'map';
   $st                     = "SELECT STRAND,START,STOP,C_START,C_STOP from $table_w WHERE ID=?";
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute($id) or die "Cannot execute the query: $sth->errstr";
   if ($rv >= 1)
   {
      while ((@array) = $sth->fetchrow_array)
      {
         $strand           = $array[0];  # eg., +
         $start            = $array[1];  # eg., 144225,144806,145261,145497,
         $stop             = $array[2];  # eg., 144318,145216,145444,145804,
         $cstart           = $array[3];
         $cstop            = $array[4];
         $start            = &remove_noncoding($start,$cstart,$cstop,"start");
         $stop             = &remove_noncoding($stop,$cstart,$cstop,"stop");
         # FIND THE INTRONS IN THIS GENE:
         $introns          = &infer_intron_positions($strand,$start,$stop,$id,$ALN_POS);
      }
   }

   return($introns);
}

#------------------------------------------------------------------#

# INFER THE INTRON POSITIONS IN A GENE:

sub infer_intron_positions
{
   my $strand              = $_[0];
   my $start               = $_[1]; # 144225,144806,145261,145497,
   my $stop                = $_[2]; # 144318,145216,145444,145804,
   my $gene                = $_[3];
   my $ALN_POS             = $_[4];
   my @start               = split(/\,/,$start);
   my @stop                = split(/\,/,$stop);
   my $i;
   my $exon_start;
   my $exon_stop;
   my $coding_length;
   my $hangover;
   my $intron_phase;
   my $prev_aa_pos         = 0;
   my @temp;
   my $aln_pos;
   my $hmm_pos;
   my $intron;
   my $introns             = "";

   if ($#start != $#stop)
   {
      print STDERR "ERROR: gene $gene : start $#start ($start) stop $#stop ($stop).\n";
      exit;
   }

   if ($VERBOSE == 1) { print "$gene-------------------------------------------------------\n";}
   if ($VERBOSE == 1) { print "gene $gene : start $start stop $stop\n";}
   if    ($strand eq '+')
   {
      $coding_length       = 0;
      for ($i = 0; $i <= $#start; $i++)
      {
         # FIND THE COORDINATES OF EXON $i:
         $exon_start       = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB.
         $exon_stop        = $stop[$i];
         # CALCULATE THE CODING LENGTH SO FAR:
         $coding_length    = $coding_length + ($exon_stop - $exon_start + 1);
         # WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID:
         $prev_aa_pos      = $coding_length/3;
         @temp             = split(/\./,$prev_aa_pos);
         $prev_aa_pos      = $temp[0];
         # LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS:
         $hangover         = $coding_length % 3;
         if    ($hangover == 0) { $intron_phase = 0;}
         elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1).
         elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2).
         if ($i != $#start)
         {
            # FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT:
            if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;}
            $aln_pos       = $ALN_POS->{$gene."_".$prev_aa_pos};
            $intron        = $gene."_".$aln_pos."_".$intron_phase;
            $introns       = $introns.",".$intron;
            if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos intron_phase $intron_phase\n";}
         }
      }
   }
   elsif ($strand eq '-')
   {
      $coding_length       = 0;
      for ($i = $#start; $i >= 0; $i--)
      {
         # FIND THE COORDINATES OF EXON $i:
         $exon_start       = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB.
         $exon_stop        = $stop[$i];
         # CALCULATE THE CODING LENGTH SO FAR:
         $coding_length    = $coding_length + ($exon_stop - $exon_start + 1);
         # WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID:
         $prev_aa_pos      = $coding_length/3;
         @temp             = split(/\./,$prev_aa_pos);
         $prev_aa_pos      = $temp[0];
         # LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS:
         $hangover         = $coding_length % 3;
         if    ($hangover == 0) { $intron_phase = 0;}
         elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1).
         elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2).
         if ($i != 0)
         {
            # FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT:
            if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;}
            $aln_pos       = $ALN_POS->{$gene."_".$prev_aa_pos};
            $intron        = $gene."_".$aln_pos."_".$intron_phase;
            $introns       = $introns.",".$intron;
            if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos intron_phase $intron_phase\n";}
         }
      }
   }
   else
   {
      print STDERR "ERROR: strand $strand.\n"; exit;
   }

   if ($introns ne '') { $introns = substr($introns,1,length($introns)-1);}

   return($introns);
}

#------------------------------------------------------------------#

# SUBROUTINE TO REMOVE NON-CODING REGIONS FROM THE EXONS:

sub remove_noncoding
{
   my $exons               = $_[0];
   my $cstart              = $_[1];
   my $cstop               = $_[2];
   my $type                = $_[3];
   my $new_exons           = "";
   chop($exons);
   my @exons               = split(/\,/,$exons);
   my $i;
   my $seen_start          = 0;
   my $seen_end            = 0;

   for ($i = 0; $i <= $#exons; $i++)
   {
      if    ($exons[$i] >= $cstart && $exons[$i] <= $cstop)
      {
         if    ($i == 0)       { $seen_start = 1;}
         elsif ($i == $#exons) { $seen_end   = 1;}
         $new_exons        = $new_exons.",".$exons[$i];
         if ($exons[$i] == $cstart && $seen_start == 0) { $seen_start = 1;}
         if ($exons[$i] == $cstop  && $seen_end == 0)   { $seen_end   = 1;}
      }
      elsif ($exons[$i] <= $cstart)
      {
         # ignore this exon.
      }
      elsif ($exons[$i] >= $cstop)
      {
         # ignore this exon.
      }
   }
   if ($new_exons ne '')
   {
      $new_exons           = substr($new_exons,1,length($new_exons)-1);
      if ($seen_start == 0 && $type eq 'start') { $new_exons = $cstart.",".$new_exons;}
      if ($seen_end   == 0 && $type eq 'stop')  { $new_exons = $new_exons.",".$cstop; }
   }
   else
   {
      # THERE IS JUST ONE EXON IN THE GENE:
      if ($seen_start == 0 && $type eq 'start') { $new_exons = $cstart;}
      if ($seen_end   == 0 && $type eq 'stop')  { $new_exons = $cstop; }
   }

   if (substr($new_exons,0,1) eq ',') { $new_exons = substr($new_exons,1,length($new_exons)-1);}
   if (substr($new_exons,length($new_exons)-1,1) eq ',') { chop($new_exons);                   }
   return($new_exons);
}

#------------------------------------------------------------------#

# GET THE PERCENT IDENTITY BETWEEN $gene AND $orth:

sub get_pc_id_for_pair
{
   my $CIGAR1              = $_[0];
   my $CIGAR2              = $_[1];
   my $seq1                = $_[2];
   my $seq2                = $_[3];
   my $aln1;
   my $aln2;
   my $i;
   my $num;
   my $char;
   my $j;
   my $offset;
   my $char2;
   my $id;
   my $nongap;

   # MAKE THE ALIGNMENT FOR $gene:
   $num                    = "";
   $aln1                   = "";
   $offset                 = 0;
   for ($i = 1; $i <= length($CIGAR1); $i++)
   {
      $char                = substr($CIGAR1,$i-1,1);
      if ($char =~ /[0-9]/) { $num = $num.$char;}
      else
      {
         if ($char ne 'M' & $char ne 'D') { print STDERR "ERROR: char $char\n"; exit;}
         if ($char eq 'D')
         {
            for ($j = 1; $j <= $num; $j++) { $aln1 = $aln1."-";}
         }
         elsif ($char eq 'M')
         {
            for ($j = 1; $j <= $num; $j++)
            {
               $offset++;
               if ($offset > length($seq1))
               {
                  print STDERR "ERROR: offset $offset seq1 $seq1\n";
                  exit;
               }
               $char2      = substr($seq1,$offset-1,1);
               $aln1       = $aln1.$char2;
            }
         }
         $num              = "";
      }
   }

   # MAKE THE ALIGNMENT FOR $orth:
   $num                    = "";
   $aln2                   = "";
   $offset                 = 0;
   for ($i = 1; $i <= length($CIGAR2); $i++)
   {
      $char                = substr($CIGAR2,$i-1,1);
      if ($char =~ /[0-9]/) { $num = $num.$char;}
      else
      {
         if ($char ne 'M' & $char ne 'D') { print STDERR "ERROR: char $char\n"; exit;}
         if ($char eq 'D')
         {
            for ($j = 1; $j <= $num; $j++) { $aln2 = $aln2."-";}
         }
         elsif ($char eq 'M')
         {
            for ($j = 1; $j <= $num; $j++)
            {
               $offset++;
               if ($offset > length($seq2))
               {
                  print STDERR "ERROR: offset $offset seq2 $seq2\n";
                  exit;
               }
               $char2      = substr($seq2,$offset-1,1);
               $aln2       = $aln2.$char2;
            }
         }
         $num              = "";
      }
   }

   # FIND THE PERCENT IDENTITY BETWEEN THE SEQUENCES:
   if (length($aln1) != length($aln2))
   {
      print STDERR "ERROR: length of aln for gene is not same as length for orth\n";
      exit;
   }
   $nongap                 = 0;
   $id                     = 0;
   for ($i = 1; $i <= length($aln1); $i++)
   {
      $char1               = substr($aln1,$i-1,1);
      $char2               = substr($aln2,$i-1,1);
      if ($char1 ne '-' && $char2 ne '-')
      {
         $nongap++;
         if ($char1 eq $char2) { $id++;}
      }
   }
   if ($nongap >= 0)
   {
      $id                  = $id*100/($nongap);
   }
   else
   {
      $id                  = 0;
   }

   return($id);
}

#------------------------------------------------------------------#