avrilcoghlan/find_pc_id_to_treefam_ortholog.pl

## find_pc_id_to_treefam_ortholog.pl
#!/usr/local/bin/perl

#
# Perl script find_pc_id_to_treefam_ortholog.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 4-Dec-07.
#
# This perls scripts finds the %id to the TreeFam ortholog in C. briggsae, human, yeast
# each gene in the ngasp benchmark gene set.
#
# The command-line format is:
# % perl <find_pc_id_to_treefam_ortholog.pl> gff
# where gff is the input benchmark gff file.
#
#------------------------------------------------------------------#

# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

$num_args                  = $#ARGV + 1;
if ($num_args != 1)
{
   print "Usage of find_pc_id_to_treefam_ortholog.pl\n\n";
   print "perl find_pc_id_to_treefam_ortholog.pl <gff>\n";
   print "where <gff> is the input benchmark gff file.\n";
   print "For example, >perl find_pc_id_to_treefam_ortholog.pl phase1_confirmed_isoforms_A.gff.1\n";
   exit;
}

# READ IN MY PERL MODULES:

BEGIN {
    unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}

use Avril_modules;
use DBI;

# FIND THE NAME OF THE INPUT GFF FILE:

$gff                       = $ARGV[0];

$VERBOSE                   = 0;

#------------------------------------------------------------------#

# READ IN THE INPUT GFF FILE:

%TAKE                      = ();
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n";
while(<GFF>)
{
   $line                   = $_;
   chomp $line;
   @temp                   = split(/\t+/,$line);
   $exon                   = $temp[8];

   # FIND THE GENE NAME:
   # eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
   @temp                   = split(/\;/,$exon);
   $gene                   = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
   @temp                   = split(/\./,$gene);
   $gene                   = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
   @temp                   = split(/=/,$gene);
   $gene                   = $temp[1]; # eg. C49A1.4b
   $gene                   =~ tr/[a-z]/[A-Z]/;
   if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.
   $TAKE{$gene}            = 1;
}
close(GFF);

#------------------------------------------------------------------#

# FIND THE TREEFAM ID FOR ALL THE GENES FROM C. ELEGANS THAT I AM INTERESTED IN:

# HASH TABLES TO REMEMBER THE SPECIES OF GENES:
$dbh                       = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') || return;

# SPECIFY THE TABLE:
$table_w                   = 'genes';

# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX),
# ANOTHER COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165

$st                        = "SELECT IDX, TAX_ID, GID, ID from $table_w";
$sth                       = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv                        = $sth->execute or die "Cannot execute the query: $sth->errstr";
%IDX                       = ();
%ID                        = ();
%YEAST                     = ();
%CB                        = ();
%CR                        = ();
%HUMAN                     = ();
if ($rv >= 1)
{
   while ((@array) = $sth->fetchrow_array)    {
      $IDX                 = $array[0];  # eg., 1
      $TAXID               = $array[1];  # eg 7165
      $GID                 = $array[2];
      $ID                  = $array[3];

      # CHECK IF THIS IS A C. ELEGANS GENE THAT WE WANT TO TAKE:
      if ($TAKE{$GID})
      {
         if (!($IDX{$GID})) { $IDX{$GID} = $IDX;}
         else {$IDX{$GID} = $IDX{$GID}.",".$IDX;}
         if (!($ID{$GID})) { $ID{$GID} = $ID;}
         else {$ID{$GID} = $ID{$GID}.",".$ID;}
      }
      # RECORD IF IT IS A YEAST GENE:
      if ($TAXID eq '4932')
      {
         $YEAST{$IDX}      = $GID;
         if (!($ID{$GID})) { $ID{$GID} = $ID;}
         else {$ID{$GID} = $ID{$GID}.",".$ID;}
      }
      # RECORD IF IT IS A C. BRIGGSAE GENE:
      if ($TAXID eq '6238')
      {
         $CB{$IDX}         = $GID;
         if (!($ID{$GID})) { $ID{$GID} = $ID;}
         else {$ID{$GID} = $ID{$GID}.",".$ID;}
      }
      # RECORD IF IT IS A C. REMANEI GENE:
      if ($TAXID eq '31234')
      {
         $CR{$IDX}         = $GID;
         if (!($ID{$GID})) { $ID{$GID} = $ID;}
         else {$ID{$GID} = $ID{$GID}.",".$ID;}
      }
      # RECORD IF IT IS A HUMAN GENE:
      if ($TAXID eq '9606')
      {
         $HUMAN{$IDX}      = $GID;
         if (!($ID{$GID})) { $ID{$GID} = $ID;}
         else {$ID{$GID} = $ID{$GID}.",".$ID;}
      }
   }
}

# NOW FIND THE FAMILIES FOR THE C. ELEGANS GENES OF INTEREST:
%FAM                       = ();
%SEEN                      = ();
foreach $GID (keys %TAKE)
{
   $IDS                    = $ID{$GID};
   @IDS                    = split(/\,/,$IDS);
   for ($i = 0; $i <= $#IDS; $i++)
   {
      $ID                  = $IDS[$i];
      # GET THE FAMILY FOR THE ID $ID:
      $table_w             = 'fam_genes';
      $st                  = "SELECT ID,AC,FLAG,FAM_TYPE from $table_w WHERE ID=?";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute($ID) or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)
         {
            $AC            = $array[1];
            $FLAG          = $array[2];
            $FAM_TYPE      = $array[3];
            if ($FLAG eq 'FULL' & ($FAM_TYPE eq 'A' || $FAM_TYPE eq 'B'))
            {
               if (!($SEEN{$GID."_".$AC}))
               {
                  if (!($FAM{$GID})) { $FAM{$GID} = $AC;                      }
                  else { print STDERR "$GID is in $FAM{$GID} and $AC\n"; exit;}
               }
            }
         }
      }
   }
}

# NOW FIND ALL THE ORTHOLOGS FOR THE C. ELEGANS GENES OF INTEREST:
%YEAST_ORTH                = ();
%CB_ORTH                   = ();
%CR_ORTH                   = ();
%HUMAN_ORTH                = ();
foreach $GID (keys %IDX)
{
   $IDXS                   = $IDX{$GID};
   @IDXS                   = split(/\,/,$IDXS);
   for ($i = 0; $i <= $#IDXS; $i++)
   {
      $IDX                 = $IDXS[$i];
      # GET ALL THE ORTHOLGOS FOR $IDX:
      $table_w             = 'ortholog';
      $st                  = "SELECT idx1,idx2 from $table_w WHERE idx1=?";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute($IDX) or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)
         {
            $idx1          = $array[0];
            $idx2          = $array[1];
            if ($YEAST{$idx2})
            {
               if (!($YEAST_ORTH{$GID})) { $YEAST_ORTH{$GID} = $YEAST{$idx2};}
               else {$YEAST_ORTH{$GID} = $YEAST_ORTH{$GID}.",".$YEAST{$idx2};}
            }
            if ($CB{$idx2})
            {
               if (!($CB_ORTH{$GID})) { $CB_ORTH{$GID} = $CB{$idx2};}
               else {$CB_ORTH{$GID} = $CB_ORTH{$GID}.",".$CB{$idx2};}
            }
            if ($CR{$idx2})
            {
               if (!($CR_ORTH{$GID})) { $CR_ORTH{$GID} = $CR{$idx2};}
               else {$CR_ORTH{$GID} = $CR_ORTH{$GID}.",".$CR{$idx2};}
            }
            if ($HUMAN{$idx2})
            {
               if (!($HUMAN_ORTH{$GID})) { $HUMAN_ORTH{$GID} = $HUMAN{$idx2};}
               else {$HUMAN_ORTH{$GID} = $HUMAN_ORTH{$GID}.",".$HUMAN{$idx2};}
            }
         }
      }
      $st                  = "SELECT idx1,idx2 from $table_w WHERE idx2=?";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute($IDX) or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)
         {
            $idx1          = $array[0];
            $idx2          = $array[1];
            if ($YEAST{$idx1})
            {
               if (!($YEAST_ORTH{$GID})) { $YEAST_ORTH{$GID} = $YEAST{$idx1};}
               else {$YEAST_ORTH{$GID} = $YEAST_ORTH{$GID}.",".$YEAST{$idx1};}
            }
            if ($CB{$idx1})
            {
               if (!($CB_ORTH{$GID})) { $CB_ORTH{$GID} = $CB{$idx1};}
               else {$CB_ORTH{$GID} = $CB_ORTH{$GID}.",".$CB{$idx1};}
            }
            if ($CR{$idx1})
            {
               if (!($CR_ORTH{$GID})) { $CR_ORTH{$GID} = $CR{$idx1};}
               else {$CR_ORTH{$GID} = $CR_ORTH{$GID}.",".$CR{$idx1};}
            }
            if ($HUMAN{$idx1})
            {
               if (!($HUMAN_ORTH{$GID})) { $HUMAN_ORTH{$GID} = $HUMAN{$idx1};}
               else {$HUMAN_ORTH{$GID} = $HUMAN_ORTH{$GID}.",".$HUMAN{$idx1};}
            }

         }
      }
   }
}

#------------------------------------------------------------------#

# READ IN THE INPUT GFF FILE:

print "GENE CBORTH CRORTH HUMORTH YEASTORTH\n";
%SEEN                      = ();
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n";
while(<GFF>)
{
   $line                   = $_;
   chomp $line;
   @temp                   = split(/\t+/,$line);
   $exon                   = $temp[8];

   # FIND THE GENE NAME:
   # eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
   @temp                   = split(/\;/,$exon);
   $gene                   = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
   @temp                   = split(/\./,$gene);
   $gene                   = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
   @temp                   = split(/=/,$gene);
   $gene                   = $temp[1]; # eg. C49A1.4b
   $gene                   =~ tr/[a-z]/[A-Z]/;
   if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.

   # SEE IF THIS GENE HAS C. BRIGGSAE, C. REMANEI, HUMAN AND YEAST ORTHOLOGS:
   if ($CB_ORTH{$gene}) { $cb_orth = $CB_ORTH{$gene};}
   else                 { $cb_orth = "none";         }
   if ($CR_ORTH{$gene}) { $cr_orth = $CR_ORTH{$gene};}
   else                 { $cr_orth = "none";         }
   if ($HUMAN_ORTH{$gene}) { $human_orth = $HUMAN_ORTH{$gene};}
   else                    { $human_orth = "none";            }
   if ($YEAST_ORTH{$gene}) { $yeast_orth = $YEAST_ORTH{$gene};}
   else                    { $yeast_orth = "none";            }
   if ($VERBOSE == 1) { print "gene $gene cb_orth $cb_orth cr_orth $cr_orth human_orth $human_orth yeast_orth $yeast_orth\n";}
   if (!($SEEN{$gene}))
   {
      $SEEN{$gene}         = 1;
      # GET THE FAMILY THAT THIS GENE IS FOUND IN:
      if ($FAM{$gene})
      {
         $family           = $FAM{$gene};
         if ($VERBOSE == 1) { print "C. elegans gene $gene is in family $family\n";}
         # GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. BRIGGSAE ORTHOLOGS IN THIS FAMILY:
         if ($cb_orth ne 'none')
         {
            $cb_orth_id    = &get_pc_id($gene,$cb_orth,$family);
         }
         else { $cb_orth_id = 0;}
         # GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. REMANEI ORTHOLOGS IN THIS FAMILY:
         if ($cr_orth ne 'none')
         {
            $cr_orth_id    = &get_pc_id($gene,$cr_orth,$family);
         }
         else { $cr_orth_id = 0;}
         # GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND HUMAN ORTHOLOGS IN THIS FAMILY:
         if ($human_orth ne 'none')
         {
            $human_orth_id = &get_pc_id($gene,$human_orth,$family);
         }
         else { $human_orth_id = 0;}
         # GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND YEAST ORTHOLOGS IN THIS FAMILY:
         if ($yeast_orth ne 'none')
         {
            $yeast_orth_id = &get_pc_id($gene,$yeast_orth,$family);
         }
         else { $yeast_orth_id = 0;}
      }
      else # THE C. ELEGANS GENE IS NOT IN A FAMILY.
      {
         if ($cb_orth ne 'none' || $cr_orth ne 'none' || $human_orth ne 'none' || $yeast_orth ne 'none')
         {
            # xxx this is an error in treefam-5 mysql I think.
            print STDERR "WARNING: $gene has orthologs but is not in a family\n";
         }
         $cr_orth_id       = 0;
         $cb_orth_id       = 0;
         $human_orth_id    = 0;
         $yeast_orth_id    = 0;
      }
      print "$gene $cb_orth $cb_orth_id $cr_orth $cr_orth_id $human_orth $human_orth_id $yeast_orth $yeast_orth_id\n";
   }
}
close(GFF);

#------------------------------------------------------------------#

print STDERR "FINISHED.\n";

#------------------------------------------------------------------#

# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. BRIGGSAE ORTHOLOGS IN THIS FAMILY:

sub get_pc_id
{
   my $gene                = $_[0];
   my $orths               = $_[1];
   my $family              = $_[2];
   my $IDS;
   my @IDS;
   my $i;
   my $ID;
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $CIGAR1              = "none";
   my $CIGAR2              = "none";
   my @orths;
   my $orth;
   my $j;
   my $the_id1;
   my $the_id2;
   my $seq1;
   my $seq2;
   my $pc_id;
   my $max_pc_id           = 0;

   # GET THE ALIGNMENT FOR GENE $gene IN FAMILY $family:
   if (!($ID{$gene})) { print STDERR "ERROR: do not know ids for $gene\n"; exit;}
   $IDS                    = $ID{$gene};
   @IDS                    = split(/\,/,$IDS);
   if ($VERBOSE == 1) { print "C. elegans gene $gene has ids $IDS\n";}
   for ($i = 0; $i <= $#IDS; $i++)
   {
      $ID                  = $IDS[$i];
      # GET THE ALIGNMENT FOR THE ID $ID IN FAMILY $family:
      $table_w             = 'aa_full_align';
      $st                  = "SELECT ID,CIGAR from $table_w WHERE AC=?";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute($family) or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)
         {
            if ($array[0] eq $ID)
            {
               if ($CIGAR1 ne 'none') { print STDERR "ERROR: already have cigar for $ID $gene $family\n"; exit;}
               $CIGAR1     = $array[1];
               $the_id1    = $ID;
            }
         }
      }
   }
   if ($CIGAR1 eq 'none')
   {
      print STDERR "WARNING: did not find cigar for IDS $IDS gene $gene $family\n";
      return($max_pc_id); #xxx probelm with treefam5
      #xxx exit;
   }

   # GET THE SEQUENCE FOR $the_id1:
   $table_w                = 'aa_seq';
   $st                     = "SELECT SEQ from $table_w WHERE ID=?";
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute($the_id1) or die "Cannot execute the query: $sth->errstr";
   $seq1                   = "none";
   if ($rv >= 1)
   {
      while ((@array)= $sth->fetchrow_array)
      {
         $seq1             = $array[0];
      }
   }
   if ($seq1 eq 'none') { print STDERR "ERROR: seq1 $seq1 for $the_id1 $gene\n"; exit;}

   # GET THE ALIGNMENT FOR THE ORTHOLOGS $orths IN FAMILY $family:
   @orths                  = split(/\,/,$orths);
   for ($i = 0; $i <= $#orths; $i++)
   {
      $orth                = $orths[$i];
      if (!($ID{$orth})) { print STDERR "ERROR: do not know ids for $orth\n"; exit;}
      $IDS                 = $ID{$orth};
      @IDS                 = split(/\,/,$IDS);
      $CIGAR2              = "none";
      if ($VERBOSE == 1) { print "Ortholog $orth has ids $IDS\n";}
      for ($j = 0; $j <= $#IDS; $j++)
      {
         $ID               = $IDS[$j];
         if ($VERBOSE == 1) { print "Looking for cigar for $ID (orth $orth)\n";}
         # GET THE ALIGNMENT FOR THE ID $ID IN FAMILY $family:
         $table_w          = 'aa_full_align';
         $st               = "SELECT ID,CIGAR from $table_w WHERE AC=?";
         $sth              = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
         $rv               = $sth->execute($family) or die "Cannot execute the query: $sth->errstr";
         if ($rv >= 1)
         {
            while ((@array)= $sth->fetchrow_array)
            {
               if ($array[0] eq $ID)
               {
                  if ($CIGAR2 ne 'none') { print STDERR "ERROR: already have cigar for $ID $gene $family\n"; exit;}
                  $CIGAR2  = $array[1];
                  $the_id2 = $ID;
                  if ($VERBOSE == 1) { print "Got aln for $the_id2 family $family\n";}
               }
            }
         }
      }
      if ($CIGAR2 eq 'none')
      {
         print STDERR "WARNING: did not find cigar for IDS $IDS orth $orth $family (gene $gene)\n";
         #xxx exit; #xxx problem with treefam5
         return($max_pc_id);
      }

      # GET THE SEQUENCE FOR $the_id2:
      $table_w             = 'aa_seq';
      $st                  = "SELECT SEQ from $table_w WHERE ID=?";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute($the_id2) or die "Cannot execute the query: $sth->errstr";
      $seq2                = "none";
      if ($rv >= 1)
      {
         while ((@array)= $sth->fetchrow_array)
         {
            $seq2          = $array[0];
         }
      }
      if ($seq2 eq 'none') { print STDERR "ERROR: seq2 $seq2 for $the_id2 $orth\n"; exit;}

      # GET THE PERCENT IDENTITY BETWEEN $gene AND $orth:
      $pc_id               = &get_pc_id_for_pair($CIGAR1,$CIGAR2,$seq1,$seq2);
      if ($pc_id > $max_pc_id) { $max_pc_id = $pc_id;}
   }

   return($max_pc_id);
}

#------------------------------------------------------------------#

# GET THE PERCENT IDENTITY BETWEEN $gene AND $orth:

sub get_pc_id_for_pair
{
   my $CIGAR1              = $_[0];
   my $CIGAR2              = $_[1];
   my $seq1                = $_[2];
   my $seq2                = $_[3];
   my $aln1;
   my $aln2;
   my $i;
   my $num;
   my $char;
   my $j;
   my $offset;
   my $char2;
   my $id;
   my $nongap;

   # MAKE THE ALIGNMENT FOR $gene:
   $num                    = "";
   $aln1                   = "";
   $offset                 = 0;
   for ($i = 1; $i <= length($CIGAR1); $i++)
   {
      $char                = substr($CIGAR1,$i-1,1);
      if ($char =~ /[0-9]/) { $num = $num.$char;}
      else
      {
         if ($char ne 'M' & $char ne 'D') { print STDERR "ERROR: char $char\n"; exit;}
         if ($char eq 'D')
         {
            for ($j = 1; $j <= $num; $j++) { $aln1 = $aln1."-";}
         }
         elsif ($char eq 'M')
         {
            for ($j = 1; $j <= $num; $j++)
            {
               $offset++;
               if ($offset > length($seq1))
               {
                  print STDERR "ERROR: offset $offset seq1 $seq1\n";
                  exit;
               }
               $char2      = substr($seq1,$offset-1,1);
               $aln1       = $aln1.$char2;
            }
         }
         $num              = "";
      }
   }

   # MAKE THE ALIGNMENT FOR $orth:
   $num                    = "";
   $aln2                   = "";
   $offset                 = 0;
   for ($i = 1; $i <= length($CIGAR2); $i++)
   {
      $char                = substr($CIGAR2,$i-1,1);
      if ($char =~ /[0-9]/) { $num = $num.$char;}
      else
      {
         if ($char ne 'M' & $char ne 'D') { print STDERR "ERROR: char $char\n"; exit;}
         if ($char eq 'D')
         {
            for ($j = 1; $j <= $num; $j++) { $aln2 = $aln2."-";}
         }
         elsif ($char eq 'M')
         {
            for ($j = 1; $j <= $num; $j++)
            {
               $offset++;
               if ($offset > length($seq2))
               {
                  print STDERR "ERROR: offset $offset seq2 $seq2\n";
                  exit;
               }
               $char2      = substr($seq2,$offset-1,1);
               $aln2       = $aln2.$char2;
            }
         }
         $num              = "";
      }
   }

   # FIND THE PERCENT IDENTITY BETWEEN THE SEQUENCES:
   if (length($aln1) != length($aln2))
   {
      print STDERR "ERROR: length of aln for gene is not same as length for orth\n";
      exit;
   }
   $nongap                 = 0;
   $id                     = 0;
   for ($i = 1; $i <= length($aln1); $i++)
   {
      $char1               = substr($aln1,$i-1,1);
      $char2               = substr($aln2,$i-1,1);
      if ($char1 ne '-' && $char2 ne '-')
      {
         $nongap++;
         if ($char1 eq $char2) { $id++;}
      }
   }
   if ($nongap >= 0)
   {
      $id                  = $id*100/($nongap);
   }
   else
   {
      $id                  = 0;
   }

   return($id);
}

#------------------------------------------------------------------#
	#!/usr/local/bin/perl

	#
	# Perl script find_pc_id_to_treefam_ortholog.pl
	# Written by Avril Coghlan (alc@sanger.ac.uk)
	# 4-Dec-07.
	#
	# This perls scripts finds the %id to the TreeFam ortholog in C. briggsae, human, yeast
	# each gene in the ngasp benchmark gene set.
	#
	# The command-line format is:
	# % perl <find_pc_id_to_treefam_ortholog.pl> gff
	# where gff is the input benchmark gff file.
	#
	#------------------------------------------------------------------#

	# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

	$num_args = $#ARGV + 1;
	if ($num_args != 1)
	{
	print "Usage of find_pc_id_to_treefam_ortholog.pl\n\n";
	print "perl find_pc_id_to_treefam_ortholog.pl <gff>\n";
	print "where <gff> is the input benchmark gff file.\n";
	print "For example, >perl find_pc_id_to_treefam_ortholog.pl phase1_confirmed_isoforms_A.gff.1\n";
	exit;
	}

	# READ IN MY PERL MODULES:

	BEGIN {
	unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
	}

	use Avril_modules;
	use DBI;

	# FIND THE NAME OF THE INPUT GFF FILE:

	$gff = $ARGV[0];

	$VERBOSE = 0;

	#------------------------------------------------------------------#

	# READ IN THE INPUT GFF FILE:

	%TAKE = ();
	open(GFF,"$gff") \|\| die "ERROR: cannot open $gff.\n";
	while(<GFF>)
	{
	$line = $_;
	chomp $line;
	@temp = split(/\t+/,$line);
	$exon = $temp[8];

	# FIND THE GENE NAME:
	# eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
	@temp = split(/\;/,$exon);
	$gene = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
	@temp = split(/\./,$gene);
	$gene = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
	@temp = split(/=/,$gene);
	$gene = $temp[1]; # eg. C49A1.4b
	$gene =~ tr/[a-z]/[A-Z]/;
	if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.
	$TAKE{$gene} = 1;
	}
	close(GFF);

	#------------------------------------------------------------------#

	# FIND THE TREEFAM ID FOR ALL THE GENES FROM C. ELEGANS THAT I AM INTERESTED IN:

	# HASH TABLES TO REMEMBER THE SPECIES OF GENES:
	$dbh = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') \|\| return;

	# SPECIFY THE TABLE:
	$table_w = 'genes';

	# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX),
	# ANOTHER COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN
	# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165

	$st = "SELECT IDX, TAX_ID, GID, ID from $table_w";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	%IDX = ();
	%ID = ();
	%YEAST = ();
	%CB = ();
	%CR = ();
	%HUMAN = ();
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array) {
	$IDX = $array[0]; # eg., 1
	$TAXID = $array[1]; # eg 7165
	$GID = $array[2];
	$ID = $array[3];

	# CHECK IF THIS IS A C. ELEGANS GENE THAT WE WANT TO TAKE:
	if ($TAKE{$GID})
	{
	if (!($IDX{$GID})) { $IDX{$GID} = $IDX;}
	else {$IDX{$GID} = $IDX{$GID}.",".$IDX;}
	if (!($ID{$GID})) { $ID{$GID} = $ID;}
	else {$ID{$GID} = $ID{$GID}.",".$ID;}
	}
	# RECORD IF IT IS A YEAST GENE:
	if ($TAXID eq '4932')
	{
	$YEAST{$IDX} = $GID;
	if (!($ID{$GID})) { $ID{$GID} = $ID;}
	else {$ID{$GID} = $ID{$GID}.",".$ID;}
	}
	# RECORD IF IT IS A C. BRIGGSAE GENE:
	if ($TAXID eq '6238')
	{
	$CB{$IDX} = $GID;
	if (!($ID{$GID})) { $ID{$GID} = $ID;}
	else {$ID{$GID} = $ID{$GID}.",".$ID;}
	}
	# RECORD IF IT IS A C. REMANEI GENE:
	if ($TAXID eq '31234')
	{
	$CR{$IDX} = $GID;
	if (!($ID{$GID})) { $ID{$GID} = $ID;}
	else {$ID{$GID} = $ID{$GID}.",".$ID;}
	}
	# RECORD IF IT IS A HUMAN GENE:
	if ($TAXID eq '9606')
	{
	$HUMAN{$IDX} = $GID;
	if (!($ID{$GID})) { $ID{$GID} = $ID;}
	else {$ID{$GID} = $ID{$GID}.",".$ID;}
	}
	}
	}

	# NOW FIND THE FAMILIES FOR THE C. ELEGANS GENES OF INTEREST:
	%FAM = ();
	%SEEN = ();
	foreach $GID (keys %TAKE)
	{
	$IDS = $ID{$GID};
	@IDS = split(/\,/,$IDS);
	for ($i = 0; $i <= $#IDS; $i++)
	{
	$ID = $IDS[$i];
	# GET THE FAMILY FOR THE ID $ID:
	$table_w = 'fam_genes';
	$st = "SELECT ID,AC,FLAG,FAM_TYPE from $table_w WHERE ID=?";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute($ID) or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$AC = $array[1];
	$FLAG = $array[2];
	$FAM_TYPE = $array[3];
	if ($FLAG eq 'FULL' & ($FAM_TYPE eq 'A' \|\| $FAM_TYPE eq 'B'))
	{
	if (!($SEEN{$GID."_".$AC}))
	{
	if (!($FAM{$GID})) { $FAM{$GID} = $AC; }
	else { print STDERR "$GID is in $FAM{$GID} and $AC\n"; exit;}
	}
	}
	}
	}
	}
	}

	# NOW FIND ALL THE ORTHOLOGS FOR THE C. ELEGANS GENES OF INTEREST:
	%YEAST_ORTH = ();
	%CB_ORTH = ();
	%CR_ORTH = ();
	%HUMAN_ORTH = ();
	foreach $GID (keys %IDX)
	{
	$IDXS = $IDX{$GID};
	@IDXS = split(/\,/,$IDXS);
	for ($i = 0; $i <= $#IDXS; $i++)
	{
	$IDX = $IDXS[$i];
	# GET ALL THE ORTHOLGOS FOR $IDX:
	$table_w = 'ortholog';
	$st = "SELECT idx1,idx2 from $table_w WHERE idx1=?";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute($IDX) or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$idx1 = $array[0];
	$idx2 = $array[1];
	if ($YEAST{$idx2})
	{
	if (!($YEAST_ORTH{$GID})) { $YEAST_ORTH{$GID} = $YEAST{$idx2};}
	else {$YEAST_ORTH{$GID} = $YEAST_ORTH{$GID}.",".$YEAST{$idx2};}
	}
	if ($CB{$idx2})
	{
	if (!($CB_ORTH{$GID})) { $CB_ORTH{$GID} = $CB{$idx2};}
	else {$CB_ORTH{$GID} = $CB_ORTH{$GID}.",".$CB{$idx2};}
	}
	if ($CR{$idx2})
	{
	if (!($CR_ORTH{$GID})) { $CR_ORTH{$GID} = $CR{$idx2};}
	else {$CR_ORTH{$GID} = $CR_ORTH{$GID}.",".$CR{$idx2};}
	}
	if ($HUMAN{$idx2})
	{
	if (!($HUMAN_ORTH{$GID})) { $HUMAN_ORTH{$GID} = $HUMAN{$idx2};}
	else {$HUMAN_ORTH{$GID} = $HUMAN_ORTH{$GID}.",".$HUMAN{$idx2};}
	}
	}
	}
	$st = "SELECT idx1,idx2 from $table_w WHERE idx2=?";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute($IDX) or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$idx1 = $array[0];
	$idx2 = $array[1];
	if ($YEAST{$idx1})
	{
	if (!($YEAST_ORTH{$GID})) { $YEAST_ORTH{$GID} = $YEAST{$idx1};}
	else {$YEAST_ORTH{$GID} = $YEAST_ORTH{$GID}.",".$YEAST{$idx1};}
	}
	if ($CB{$idx1})
	{
	if (!($CB_ORTH{$GID})) { $CB_ORTH{$GID} = $CB{$idx1};}
	else {$CB_ORTH{$GID} = $CB_ORTH{$GID}.",".$CB{$idx1};}
	}
	if ($CR{$idx1})
	{
	if (!($CR_ORTH{$GID})) { $CR_ORTH{$GID} = $CR{$idx1};}
	else {$CR_ORTH{$GID} = $CR_ORTH{$GID}.",".$CR{$idx1};}
	}
	if ($HUMAN{$idx1})
	{
	if (!($HUMAN_ORTH{$GID})) { $HUMAN_ORTH{$GID} = $HUMAN{$idx1};}
	else {$HUMAN_ORTH{$GID} = $HUMAN_ORTH{$GID}.",".$HUMAN{$idx1};}
	}

	}
	}
	}
	}

	#------------------------------------------------------------------#

	# READ IN THE INPUT GFF FILE:

	print "GENE CBORTH CRORTH HUMORTH YEASTORTH\n";
	%SEEN = ();
	open(GFF,"$gff") \|\| die "ERROR: cannot open $gff.\n";
	while(<GFF>)
	{
	$line = $_;
	chomp $line;
	@temp = split(/\t+/,$line);
	$exon = $temp[8];

	# FIND THE GENE NAME:
	# eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
	@temp = split(/\;/,$exon);
	$gene = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
	@temp = split(/\./,$gene);
	$gene = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
	@temp = split(/=/,$gene);
	$gene = $temp[1]; # eg. C49A1.4b
	$gene =~ tr/[a-z]/[A-Z]/;
	if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.

	# SEE IF THIS GENE HAS C. BRIGGSAE, C. REMANEI, HUMAN AND YEAST ORTHOLOGS:
	if ($CB_ORTH{$gene}) { $cb_orth = $CB_ORTH{$gene};}
	else { $cb_orth = "none"; }
	if ($CR_ORTH{$gene}) { $cr_orth = $CR_ORTH{$gene};}
	else { $cr_orth = "none"; }
	if ($HUMAN_ORTH{$gene}) { $human_orth = $HUMAN_ORTH{$gene};}
	else { $human_orth = "none"; }
	if ($YEAST_ORTH{$gene}) { $yeast_orth = $YEAST_ORTH{$gene};}
	else { $yeast_orth = "none"; }
	if ($VERBOSE == 1) { print "gene $gene cb_orth $cb_orth cr_orth $cr_orth human_orth $human_orth yeast_orth $yeast_orth\n";}
	if (!($SEEN{$gene}))
	{
	$SEEN{$gene} = 1;
	# GET THE FAMILY THAT THIS GENE IS FOUND IN:
	if ($FAM{$gene})
	{
	$family = $FAM{$gene};
	if ($VERBOSE == 1) { print "C. elegans gene $gene is in family $family\n";}
	# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. BRIGGSAE ORTHOLOGS IN THIS FAMILY:
	if ($cb_orth ne 'none')
	{
	$cb_orth_id = &get_pc_id($gene,$cb_orth,$family);
	}
	else { $cb_orth_id = 0;}
	# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. REMANEI ORTHOLOGS IN THIS FAMILY:
	if ($cr_orth ne 'none')
	{
	$cr_orth_id = &get_pc_id($gene,$cr_orth,$family);
	}
	else { $cr_orth_id = 0;}
	# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND HUMAN ORTHOLOGS IN THIS FAMILY:
	if ($human_orth ne 'none')
	{
	$human_orth_id = &get_pc_id($gene,$human_orth,$family);
	}
	else { $human_orth_id = 0;}
	# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND YEAST ORTHOLOGS IN THIS FAMILY:
	if ($yeast_orth ne 'none')
	{
	$yeast_orth_id = &get_pc_id($gene,$yeast_orth,$family);
	}
	else { $yeast_orth_id = 0;}
	}
	else # THE C. ELEGANS GENE IS NOT IN A FAMILY.
	{
	if ($cb_orth ne 'none' \|\| $cr_orth ne 'none' \|\| $human_orth ne 'none' \|\| $yeast_orth ne 'none')
	{
	# xxx this is an error in treefam-5 mysql I think.
	print STDERR "WARNING: $gene has orthologs but is not in a family\n";
	}
	$cr_orth_id = 0;
	$cb_orth_id = 0;
	$human_orth_id = 0;
	$yeast_orth_id = 0;
	}
	print "$gene $cb_orth $cb_orth_id $cr_orth $cr_orth_id $human_orth $human_orth_id $yeast_orth $yeast_orth_id\n";
	}
	}
	close(GFF);

	#------------------------------------------------------------------#

	print STDERR "FINISHED.\n";

	#------------------------------------------------------------------#

	# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. BRIGGSAE ORTHOLOGS IN THIS FAMILY:

	sub get_pc_id
	{
	my $gene = $_[0];
	my $orths = $_[1];
	my $family = $_[2];
	my $IDS;
	my @IDS;
	my $i;
	my $ID;
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my @array;
	my $CIGAR1 = "none";
	my $CIGAR2 = "none";
	my @orths;
	my $orth;
	my $j;
	my $the_id1;
	my $the_id2;
	my $seq1;
	my $seq2;
	my $pc_id;
	my $max_pc_id = 0;

	# GET THE ALIGNMENT FOR GENE $gene IN FAMILY $family:
	if (!($ID{$gene})) { print STDERR "ERROR: do not know ids for $gene\n"; exit;}
	$IDS = $ID{$gene};
	@IDS = split(/\,/,$IDS);
	if ($VERBOSE == 1) { print "C. elegans gene $gene has ids $IDS\n";}
	for ($i = 0; $i <= $#IDS; $i++)
	{
	$ID = $IDS[$i];
	# GET THE ALIGNMENT FOR THE ID $ID IN FAMILY $family:
	$table_w = 'aa_full_align';
	$st = "SELECT ID,CIGAR from $table_w WHERE AC=?";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute($family) or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	if ($array[0] eq $ID)
	{
	if ($CIGAR1 ne 'none') { print STDERR "ERROR: already have cigar for $ID $gene $family\n"; exit;}
	$CIGAR1 = $array[1];
	$the_id1 = $ID;
	}
	}
	}
	}
	if ($CIGAR1 eq 'none')
	{
	print STDERR "WARNING: did not find cigar for IDS $IDS gene $gene $family\n";
	return($max_pc_id); #xxx probelm with treefam5
	#xxx exit;
	}

	# GET THE SEQUENCE FOR $the_id1:
	$table_w = 'aa_seq';
	$st = "SELECT SEQ from $table_w WHERE ID=?";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute($the_id1) or die "Cannot execute the query: $sth->errstr";
	$seq1 = "none";
	if ($rv >= 1)
	{
	while ((@array)= $sth->fetchrow_array)
	{
	$seq1 = $array[0];
	}
	}
	if ($seq1 eq 'none') { print STDERR "ERROR: seq1 $seq1 for $the_id1 $gene\n"; exit;}

	# GET THE ALIGNMENT FOR THE ORTHOLOGS $orths IN FAMILY $family:
	@orths = split(/\,/,$orths);
	for ($i = 0; $i <= $#orths; $i++)
	{
	$orth = $orths[$i];
	if (!($ID{$orth})) { print STDERR "ERROR: do not know ids for $orth\n"; exit;}
	$IDS = $ID{$orth};
	@IDS = split(/\,/,$IDS);
	$CIGAR2 = "none";
	if ($VERBOSE == 1) { print "Ortholog $orth has ids $IDS\n";}
	for ($j = 0; $j <= $#IDS; $j++)
	{
	$ID = $IDS[$j];
	if ($VERBOSE == 1) { print "Looking for cigar for $ID (orth $orth)\n";}
	# GET THE ALIGNMENT FOR THE ID $ID IN FAMILY $family:
	$table_w = 'aa_full_align';
	$st = "SELECT ID,CIGAR from $table_w WHERE AC=?";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute($family) or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array)= $sth->fetchrow_array)
	{
	if ($array[0] eq $ID)
	{
	if ($CIGAR2 ne 'none') { print STDERR "ERROR: already have cigar for $ID $gene $family\n"; exit;}
	$CIGAR2 = $array[1];
	$the_id2 = $ID;
	if ($VERBOSE == 1) { print "Got aln for $the_id2 family $family\n";}
	}
	}
	}
	}
	if ($CIGAR2 eq 'none')
	{
	print STDERR "WARNING: did not find cigar for IDS $IDS orth $orth $family (gene $gene)\n";
	#xxx exit; #xxx problem with treefam5
	return($max_pc_id);
	}

	# GET THE SEQUENCE FOR $the_id2:
	$table_w = 'aa_seq';
	$st = "SELECT SEQ from $table_w WHERE ID=?";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute($the_id2) or die "Cannot execute the query: $sth->errstr";
	$seq2 = "none";
	if ($rv >= 1)
	{
	while ((@array)= $sth->fetchrow_array)
	{
	$seq2 = $array[0];
	}
	}
	if ($seq2 eq 'none') { print STDERR "ERROR: seq2 $seq2 for $the_id2 $orth\n"; exit;}

	# GET THE PERCENT IDENTITY BETWEEN $gene AND $orth:
	$pc_id = &get_pc_id_for_pair($CIGAR1,$CIGAR2,$seq1,$seq2);
	if ($pc_id > $max_pc_id) { $max_pc_id = $pc_id;}
	}

	return($max_pc_id);
	}

	#------------------------------------------------------------------#

	# GET THE PERCENT IDENTITY BETWEEN $gene AND $orth:

	sub get_pc_id_for_pair
	{
	my $CIGAR1 = $_[0];
	my $CIGAR2 = $_[1];
	my $seq1 = $_[2];
	my $seq2 = $_[3];
	my $aln1;
	my $aln2;
	my $i;
	my $num;
	my $char;
	my $j;
	my $offset;
	my $char2;
	my $id;
	my $nongap;

	# MAKE THE ALIGNMENT FOR $gene:
	$num = "";
	$aln1 = "";
	$offset = 0;
	for ($i = 1; $i <= length($CIGAR1); $i++)
	{
	$char = substr($CIGAR1,$i-1,1);
	if ($char =~ /[0-9]/) { $num = $num.$char;}
	else
	{
	if ($char ne 'M' & $char ne 'D') { print STDERR "ERROR: char $char\n"; exit;}
	if ($char eq 'D')
	{
	for ($j = 1; $j <= $num; $j++) { $aln1 = $aln1."-";}
	}
	elsif ($char eq 'M')
	{
	for ($j = 1; $j <= $num; $j++)
	{
	$offset++;
	if ($offset > length($seq1))
	{
	print STDERR "ERROR: offset $offset seq1 $seq1\n";
	exit;
	}
	$char2 = substr($seq1,$offset-1,1);
	$aln1 = $aln1.$char2;
	}
	}
	$num = "";
	}
	}

	# MAKE THE ALIGNMENT FOR $orth:
	$num = "";
	$aln2 = "";
	$offset = 0;
	for ($i = 1; $i <= length($CIGAR2); $i++)
	{
	$char = substr($CIGAR2,$i-1,1);
	if ($char =~ /[0-9]/) { $num = $num.$char;}
	else
	{
	if ($char ne 'M' & $char ne 'D') { print STDERR "ERROR: char $char\n"; exit;}
	if ($char eq 'D')
	{
	for ($j = 1; $j <= $num; $j++) { $aln2 = $aln2."-";}
	}
	elsif ($char eq 'M')
	{
	for ($j = 1; $j <= $num; $j++)
	{
	$offset++;
	if ($offset > length($seq2))
	{
	print STDERR "ERROR: offset $offset seq2 $seq2\n";
	exit;
	}
	$char2 = substr($seq2,$offset-1,1);
	$aln2 = $aln2.$char2;
	}
	}
	$num = "";
	}
	}

	# FIND THE PERCENT IDENTITY BETWEEN THE SEQUENCES:
	if (length($aln1) != length($aln2))
	{
	print STDERR "ERROR: length of aln for gene is not same as length for orth\n";
	exit;
	}
	$nongap = 0;
	$id = 0;
	for ($i = 1; $i <= length($aln1); $i++)
	{
	$char1 = substr($aln1,$i-1,1);
	$char2 = substr($aln2,$i-1,1);
	if ($char1 ne '-' && $char2 ne '-')
	{
	$nongap++;
	if ($char1 eq $char2) { $id++;}
	}
	}
	if ($nongap >= 0)
	{
	$id = $id*100/($nongap);
	}
	else
	{
	$id = 0;
	}

	return($id);
	}

	#------------------------------------------------------------------#