avrilcoghlan/find_gene_pred_errors1.pl

## find_gene_pred_errors1.pl
#!/usr/local/bin/perl

#
# Perl script find_gene_pred_errors1.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 25-Feb-09.
#
# For the TreeFam project.
#
# This perl script finds cases where two adjacent genes in a species
# (eg. C. elegans) should probably be merged, as one of them has its
# best match to the first part of a TreeFam family alignment and the
# second has its best match to the second part of the same TreeFam
# family's alignment.
#
# The command-line format is:
# % perl <find_gene_pred_errors1.pl> species release
# where species is the species of interest,
#       release is the release of TreeFam to use.
#
#------------------------------------------------------------------#

# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

$num_args                  = $#ARGV + 1;
if ($num_args != 2)
{
   print "Usage of find_gene_pred_errors1.pl\n\n";
   print "perl find_gene_pred_errors1.pl <species> <release>\n";
   print "where <species> is the species of interest (eg. C. elegans),\n";
   print "      <release> is the release of TreeFam to use.\n";
   print "For example, >perl find_gene_pred_errors1.pl CAEEL 7\n";
   exit;
}

use DBI;

# FIND THE NAME OF THE SPECIES OF INTEREST:

$the_species               = $ARGV[0];

# FIND THE RELEASE OF TREEFAM TO USE:

$release                   = $ARGV[1];

#------------------------------------------------------------------#

$database                  = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
$dbh                       = DBI->connect("$database", 'anonymous', '') || return;

# FIND ALL THE IDs FOR GENES FROM SPECIES $the_species:
($GID,$IDS)                = &get_genes_from_species($dbh,$the_species);

# READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS AND FAMILIES:
($BEST)                    = &read_hmmer_matches_to_families($dbh,$GID);

# FIND ADJACENT GENES IN SPECIES $the_species, AND CHECK IF THEY HAVE
# THEIR BEST MATCH TO THE SAME TREEFAM FAMILY:
&find_adjacent_genes($dbh,$GID,$IDS,$BEST);

# NOW DISCONNECT FROM THE DATABASE:

$rc                        = $dbh->disconnect();
$rc                        = "";

#------------------------------------------------------------------#

print STDERR "FINISHED.\n";

#------------------------------------------------------------------#

# FIND ADJACENT GENES IN THE SPECIES $the_species AND CHECK IF THEY
# HAVE THEIR BEST MATCH TO THE SAME TREEFAM FAMILY:

sub find_adjacent_genes
{
   my $dbh                 = $_[0];
   my $GID                 = $_[1];
   my $IDS                 = $_[2];
   my $BEST                = $_[3];
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $ID;
   my $start;
   my $chr;
   my %GENES               = ();
   my $GID;
   my %SEEN                = ();
   %START                  = ();
   my $genes;
   my @genes;
   my $i;
   my $gene;
   my $prev_gene;
   my $gene_ids;
   my $prev_gene_ids;
   my @gene_ids;
   my @prev_gene_ids;
   my $prev_gene_id;
   my $gene_id;
   my $j;
   my $k;
   my $prev_gene_id_best;
   my $gene_id_best;

   foreach $ID (keys %{$GID})
   {
      $GID                 = $GID{$ID};
      $start               = "none";
      $chr                 = "none";
      $table_w             = 'map';
      $st                  = "SELECT T_START, TARGET from $table_w WHERE ID=\'$ID\'";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)
         {
            $start         = $array[0];
      $chr           = $array[1];
         }
      }
      if ($start eq 'none' || $chr eq 'none') { print STDERR "ERROR: ID $ID start $start chr $chr\n"; exit;}
      # RECORD THE LOCATION OF GENE $GID:
      if (!($SEEN{$chr."=".$GID}))
      {
	 $SEEN{$chr."=".$GID} = 1;
         if (!($GENES{$chr}))
         {
            $GENES{$chr}   = $GID;
         }
	 else
	 {
	    $GENES{$chr}   = $GENES{$chr}.",".$GID;
	 }
	 if (!($START{$GID}))
	 {
	    $START{$GID}   = $start;
	 }
	 else
	 {
	    if ($start < $START{$GID})
	    {
	       $START{$GID} = $start;
	    }
	 }
      }
   }

   # CHECK IF ADJACENT GENES HAVE BEST HMMER MATCHES TO THE SAME TREEFAM FAMILY:
   foreach $chr (keys %GENES)
   {
      $genes                = $GENES{$chr};
      @genes                = split(/\,/,$genes);
      @genes                = sort by_start @genes;
      for ($i = 1; $i <= $#genes; $i++)
      {
	 $gene              = $genes[$i];
	 $prev_gene         = $genes[$i-1];
	 # CHECK IF $gene AND $prev_gene HAVE TRANSCRIPTS THAT HAVE BEST HMMER MATCHES
	 # TO THE SAME TREEFAM FAMILY:
	 if (!($IDS{$gene})) { print STDERR "ERROR: do not know transcripts in $gene\n"; exit;}
	 if (!($IDS{$prev_gene})) { print STDERR "ERROR: do not know transcripts in $prev_gene\n"; exit;}
	 $gene_ids          = $IDS->{$gene};
	 $prev_gene_ids     = $IDS->{$prev_gene};
	 @gene_ids          = split(/\,/,$gene_ids);
	 @prev_gene_ids     = split(/\,/,$prev_gene_ids);
	 for ($j = 0; $j <= $#gene_ids; $j++)
	 {
	    $gene_id        = $gene_ids[$j];
	    if (!($BEST->{$gene_id})) { $gene_id_best = "none";           }
	    else                      { $gene_id_best = $BEST->{$gene_id};}
	    for ($k = 0; $k <= $#prev_gene_ids; $k++)
	    {
	       $prev_gene_id = $prev_gene_ids[$k];
	       if (!($BEST->{$prev_gene_id})) { $prev_gene_id_best = "none";                }
	       else                           { $prev_gene_id_best = $BEST->{$prev_gene_id};}
	       if ($gene_id_best eq $prev_gene_id_best && $gene_id_best ne 'none')
	       {
		  print "$gene_id ($gene) and $prev_gene_id ($prev_gene) both have best match to $gene_id_best\n";
	       }
	    }
	 }
      }
   }
}

#------------------------------------------------------------------#

# FIND ALL THE IDs FOR GENES FROM SPECIES $the_species:

sub get_genes_from_species
{
   my $dbh                 = $_[0];
   my $the_species         = $_[1];
   my %GID                 = ();
   my $taxid               = "none";
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $ID;
   my $GID;
   my %IDS                 = ();

   # FIRST GET THE NCBI TAXONOMY ID FOR SPECIES $the_species:
   $table_w                = 'species';
   $st                     = "SELECT TAX_ID from $table_w WHERE SWCODE=\'$the_species\'";
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute or die "Cannot execute the query: $sth->errstr";
   if ($rv >= 1)
   {
      while ((@array) = $sth->fetchrow_array)
      {
         $taxid            = $array[0];
      }
   }
   if ($taxid eq 'none') { print STDERR "ERROR: do not know taxid for $the_species\n"; exit;}

   # NOW GET THE IDs FOR ALL TRANSCRIPTS FROM SPECIES $the_species:
   $table_w                = 'genes';
   $st                     = "SELECT ID, GID from $table_w WHERE TAX_ID=\'$taxid\'";
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute or die "Cannot execute the query: $sth->errstr";
   if ($rv >= 1)
   {
      while ((@array) = $sth->fetchrow_array)
      {
         $ID               = $array[0];
	 $GID              = $array[1];
	 if ($GID{$ID}) { print STDERR "WARNING: already have GID $GID{$ID} for ID $ID (GID $GID)\n";}
	 $GID{$ID}         = $GID;
	 # RECORD THE IDs FOR TRANSCRIPTS IN THIS GENE:
	 if (!($IDS{$GID})) { $IDS{$GID} = $ID;}
	 else {$IDS{$GID} = $IDS{$GID}.",".$ID;}
      }
   }

   return(\%GID,\%IDS);
}

#------------------------------------------------------------------#

# READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS FROM SPECIES $the_species
# AND FAMILIES, AND FIND THE BEST MATCH FOR EACH TRANSCRIPT:

sub read_hmmer_matches_to_families
{
   my $dbh                 = $_[0];
   my $GID                 = $_[1];
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $ID;
   my %BEST                = ();
   my %BEST_EVALUE         = ();
   my %BEST_SCORE          = ();
   my $best;
   my $best_evalue;
   my $best_score;

   foreach $ID (keys %{$GID})
   {
      $table_w             = 'hmmer_matches';
      $st                  = "SELECT AC, SCORE, EVALUE from $table_w";
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)
         {
            $AC            = $array[0];
	    $SCORE         = $array[1];
	    $EVALUE        = $array[2];
            if (!($BEST{$ID}))
	    {
	       if ($EVALUE == 0) { $EVALUE = "zero";}
	       if ($SCORE  == 0) { $SCORE  = "zero";}
	       $BEST{$ID}  = $AC;
	       $BEST_EVALUE{$ID} = $EVALUE;
	       $BEST_SCORE{$ID} = $SCORE;
	    }
	    else
	    {
	       $best       = $BEST{$ID};
	       $best_evalue= $BEST_EVALUE{$ID};
	       $best_score = $BEST_SCORE{$ID};
	       if ($best_evalue eq 'zero') { $best_evalue = 0;}
	       if ($best_score  eq 'zero') { $best_score  = 0;}
	       if ($EVALUE < $best_evalue || ($EVALUE == $best_evalue && $SCORE > $best_score))
	       {
	          if ($EVALUE == 0) { $EVALUE = "zero";}
	          if ($SCORE  == 0) { $SCORE  = "zero";}
	          $BEST{$ID}  = $AC;
	          $BEST_EVALUE{$ID} = $EVALUE;
	          $BEST_SCORE{$ID} = $SCORE;
	       }
	    }
         }
      }
   }

   return(\%BEST);
}

#------------------------------------------------------------------#

# SUBROUTINE TO SORT GENES BY THEIR START-POINT:

sub by_start
{
   if (!($START{$a})) { print STDERR "ERROR: do not know start of gene a $a.\n"; exit;}
   if (!($START{$b})) { print STDERR "ERROR: do not know start of gene b $b.\n"; exit;}
   $START{$a} <=> $START{$b}
      or
   $a cmp $b;
}

#------------------------------------------------------------------
	#!/usr/local/bin/perl

	#
	# Perl script find_gene_pred_errors1.pl
	# Written by Avril Coghlan (alc@sanger.ac.uk)
	# 25-Feb-09.
	#
	# For the TreeFam project.
	#
	# This perl script finds cases where two adjacent genes in a species
	# (eg. C. elegans) should probably be merged, as one of them has its
	# best match to the first part of a TreeFam family alignment and the
	# second has its best match to the second part of the same TreeFam
	# family's alignment.
	#
	# The command-line format is:
	# % perl <find_gene_pred_errors1.pl> species release
	# where species is the species of interest,
	# release is the release of TreeFam to use.
	#
	#------------------------------------------------------------------#

	# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

	$num_args = $#ARGV + 1;
	if ($num_args != 2)
	{
	print "Usage of find_gene_pred_errors1.pl\n\n";
	print "perl find_gene_pred_errors1.pl <species> <release>\n";
	print "where <species> is the species of interest (eg. C. elegans),\n";
	print " <release> is the release of TreeFam to use.\n";
	print "For example, >perl find_gene_pred_errors1.pl CAEEL 7\n";
	exit;
	}

	use DBI;

	# FIND THE NAME OF THE SPECIES OF INTEREST:

	$the_species = $ARGV[0];

	# FIND THE RELEASE OF TREEFAM TO USE:

	$release = $ARGV[1];

	#------------------------------------------------------------------#

	$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
	$dbh = DBI->connect("$database", 'anonymous', '') \|\| return;

	# FIND ALL THE IDs FOR GENES FROM SPECIES $the_species:
	($GID,$IDS) = &get_genes_from_species($dbh,$the_species);

	# READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS AND FAMILIES:
	($BEST) = &read_hmmer_matches_to_families($dbh,$GID);

	# FIND ADJACENT GENES IN SPECIES $the_species, AND CHECK IF THEY HAVE
	# THEIR BEST MATCH TO THE SAME TREEFAM FAMILY:
	&find_adjacent_genes($dbh,$GID,$IDS,$BEST);

	# NOW DISCONNECT FROM THE DATABASE:

	$rc = $dbh->disconnect();
	$rc = "";

	#------------------------------------------------------------------#

	print STDERR "FINISHED.\n";

	#------------------------------------------------------------------#

	# FIND ADJACENT GENES IN THE SPECIES $the_species AND CHECK IF THEY
	# HAVE THEIR BEST MATCH TO THE SAME TREEFAM FAMILY:

	sub find_adjacent_genes
	{
	my $dbh = $_[0];
	my $GID = $_[1];
	my $IDS = $_[2];
	my $BEST = $_[3];
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my @array;
	my $ID;
	my $start;
	my $chr;
	my %GENES = ();
	my $GID;
	my %SEEN = ();
	%START = ();
	my $genes;
	my @genes;
	my $i;
	my $gene;
	my $prev_gene;
	my $gene_ids;
	my $prev_gene_ids;
	my @gene_ids;
	my @prev_gene_ids;
	my $prev_gene_id;
	my $gene_id;
	my $j;
	my $k;
	my $prev_gene_id_best;
	my $gene_id_best;

	foreach $ID (keys %{$GID})
	{
	$GID = $GID{$ID};
	$start = "none";
	$chr = "none";
	$table_w = 'map';
	$st = "SELECT T_START, TARGET from $table_w WHERE ID=\'$ID\'";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$start = $array[0];
	$chr = $array[1];
	}
	}
	if ($start eq 'none' \|\| $chr eq 'none') { print STDERR "ERROR: ID $ID start $start chr $chr\n"; exit;}
	# RECORD THE LOCATION OF GENE $GID:
	if (!($SEEN{$chr."=".$GID}))
	{
	$SEEN{$chr."=".$GID} = 1;
	if (!($GENES{$chr}))
	{
	$GENES{$chr} = $GID;
	}
	else
	{
	$GENES{$chr} = $GENES{$chr}.",".$GID;
	}
	if (!($START{$GID}))
	{
	$START{$GID} = $start;
	}
	else
	{
	if ($start < $START{$GID})
	{
	$START{$GID} = $start;
	}
	}
	}
	}

	# CHECK IF ADJACENT GENES HAVE BEST HMMER MATCHES TO THE SAME TREEFAM FAMILY:
	foreach $chr (keys %GENES)
	{
	$genes = $GENES{$chr};
	@genes = split(/\,/,$genes);
	@genes = sort by_start @genes;
	for ($i = 1; $i <= $#genes; $i++)
	{
	$gene = $genes[$i];
	$prev_gene = $genes[$i-1];
	# CHECK IF $gene AND $prev_gene HAVE TRANSCRIPTS THAT HAVE BEST HMMER MATCHES
	# TO THE SAME TREEFAM FAMILY:
	if (!($IDS{$gene})) { print STDERR "ERROR: do not know transcripts in $gene\n"; exit;}
	if (!($IDS{$prev_gene})) { print STDERR "ERROR: do not know transcripts in $prev_gene\n"; exit;}
	$gene_ids = $IDS->{$gene};
	$prev_gene_ids = $IDS->{$prev_gene};
	@gene_ids = split(/\,/,$gene_ids);
	@prev_gene_ids = split(/\,/,$prev_gene_ids);
	for ($j = 0; $j <= $#gene_ids; $j++)
	{
	$gene_id = $gene_ids[$j];
	if (!($BEST->{$gene_id})) { $gene_id_best = "none"; }
	else { $gene_id_best = $BEST->{$gene_id};}
	for ($k = 0; $k <= $#prev_gene_ids; $k++)
	{
	$prev_gene_id = $prev_gene_ids[$k];
	if (!($BEST->{$prev_gene_id})) { $prev_gene_id_best = "none"; }
	else { $prev_gene_id_best = $BEST->{$prev_gene_id};}
	if ($gene_id_best eq $prev_gene_id_best && $gene_id_best ne 'none')
	{
	print "$gene_id ($gene) and $prev_gene_id ($prev_gene) both have best match to $gene_id_best\n";
	}
	}
	}
	}
	}
	}

	#------------------------------------------------------------------#

	# FIND ALL THE IDs FOR GENES FROM SPECIES $the_species:

	sub get_genes_from_species
	{
	my $dbh = $_[0];
	my $the_species = $_[1];
	my %GID = ();
	my $taxid = "none";
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my @array;
	my $ID;
	my $GID;
	my %IDS = ();

	# FIRST GET THE NCBI TAXONOMY ID FOR SPECIES $the_species:
	$table_w = 'species';
	$st = "SELECT TAX_ID from $table_w WHERE SWCODE=\'$the_species\'";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$taxid = $array[0];
	}
	}
	if ($taxid eq 'none') { print STDERR "ERROR: do not know taxid for $the_species\n"; exit;}

	# NOW GET THE IDs FOR ALL TRANSCRIPTS FROM SPECIES $the_species:
	$table_w = 'genes';
	$st = "SELECT ID, GID from $table_w WHERE TAX_ID=\'$taxid\'";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$ID = $array[0];
	$GID = $array[1];
	if ($GID{$ID}) { print STDERR "WARNING: already have GID $GID{$ID} for ID $ID (GID $GID)\n";}
	$GID{$ID} = $GID;
	# RECORD THE IDs FOR TRANSCRIPTS IN THIS GENE:
	if (!($IDS{$GID})) { $IDS{$GID} = $ID;}
	else {$IDS{$GID} = $IDS{$GID}.",".$ID;}
	}
	}

	return(\%GID,\%IDS);
	}

	#------------------------------------------------------------------#

	# READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS FROM SPECIES $the_species
	# AND FAMILIES, AND FIND THE BEST MATCH FOR EACH TRANSCRIPT:

	sub read_hmmer_matches_to_families
	{
	my $dbh = $_[0];
	my $GID = $_[1];
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my @array;
	my $ID;
	my %BEST = ();
	my %BEST_EVALUE = ();
	my %BEST_SCORE = ();
	my $best;
	my $best_evalue;
	my $best_score;

	foreach $ID (keys %{$GID})
	{
	$table_w = 'hmmer_matches';
	$st = "SELECT AC, SCORE, EVALUE from $table_w";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$AC = $array[0];
	$SCORE = $array[1];
	$EVALUE = $array[2];
	if (!($BEST{$ID}))
	{
	if ($EVALUE == 0) { $EVALUE = "zero";}
	if ($SCORE == 0) { $SCORE = "zero";}
	$BEST{$ID} = $AC;
	$BEST_EVALUE{$ID} = $EVALUE;
	$BEST_SCORE{$ID} = $SCORE;
	}
	else
	{
	$best = $BEST{$ID};
	$best_evalue= $BEST_EVALUE{$ID};
	$best_score = $BEST_SCORE{$ID};
	if ($best_evalue eq 'zero') { $best_evalue = 0;}
	if ($best_score eq 'zero') { $best_score = 0;}
	if ($EVALUE < $best_evalue \|\| ($EVALUE == $best_evalue && $SCORE > $best_score))
	{
	if ($EVALUE == 0) { $EVALUE = "zero";}
	if ($SCORE == 0) { $SCORE = "zero";}
	$BEST{$ID} = $AC;
	$BEST_EVALUE{$ID} = $EVALUE;
	$BEST_SCORE{$ID} = $SCORE;
	}
	}
	}
	}
	}

	return(\%BEST);
	}

	#------------------------------------------------------------------#

	# SUBROUTINE TO SORT GENES BY THEIR START-POINT:

	sub by_start
	{
	if (!($START{$a})) { print STDERR "ERROR: do not know start of gene a $a.\n"; exit;}
	if (!($START{$b})) { print STDERR "ERROR: do not know start of gene b $b.\n"; exit;}
	$START{$a} <=> $START{$b}
	or
	$a cmp $b;
	}

	#------------------------------------------------------------------