avrilcoghlan/check_if_adjacent_genes_are_paralogs.pl

## check_if_adjacent_genes_are_paralogs.pl
#!/usr/local/bin/perl

#
# Perl script check_if_adjacent_genes_are_paralogs.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 10-Dec-07.
#
# This perls scripts checks whether adjacent genes are paralogs for
# each gene in the ngasp benchmark gene set.
#
# The command-line format is:
# % perl <check_if_adjacent_genes_are_paralogs.pl> gff
# where gff is the input benchmark gff file.
#
#------------------------------------------------------------------#

# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

$num_args                  = $#ARGV + 1;
if ($num_args != 1)
{
   print "Usage of check_if_adjacent_genes_are_paralogs.pl\n\n";
   print "perl check_if_adjacent_genes_are_paralogs.pl <gff>\n";
   print "where <gff> is the input benchmark gff file.\n";
   print "For example, >perl check_if_adjacent_genes_are_paralogs.pl phase1_confirmed_isoforms_C.gff.1\n";
   exit;
}

# READ IN MY PERL MODULES:

BEGIN {
    unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}

use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/";
use Avril_modules;
use Treefam::DBConnection;
use DBI;

# FIND THE NAME OF THE INPUT GFF FILE:

$gff                       = $ARGV[0];

#------------------------------------------------------------------#

# FIND ALL PARALOGS OF C. ELEGANS GENES:


# GET A LIST OF ALL FAMILIES IN TREEFAM, INCLUDING TF1 (TREEFAM-A), TF3
# (TREEFAM-B) AND TF5 (TREEFAM-C) FAMILIES:

print STDERR "Finding TreeFam-A families... \n";
$dbh                       = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') || return;
# GET ALL THE TREEFAM-A FAMILIES:
$table_w                   = "familyA";
$st                        = "SELECT AC from $table_w";
$sth                       = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv                        = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
   while ((@array) = $sth->fetchrow_array)
   {
      $AC                  = $array[0];
      @families            = (@families,$AC);
      if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n"; }
   }
}

print STDERR "Finding TreeFam-B and TreeFam-C families... \n";
# GET ALL THE TREEFAM-B AND TREEFAM-C FAMILIES:
$table_w                   = "familyB";
$st                        = "SELECT AC from $table_w";
$sth                       = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv                        = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
   while ((@array) = $sth->fetchrow_array)
   {
      $AC                  = $array[0];
      if ($AC ne 'TF343806') # SOMETHING IS WRONG WITH THIS FAMILY IN TREEFAM-5. xxx
      {
         @families         = (@families,$AC);
      }
      if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n"; }
   }
}

#------------------------------------------------------------------#

$dbc                       = Treefam::DBConnection->new();
%PARALOG                   = ();

# FIND PARALOGS WITHIN EACH FAMILY:
foreach my $treefam_family(@families)
{
   print STDERR "Looking at family $treefam_family... \n";
   $famh                   = $dbc->get_FamilyHandle();
   if (!(defined($famh->get_by_id($treefam_family))))
   {
      #xxx print "xxx there is no family $treefam_family...\n";
      goto NEXT_FAMILY;
   }
   $family                 = $famh->get_by_id($treefam_family);
   if (!(defined($family->ID())))
   {
      #xxx print "xxx do not have ID stored for family $treefam_family...\n";
      goto NEXT_FAMILY;
   }
   $AC                     = $family->ID();         # GET THE FAMILY ID.
   if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;}
   $no_families++;
   print "\n$no_families: reading family $AC...\n";
   # GET THE TREE FOR THE FAMILY:
   if (!(defined($family->get_tree('clean'))))
   {
      #xxx print "xxx AC $AC: there is no tree!\n";
      goto NEXT_FAMILY;
   }
   $tree                   = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE.
   # GET ALL THE TRANSCRIPTS IN THIS FAMILY FROM C. ELEGANS:
   @species_nodes          = &Avril_modules::empty_array(@species_nodes);
   @species_genes          = &Avril_modules::empty_array(@species_genes);
   @nodes                  = $tree->get_leaves;
   foreach $node(@nodes)
   {
      # FIND THE SPECIES FOR THIS TRANSCRIPT:
      if (defined($node->get_tag_value('S')))
      {
         $species          = $node->get_tag_value('S');
      }
      else
      {
         #xxx print "xxx AC $AC: do not know species of node\n";
         $species          = "none";
      }
      $species             =~ tr/[a-z]/[A-Z]/;
      if ($species eq 'CAEEL') # IF IT IS A C. ELEGANS GENE.
      {
         @species_nodes    = (@species_nodes,$node);
         # FIND THE GENE FOR THIS TRANSCRIPT:
         $gh               = $node->gene;
         $gene             = $gh->ID(); # eg. ENSMUSG00000022770
         @species_genes    = (@species_genes,$gene);
      }
   }

   # FOR EACH PAIR OF C. ELEGANS GENES IN THE TREE, SEE
   # WHEN THE DUPLICATION OCCURRED:
   for ($i = 0; $i <= $#species_nodes; $i++)
   {
      $species_gene_i      = $species_genes[$i];
      for ($j = $i+1; $j <= $#species_nodes; $j++)
      {
         $species_gene_j   = $species_genes[$j];
         $pair1            = $species_gene_i."_".$species_gene_j;
         $pair2            = $species_gene_j."_".$species_gene_i;
         $PARALOG{$pair1}  = 1;
         $PARALOG{$pair2}  = 1;
      }
   }
   NEXT_FAMILY:
}

#------------------------------------------------------------------#

# READ IN THE INPUT GFF FILE:

print "GENE PREVGENE NEXTGENE\n";
%GENES                     = ();
%SEEN                      = ();
%START                     = ();
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n";
while(<GFF>)
{
   $line                   = $_;
   chomp $line;
   @temp                   = split(/\t+/,$line);
   $chr                    = $temp[0];
   $start                  = $temp[3];
   $exon                   = $temp[8];
   # FIND THE GENE NAME:
   # eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
   @temp                   = split(/\;/,$exon);
   $gene                   = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
   @temp                   = split(/\./,$gene);
   $gene                   = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
   @temp                   = split(/=/,$gene);
   $gene                   = $temp[1]; # eg. C49A1.4b
   $gene                   =~ tr/[a-z]/[A-Z]/;
   if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.
   # RECORD THE GENE START:
   if (!($START{$gene}))              { $START{$gene} = $start;}
   else { if ($start < $START{$gene}) { $START{$gene} = $start;}}
   # RECORD THE GENES ON THIS CHROMOSOME:
   if (!($SEEN{$gene}))
   {
      $SEEN{$gene}         = 1;
      if (!($GENES{$chr})) { $GENES{$chr} = $gene;}
      else {$GENES{$chr} = $GENES{$chr}.",".$gene;}
   }
}
close(GFF);

# LOOK AT THE GENES ON EACH CHROMOSOME AT A TIME:
foreach $chr (keys %GENES)
{
   $genes                  = $GENES{$chr};
   @genes                  = split(/\,/,$genes);
   @genes                  = sort by_start @genes;
   $prev_gene              = "none";
   # FOR EACH GENE, FIND THE STRAND OF THE ADJACENT GENES:
   for ($i = 0; $i <= $#genes; $i++)
   {
      $gene                = $genes[$i];
      # CHECK IF THE PREVIOUS GENE IS A PARALOG OF THIS GENE:
      $pair1               = $gene."_".$prev_gene;
      $pair2               = $prev_gene."_".$gene;
      $prev_paralog        = 0;
      if ($PARALOG{$pair1}) { $prev_paralog = 1;}
      if ($PARALOG{$pair2}) { $prev_paralog = 1;}
      # CHECK IF THE NEXT GENE IS A PARALOG OF THIS GENE:
      $next_paralog        = 0;
      if ($i != $#genes)
      {
         $next_gene        = $genes[$i+1];
         $pair1            = $gene."_".$next_gene;
         $pair2            = $next_gene."_".$gene;
         if ($PARALOG{$pair1}) { $next_paralog = 1;}
         if ($PARALOG{$pair2}) { $next_paralog = 1;}
      }

      print "$gene $prev_paralog $next_paralog\n";
      $prev_gene           = $gene;
   }
}

#------------------------------------------------------------------#

print STDERR "FINISHED.\n";

#------------------------------------------------------------------#

# SUBROUTINE TO SORT EXONS BY START-POINT:

sub by_start
{
   if (!($START{$a})) { print STDERR "ERROR: do not know start of $a.\n"; exit;}
   if (!($START{$b})) { print STDERR "ERROR: do not know start of $b.\n"; exit;}
   $START{$a} <=> $START{$b}
      or
   $a cmp $b;
}

#------------------------------------------------------------------#
	#!/usr/local/bin/perl

	#
	# Perl script check_if_adjacent_genes_are_paralogs.pl
	# Written by Avril Coghlan (alc@sanger.ac.uk)
	# 10-Dec-07.
	#
	# This perls scripts checks whether adjacent genes are paralogs for
	# each gene in the ngasp benchmark gene set.
	#
	# The command-line format is:
	# % perl <check_if_adjacent_genes_are_paralogs.pl> gff
	# where gff is the input benchmark gff file.
	#
	#------------------------------------------------------------------#

	# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

	$num_args = $#ARGV + 1;
	if ($num_args != 1)
	{
	print "Usage of check_if_adjacent_genes_are_paralogs.pl\n\n";
	print "perl check_if_adjacent_genes_are_paralogs.pl <gff>\n";
	print "where <gff> is the input benchmark gff file.\n";
	print "For example, >perl check_if_adjacent_genes_are_paralogs.pl phase1_confirmed_isoforms_C.gff.1\n";
	exit;
	}

	# READ IN MY PERL MODULES:

	BEGIN {
	unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
	}

	use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/";
	use Avril_modules;
	use Treefam::DBConnection;
	use DBI;

	# FIND THE NAME OF THE INPUT GFF FILE:

	$gff = $ARGV[0];

	#------------------------------------------------------------------#

	# FIND ALL PARALOGS OF C. ELEGANS GENES:



	# GET A LIST OF ALL FAMILIES IN TREEFAM, INCLUDING TF1 (TREEFAM-A), TF3
	# (TREEFAM-B) AND TF5 (TREEFAM-C) FAMILIES:

	print STDERR "Finding TreeFam-A families... \n";
	$dbh = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') \|\| return;
	# GET ALL THE TREEFAM-A FAMILIES:
	$table_w = "familyA";
	$st = "SELECT AC from $table_w";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$AC = $array[0];
	@families = (@families,$AC);
	if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n"; }
	}
	}

	print STDERR "Finding TreeFam-B and TreeFam-C families... \n";
	# GET ALL THE TREEFAM-B AND TREEFAM-C FAMILIES:
	$table_w = "familyB";
	$st = "SELECT AC from $table_w";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$AC = $array[0];
	if ($AC ne 'TF343806') # SOMETHING IS WRONG WITH THIS FAMILY IN TREEFAM-5. xxx
	{
	@families = (@families,$AC);
	}
	if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n"; }
	}
	}

	#------------------------------------------------------------------#

	$dbc = Treefam::DBConnection->new();
	%PARALOG = ();

	# FIND PARALOGS WITHIN EACH FAMILY:
	foreach my $treefam_family(@families)
	{
	print STDERR "Looking at family $treefam_family... \n";
	$famh = $dbc->get_FamilyHandle();
	if (!(defined($famh->get_by_id($treefam_family))))
	{
	#xxx print "xxx there is no family $treefam_family...\n";
	goto NEXT_FAMILY;
	}
	$family = $famh->get_by_id($treefam_family);
	if (!(defined($family->ID())))
	{
	#xxx print "xxx do not have ID stored for family $treefam_family...\n";
	goto NEXT_FAMILY;
	}
	$AC = $family->ID(); # GET THE FAMILY ID.
	if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;}
	$no_families++;
	print "\n$no_families: reading family $AC...\n";
	# GET THE TREE FOR THE FAMILY:
	if (!(defined($family->get_tree('clean'))))
	{
	#xxx print "xxx AC $AC: there is no tree!\n";
	goto NEXT_FAMILY;
	}
	$tree = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE.
	# GET ALL THE TRANSCRIPTS IN THIS FAMILY FROM C. ELEGANS:
	@species_nodes = &Avril_modules::empty_array(@species_nodes);
	@species_genes = &Avril_modules::empty_array(@species_genes);
	@nodes = $tree->get_leaves;
	foreach $node(@nodes)
	{
	# FIND THE SPECIES FOR THIS TRANSCRIPT:
	if (defined($node->get_tag_value('S')))
	{
	$species = $node->get_tag_value('S');
	}
	else
	{
	#xxx print "xxx AC $AC: do not know species of node\n";
	$species = "none";
	}
	$species =~ tr/[a-z]/[A-Z]/;
	if ($species eq 'CAEEL') # IF IT IS A C. ELEGANS GENE.
	{
	@species_nodes = (@species_nodes,$node);
	# FIND THE GENE FOR THIS TRANSCRIPT:
	$gh = $node->gene;
	$gene = $gh->ID(); # eg. ENSMUSG00000022770
	@species_genes = (@species_genes,$gene);
	}
	}

	# FOR EACH PAIR OF C. ELEGANS GENES IN THE TREE, SEE
	# WHEN THE DUPLICATION OCCURRED:
	for ($i = 0; $i <= $#species_nodes; $i++)
	{
	$species_gene_i = $species_genes[$i];
	for ($j = $i+1; $j <= $#species_nodes; $j++)
	{
	$species_gene_j = $species_genes[$j];
	$pair1 = $species_gene_i."_".$species_gene_j;
	$pair2 = $species_gene_j."_".$species_gene_i;
	$PARALOG{$pair1} = 1;
	$PARALOG{$pair2} = 1;
	}
	}
	NEXT_FAMILY:
	}

	#------------------------------------------------------------------#

	# READ IN THE INPUT GFF FILE:

	print "GENE PREVGENE NEXTGENE\n";
	%GENES = ();
	%SEEN = ();
	%START = ();
	open(GFF,"$gff") \|\| die "ERROR: cannot open $gff.\n";
	while(<GFF>)
	{
	$line = $_;
	chomp $line;
	@temp = split(/\t+/,$line);
	$chr = $temp[0];
	$start = $temp[3];
	$exon = $temp[8];
	# FIND THE GENE NAME:
	# eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
	@temp = split(/\;/,$exon);
	$gene = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
	@temp = split(/\./,$gene);
	$gene = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
	@temp = split(/=/,$gene);
	$gene = $temp[1]; # eg. C49A1.4b
	$gene =~ tr/[a-z]/[A-Z]/;
	if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.
	# RECORD THE GENE START:
	if (!($START{$gene})) { $START{$gene} = $start;}
	else { if ($start < $START{$gene}) { $START{$gene} = $start;}}
	# RECORD THE GENES ON THIS CHROMOSOME:
	if (!($SEEN{$gene}))
	{
	$SEEN{$gene} = 1;
	if (!($GENES{$chr})) { $GENES{$chr} = $gene;}
	else {$GENES{$chr} = $GENES{$chr}.",".$gene;}
	}
	}
	close(GFF);

	# LOOK AT THE GENES ON EACH CHROMOSOME AT A TIME:
	foreach $chr (keys %GENES)
	{
	$genes = $GENES{$chr};
	@genes = split(/\,/,$genes);
	@genes = sort by_start @genes;
	$prev_gene = "none";
	# FOR EACH GENE, FIND THE STRAND OF THE ADJACENT GENES:
	for ($i = 0; $i <= $#genes; $i++)
	{
	$gene = $genes[$i];
	# CHECK IF THE PREVIOUS GENE IS A PARALOG OF THIS GENE:
	$pair1 = $gene."_".$prev_gene;
	$pair2 = $prev_gene."_".$gene;
	$prev_paralog = 0;
	if ($PARALOG{$pair1}) { $prev_paralog = 1;}
	if ($PARALOG{$pair2}) { $prev_paralog = 1;}
	# CHECK IF THE NEXT GENE IS A PARALOG OF THIS GENE:
	$next_paralog = 0;
	if ($i != $#genes)
	{
	$next_gene = $genes[$i+1];
	$pair1 = $gene."_".$next_gene;
	$pair2 = $next_gene."_".$gene;
	if ($PARALOG{$pair1}) { $next_paralog = 1;}
	if ($PARALOG{$pair2}) { $next_paralog = 1;}
	}

	print "$gene $prev_paralog $next_paralog\n";
	$prev_gene = $gene;
	}
	}

	#------------------------------------------------------------------#

	print STDERR "FINISHED.\n";

	#------------------------------------------------------------------#

	# SUBROUTINE TO SORT EXONS BY START-POINT:

	sub by_start
	{
	if (!($START{$a})) { print STDERR "ERROR: do not know start of $a.\n"; exit;}
	if (!($START{$b})) { print STDERR "ERROR: do not know start of $b.\n"; exit;}
	$START{$a} <=> $START{$b}
	or
	$a cmp $b;
	}

	#------------------------------------------------------------------#