avrilcoghlan/get_chroms_from_treefam.pl

## get_chroms_from_treefam.pl
#!/usr/local/bin/perl

#
# Perl script get_chroms_from_treefam.pl
# Written by Avril Coghlan (a.coghlan@ucc.ie)
# 9-Sep-09.
#
# THis perl script reads in a list of 2Ce-to-1Cb orthologs and
# finds cases where the 2 Ce genes are on different chromosomes,
# with one on an autosome and one X-linked.
#
# The command-line format is:
# % perl <get_chroms_from_treefam.pl> <input> <release> <orthostrap_cutoff>
#   where <input> is the input file of 2Ce-to-1Cb orthologs,
#         <release> is the release of TreeFam to use,
#         <orthostrap_cutoff> is the orthostrap cutoff.
#
#------------------------------------------------------------------#

# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

$num_args                  = $#ARGV + 1;
if ($num_args != 4)
{
   print "Usage of get_chroms_from_treefam.pl\n\n";
   print "perl get_chroms_from_treefam.pl <input> <release> <species> <orthostrap_cutoff>\n";
   print "where <input> is the input file of 2Ce-to-1Cb orthologs,\n";
   print "      <release> is the release of TreeFam to use,\n";
   print "      <species> is the species containing the paralogs (eg. CAEEL),\n";
   print "      <orthostrap_cutoff> is the orthostrap cutoff.\n";
   print "For example, >perl get_chroms_from_treefam.pl CeCb_2to1_reliable_orths_9Sept09 7 CAEEL 70\n";
   exit;
}

# READ IN MY PERL MODULES:
use Avril_modules;
use Treefam::DBConnection;
use DBI;

# FIND THE NAME OF THE INPUT FILE, AND RELEASE OF TREEFAM TO USE:
$input                     = $ARGV[0];
$release                   = $ARGV[1];
$the_species               = $ARGV[2];
$orthostrap_cutoff         = $ARGV[3];

#------------------------------------------------------------------#

# GET A LIST OF ALL FULLY SEQUENCED TREEFAM SPECIES:
$database                  = 'treefam_'.$release;
%SEQUENCED                 = ();
%TAXID                     = ();
$num                       = 0;
$dbh                       = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
$table_w                   = 'species';
$st                        = "SELECT SWCODE, TAX_ID from $table_w WHERE FLAG=\"1\"";
$sth                       = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv                        = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
   while ((@array) = $sth->fetchrow_array)
   {
      $num++;
      $swcode              = $array[0]; # eg. CAEEL
      $taxid               = $array[1];
      $SEQUENCED{$swcode}  = 1;
      $TAXID{$swcode}      = $taxid;
   }
}
$rc                        = $dbh->disconnect();
$rc                        = "";
if (!($TAXID{$the_species})) { print STDERR "ERROR: do not know taxid for $the_species\n"; exit;}
print STDERR "Read in fully sequenced species...\n";

#------------------------------------------------------------------#

# FIND THE ID OF EACH TREEFAM GENE FROM SPECIES $the_species:
print STDERR "Now reading in all genes from $the_species...\n";

if (!($TAXID{$the_species})) { print STDERR "ERROR: do not know taxid for $the_species\n"; exit;}
$taxid1                    = $TAXID{$the_species};
print STDERR "taxid1 $taxid1 for $the_species\n";

# HASH TABLES TO REMEMBER THE SPECIES OF GENES:
$dbh                       = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;

# SPECIFY THE TABLE:
$table_w                   = 'genes';

# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX),
# ANOTHER COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165

$st                        = "SELECT GID, TAX_ID, ID from $table_w WHERE TAX_ID=$taxid1";
$sth                       = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv                        = $sth->execute or die "Cannot execute the query: $sth->errstr";
%ID                        = ();

if ($rv >= 1)
{
   while ((@array) = $sth->fetchrow_array)    {
      $GID                 = $array[0];  # eg. T06E6.2a
      $TAXID               = $array[1];  # eg 7165
      $ID                  = $array[2];
      if ($TAXID{$the_species} eq $TAXID)
      {
         $ID{$GID}         = $ID;
      }
   }
}
$rc                        = $dbh->disconnect();
$rc                        = "";
print STDERR "Read in all genes from $the_species\n";

#------------------------------------------------------------------#

# READ IN ALL THE CHROMOSOMAL POSITIONS OF GENES FROM $the_species:
$dbh                   = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
open(INPUT,"$input") || die "ERROR: cannot open $input.\n";
while(<INPUT>)
{
   $line                  = $_;
   chomp $line;
   @temp                  = split(/\s+/,$line);
   $ce1                   = $temp[1];
   $ce2                   = $temp[6];
   $orthostrap1           = $temp[3];
   $orthostrap2           = $temp[8];
   @temp                  = split(/=/,$orthostrap1);
   $orthostrap1           = $temp[3];
   @temp                  = split(/=/,$orthostrap2);
   $orthostrap2           = $temp[3];
   if ($orthostrap1 >= $orthostrap_cutoff && $orthostrap2 >= $orthostrap_cutoff)
   {
      if (!($ID{$ce1})) { print STDERR "ERROR: do not know id for $ce1- $ID{$ce1}\n"; exit;}
      $ce1_id             = $ID{$ce1};
      if (!($ID{$ce1})) { print STDERR "ERROR: do not know id for $ce2-\n"; exit;}
      $ce2_id             = $ID{$ce2};
      $ce1_chrom          = "unknown";
      $ce2_chrom          = "unknown";

      # FIND THE CHROMOSOMAL POSITIONS OF $ce1 AND $ce2:
      $table_w            = 'map';
      $st                 = "SELECT ID, TARGET from $table_w WHERE (ID=\'$ce1_id\' OR ID=\'$ce2_id\')";
      $sth                = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                 = $sth->execute or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)
         {
            $the_id       = $array[0];
            $the_chrom    = $array[1];
            if    ($the_id eq $ce1_id) { $ce1_chrom = $the_chrom;}
            elsif ($the_id eq $ce2_id) { $ce2_chrom = $the_chrom;}
         }
      }
      if (($ce1_chrom eq 'chromosomeX' && $ce2_chrom ne 'chromosomeX') ||
          ($ce2_chrom eq 'chromosomeX' && $ce1_chrom ne 'chromosomeX'))
      {
         print "$ce1 $ce1_chrom $ce2 $ce2_chrom - $line\n";
      }
   }
}
close(INPUT);

$rc                        = $dbh->disconnect();
$rc                        = "";

#------------------------------------------------------------------#

print STDERR "FINISHED.\n";

#------------------------------------------------------------------#
	#!/usr/local/bin/perl

	#
	# Perl script get_chroms_from_treefam.pl
	# Written by Avril Coghlan (a.coghlan@ucc.ie)
	# 9-Sep-09.
	#
	# THis perl script reads in a list of 2Ce-to-1Cb orthologs and
	# finds cases where the 2 Ce genes are on different chromosomes,
	# with one on an autosome and one X-linked.
	#
	# The command-line format is:
	# % perl <get_chroms_from_treefam.pl> <input> <release> <orthostrap_cutoff>
	# where <input> is the input file of 2Ce-to-1Cb orthologs,
	# <release> is the release of TreeFam to use,
	# <orthostrap_cutoff> is the orthostrap cutoff.
	#
	#------------------------------------------------------------------#

	# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

	$num_args = $#ARGV + 1;
	if ($num_args != 4)
	{
	print "Usage of get_chroms_from_treefam.pl\n\n";
	print "perl get_chroms_from_treefam.pl <input> <release> <species> <orthostrap_cutoff>\n";
	print "where <input> is the input file of 2Ce-to-1Cb orthologs,\n";
	print " <release> is the release of TreeFam to use,\n";
	print " <species> is the species containing the paralogs (eg. CAEEL),\n";
	print " <orthostrap_cutoff> is the orthostrap cutoff.\n";
	print "For example, >perl get_chroms_from_treefam.pl CeCb_2to1_reliable_orths_9Sept09 7 CAEEL 70\n";
	exit;
	}

	# READ IN MY PERL MODULES:
	use Avril_modules;
	use Treefam::DBConnection;
	use DBI;

	# FIND THE NAME OF THE INPUT FILE, AND RELEASE OF TREEFAM TO USE:
	$input = $ARGV[0];
	$release = $ARGV[1];
	$the_species = $ARGV[2];
	$orthostrap_cutoff = $ARGV[3];

	#------------------------------------------------------------------#

	# GET A LIST OF ALL FULLY SEQUENCED TREEFAM SPECIES:
	$database = 'treefam_'.$release;
	%SEQUENCED = ();
	%TAXID = ();
	$num = 0;
	$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') \|\| return;
	$table_w = 'species';
	$st = "SELECT SWCODE, TAX_ID from $table_w WHERE FLAG=\"1\"";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$num++;
	$swcode = $array[0]; # eg. CAEEL
	$taxid = $array[1];
	$SEQUENCED{$swcode} = 1;
	$TAXID{$swcode} = $taxid;
	}
	}
	$rc = $dbh->disconnect();
	$rc = "";
	if (!($TAXID{$the_species})) { print STDERR "ERROR: do not know taxid for $the_species\n"; exit;}
	print STDERR "Read in fully sequenced species...\n";

	#------------------------------------------------------------------#

	# FIND THE ID OF EACH TREEFAM GENE FROM SPECIES $the_species:
	print STDERR "Now reading in all genes from $the_species...\n";

	if (!($TAXID{$the_species})) { print STDERR "ERROR: do not know taxid for $the_species\n"; exit;}
	$taxid1 = $TAXID{$the_species};
	print STDERR "taxid1 $taxid1 for $the_species\n";

	# HASH TABLES TO REMEMBER THE SPECIES OF GENES:
	$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') \|\| return;

	# SPECIFY THE TABLE:
	$table_w = 'genes';

	# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX),
	# ANOTHER COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN
	# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165

	$st = "SELECT GID, TAX_ID, ID from $table_w WHERE TAX_ID=$taxid1";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	%ID = ();

	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array) {
	$GID = $array[0]; # eg. T06E6.2a
	$TAXID = $array[1]; # eg 7165
	$ID = $array[2];
	if ($TAXID{$the_species} eq $TAXID)
	{
	$ID{$GID} = $ID;
	}
	}
	}
	$rc = $dbh->disconnect();
	$rc = "";
	print STDERR "Read in all genes from $the_species\n";

	#------------------------------------------------------------------#

	# READ IN ALL THE CHROMOSOMAL POSITIONS OF GENES FROM $the_species:
	$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') \|\| return;
	open(INPUT,"$input") \|\| die "ERROR: cannot open $input.\n";
	while(<INPUT>)
	{
	$line = $_;
	chomp $line;
	@temp = split(/\s+/,$line);
	$ce1 = $temp[1];
	$ce2 = $temp[6];
	$orthostrap1 = $temp[3];
	$orthostrap2 = $temp[8];
	@temp = split(/=/,$orthostrap1);
	$orthostrap1 = $temp[3];
	@temp = split(/=/,$orthostrap2);
	$orthostrap2 = $temp[3];
	if ($orthostrap1 >= $orthostrap_cutoff && $orthostrap2 >= $orthostrap_cutoff)
	{
	if (!($ID{$ce1})) { print STDERR "ERROR: do not know id for $ce1- $ID{$ce1}\n"; exit;}
	$ce1_id = $ID{$ce1};
	if (!($ID{$ce1})) { print STDERR "ERROR: do not know id for $ce2-\n"; exit;}
	$ce2_id = $ID{$ce2};
	$ce1_chrom = "unknown";
	$ce2_chrom = "unknown";

	# FIND THE CHROMOSOMAL POSITIONS OF $ce1 AND $ce2:
	$table_w = 'map';
	$st = "SELECT ID, TARGET from $table_w WHERE (ID=\'$ce1_id\' OR ID=\'$ce2_id\')";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$the_id = $array[0];
	$the_chrom = $array[1];
	if ($the_id eq $ce1_id) { $ce1_chrom = $the_chrom;}
	elsif ($the_id eq $ce2_id) { $ce2_chrom = $the_chrom;}
	}
	}
	if (($ce1_chrom eq 'chromosomeX' && $ce2_chrom ne 'chromosomeX') \|\|
	($ce2_chrom eq 'chromosomeX' && $ce1_chrom ne 'chromosomeX'))
	{
	print "$ce1 $ce1_chrom $ce2 $ce2_chrom - $line\n";
	}
	}
	}
	close(INPUT);

	$rc = $dbh->disconnect();
	$rc = "";

	#------------------------------------------------------------------#

	print STDERR "FINISHED.\n";

	#------------------------------------------------------------------#