Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 14:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5064891 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5064891 to your computer and use it in GitHub Desktop.
Perl script that retrieves all TreeFam trees, and infers orthologs between a pair of species based on the tree, by finding cases where the last common ancestor node of two genes from different species is a speciation node
#!/usr/local/bin/perl
#
# Perl script get_orthologs7.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 29-May-08.
# Edited 15-Jul-08.
#
# This perl script reads in all trees in TreeFam and finds all
# orthologs between two input species. This parses tree data rather
# than taking data from the mysql 'ortholog' table. This looks for
# cases where the last common ancestor node of a gene from species1
# and a gene from species2 is a speciation node. There can be more than
# one ortholog inferred from each tree.
#
# The command-line format is:
# % perl <get_orthologs7.pl> <species1> <species2> <version>
# where <species1> and <species2> are the UniProt names of the species,
# <version> is the version of TreeFam to use.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 3)
{
print "Usage of get_orthologs7.pl\n\n";
print "perl get_orthologs7.pl <species1> <species2> <version>\n";
print "where <species1> and <species2> are the UniProt names of the species,\n";
print " <version> is the version of TreeFam to use\n";
print "For example, >perl get_orthologs7.pl CAEEL CAEBR 6\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}
#xxxuse lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/"; # xxx edit to point to right place
use Avril_modules;
use Treefam::DBConnection;
use DBI;
# FIND THE TWO SPECIES THAT WE ARE INTERESTED IN:
$species1 = $ARGV[0];
$species2 = $ARGV[1];
# FIND THE VERSION OF TREEFAM TO USE:
$version = $ARGV[2];
$VERBOSE = 1;
#------------------------------------------------------------------#
# GET A LIST OF ALL FULLY SEQUENCED TREEFAM SPECIES:
$database = 'treefam_'.$version;
%SEQUENCED = ();
%TAXID = ();
$num = 0;
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
$table_w = 'species';
$st = "SELECT TAXNAME, SWCODE, TAX_ID from $table_w WHERE FLAG=\"1\"";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$num++;
$TAXNAME = $array[0];
$swcode = $array[1]; # eg. CAEEL
$taxid = $array[2];
if ($VERBOSE == 1) { print "$num: species $TAXNAME ($swcode), taxid $taxid\n";}
$SEQUENCED{$swcode} = 1;
$TAXID{$swcode} = $taxid;
}
}
$rc = $dbh->disconnect();
$rc = "";
if (!($SEQUENCED{$species1})) { print STDERR "ERROR: $species1 is not fully sequenced\n"; exit;}
if (!($SEQUENCED{$species2})) { print STDERR "ERROR: $species2 is not fully sequenced\n"; exit;}
if (!($TAXID{$species1})) { print STDERR "ERROR: do not know taxid for $species1\n"; exit;}
if (!($TAXID{$species2})) { print STDERR "ERROR: do not know taxid for $species2\n"; exit;}
print STDERR "Read in fully sequenced species...\n";
#------------------------------------------------------------------#
# GET A LIST OF ALL FAMILIES IN TREEFAM, INCLUDING TF1 (TREEFAM-A), TF3
# (TREEFAM-B) AND TF5 (TREEFAM-C) FAMILIES:
print STDERR "Finding TreeFam-A families... \n";
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
# GET ALL THE TREEFAM-A FAMILIES:
$table_w = "familyA"; # GET TREEFAM-A FAMILIES FIRST.
$st = "SELECT AC from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
@families = (@families,$AC);
if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n"; }
}
}
print STDERR "Read in TreeFam-A families...\n";
print STDERR "Finding TreeFam-B and TreeFam-C families... \n";
# GET ALL THE TREEFAM-B AND TREEFAM-C FAMILIES:
$table_w = "familyB";
$st = "SELECT AC from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
# do 'select * from trees WHERE TREE LIKE '%null%' AND TYPE='CLEAN';' in mysql
if (!($AC eq 'TF343806' && $database eq 'treefam_5') &&
!($AC eq 'TF316363' && $database eq 'treefam_6') &&
!($AC eq 'TF317421' && $database eq 'treefam_6') &&
!($AC eq 'TF339446' && $database eq 'treefam_6') &&
!($AC eq 'TF343205' && $database eq 'treefam_6') &&
!($AC eq 'TF352105' && $database eq 'treefam_6') &&
!($AC eq 'TF352107' && $database eq 'treefam_6') &&
!($AC eq 'TF352717' && $database eq 'treefam_6') &&
!($AC eq 'TF317421' && $database eq 'treefam_6') &&
!($AC eq 'TF343522' && $database eq 'treefam_6') &&
!($AC eq 'TF351023' && $database eq 'treefam_6') &&
!($AC eq 'TF344027' && $database eq 'treefam_6') &&
!($AC eq 'TF342432' && $database eq 'treefam_6') &&
!($AC eq 'TF343833' && $database eq 'treefam_6') &&
!($AC eq 'TF340035' && $database eq 'treefam_6') &&
!($AC eq 'TF353739' && $database eq 'treefam_6') &&
!($AC eq 'TF352640' && $database eq 'treefam_6') &&
!($AC eq 'TF339446' && $database eq 'treefam_6') &&
!($AC eq 'TF352448' && $database eq 'treefam_6') &&
!($AC eq 'TF339749' && $database eq 'treefam_6') &&
!($AC eq 'TF351059' && $database eq 'treefam_6') &&
!($AC eq 'TF352160' && $database eq 'treefam_6') &&
!($AC eq 'TF352260' && $database eq 'treefam_6') &&
!($AC eq 'TF343262' && $database eq 'treefam_6') &&
!($AC eq 'TF316363' && $database eq 'treefam_6') &&
!($AC eq 'TF351567' && $database eq 'treefam_6') &&
!($AC eq 'TF343069' && $database eq 'treefam_6') &&
!($AC eq 'TF342972' && $database eq 'treefam_6') &&
!($AC eq 'TF352174' && $database eq 'treefam_6') &&
!($AC eq 'TF342976' && $database eq 'treefam_6') &&
!($AC eq 'TF351281' && $database eq 'treefam_6') &&
!($AC eq 'TF343087' && $database eq 'treefam_6') &&
!($AC eq 'TF352395' && $database eq 'treefam_6') &&
!($AC eq 'TF353497' && $database eq 'treefam_6'))
# SOMETHING IS WRONG WITH THIS FAMILY IN TREEFAM-5.
{
@families = (@families,$AC);
}
if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n"; }
}
}
print STDERR "Read in TreeFam-B families...\n";
# READ IN TREEFAM-C FAMILIES, IF WE ARE LOOKING AT TREEFAM-5 OR -6:
if ($database ne 'treefam_4')
{
$table_w = "familyC";
$st = "SELECT AC from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
if (!($AC eq 'TF343806' && $database eq 'treefam_5') &&
!($AC eq 'TF316363' && $database eq 'treefam_6') &&
!($AC eq 'TF317421' && $database eq 'treefam_6') &&
!($AC eq 'TF339446' && $database eq 'treefam_6') &&
!($AC eq 'TF343205' && $database eq 'treefam_6') &&
!($AC eq 'TF352105' && $database eq 'treefam_6') &&
!($AC eq 'TF352107' && $database eq 'treefam_6') &&
!($AC eq 'TF352717' && $database eq 'treefam_6') &&
!($AC eq 'TF317421' && $database eq 'treefam_6') &&
!($AC eq 'TF343522' && $database eq 'treefam_6') &&
!($AC eq 'TF351023' && $database eq 'treefam_6') &&
!($AC eq 'TF344027' && $database eq 'treefam_6') &&
!($AC eq 'TF342432' && $database eq 'treefam_6') &&
!($AC eq 'TF343833' && $database eq 'treefam_6') &&
!($AC eq 'TF340035' && $database eq 'treefam_6') &&
!($AC eq 'TF353739' && $database eq 'treefam_6') &&
!($AC eq 'TF352640' && $database eq 'treefam_6') &&
!($AC eq 'TF339446' && $database eq 'treefam_6') &&
!($AC eq 'TF352448' && $database eq 'treefam_6') &&
!($AC eq 'TF339749' && $database eq 'treefam_6') &&
!($AC eq 'TF351059' && $database eq 'treefam_6') &&
!($AC eq 'TF352160' && $database eq 'treefam_6') &&
!($AC eq 'TF352260' && $database eq 'treefam_6') &&
!($AC eq 'TF343262' && $database eq 'treefam_6') &&
!($AC eq 'TF316363' && $database eq 'treefam_6') &&
!($AC eq 'TF351567' && $database eq 'treefam_6') &&
!($AC eq 'TF343069' && $database eq 'treefam_6') &&
!($AC eq 'TF342972' && $database eq 'treefam_6') &&
!($AC eq 'TF352174' && $database eq 'treefam_6') &&
!($AC eq 'TF342976' && $database eq 'treefam_6') &&
!($AC eq 'TF351281' && $database eq 'treefam_6') &&
!($AC eq 'TF343087' && $database eq 'treefam_6') &&
!($AC eq 'TF352395' && $database eq 'treefam_6') &&
!($AC eq 'TF353497' && $database eq 'treefam_6')) # SOMETHING IS WRONG WITH THIS FAMILY IN TREEFAM-5. x
{
@families = (@families,$AC);
}
if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n"; }
}
}
print STDERR "Read in TreeFam-C families...\n";
}
#------------------------------------------------------------------#
# FIND THE SPECIES OF EACH TREEFAM GENE:
print STDERR "Now reading in all genes from $species1 and $species2...\n";
if (!($TAXID{$species1})) { print STDERR "ERROR: do not know taxid for $species1\n"; exit;}
$taxid1 = $TAXID{$species1};
if (!($TAXID{$species2})) { print STDERR "ERROR: do not know taxid for $species2\n"; exit;}
$taxid2 = $TAXID{$species2};
# HASH TABLES TO REMEMBER THE SPECIES OF GENES:
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE:
$table_w = 'genes';
# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX),
# ANOTHER COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165
$st = "SELECT IDX, GID, TAX_ID from $table_w WHERE (TAX_ID=$taxid1 || TAX_ID=$taxid2)";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
%SPECIES1_ID = ();
%SPECIES2_ID = ();
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array) {
$IDX = $array[0]; # eg., 1
$GID = $array[1]; # eg. T06E6.2a
$TAXID = $array[2]; # eg 7165
if ($TAXID{$species1} eq $TAXID || $TAXID{$species2} eq $TAXID)
{
if ($TAXID{$species1} eq $TAXID)
{
$SPECIES1_ID{$IDX} = $GID;
}
elsif ($TAXID{$species2} eq $TAXID)
{
$SPECIES2_ID{$IDX} = $GID;
}
else { print STDERR "ERROR: taxid $TAXID $species1 $species2\n"; exit;}
}
}
}
$rc = $dbh->disconnect();
$rc = "";
print STDERR "Read in all genes from $species1 and $species2...\n";
#------------------------------------------------------------------#
# READ IN ALL THE ORTHOLOGY BOOTSTRAP VALUES FROM THE 'orthologs' TABLE:
# NOW FIND ALL $species1-$species2 ORTHOLOGS:
print STDERR "Reading orthology bootstrap values for $species1 and $species2...\n";
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
$table_w = 'ortholog';
# THE COLUMNS IN THE TABLE ARE IDX1, IDX2, TAXON_ID, BOOTSTRAP:
%ORTHOSTRAP = ();
$st = "SELECT idx1, idx2, taxon_id, bootstrap from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array) {
$IDX1 = $array[0];
$IDX2 = $array[1];
$TAXON_ID = $array[2];
$BOOTSTRAP = $array[3];
if (($SPECIES1_ID{$IDX1} && $SPECIES2_ID{$IDX2}) ||
($SPECIES2_ID{$IDX1} && $SPECIES1_ID{$IDX2}))
{
if ($SPECIES1_ID{$IDX1} && $SPECIES2_ID{$IDX2})
{
$ID1 = $SPECIES1_ID{$IDX1};
$ID2 = $SPECIES2_ID{$IDX2};
}
elsif ($SPECIES1_ID{$IDX2} && $SPECIES2_ID{$IDX1})
{
$ID1 = $SPECIES1_ID{$IDX2};
$ID2 = $SPECIES2_ID{$IDX1};
}
else { print STDERR "ERROR: idx1 $IDX1 idx2 $IDX2 taxon_id $TAXON_ID\n"; exit;}
$pair1 = $ID1."_".$ID2;
$pair2 = $ID2."_".$ID1;
if ($ORTHOSTRAP{$pair1})
{
# THIS SHOULDN'T REALLY HAPPEN BUT IN TREEFAM-4, HAD SEVERAL TRANSCRIPTS OF ONE GENE IN A TREE:
if ($ORTHOSTRAP{$pair1} ne $BOOTSTRAP)
{
if ($BOOTSTRAP < $ORTHOSTRAP{$pair1})
{
$BOOTSTRAP = $ORTHOSTRAP{$pair1};
}
}
}
if ($ORTHOSTRAP{$pair2})
{
# THIS SHOULDN'T REALLY HAPPEN BUT IN TREEFAM-4, HAD SEVERAL TRANSCRIPTS OF ONE GENE IN A TREE:
if ($ORTHOSTRAP{$pair2} ne $BOOTSTRAP)
{
if ($BOOTSTRAP < $ORTHOSTRAP{$pair2})
{
$BOOTSTRAP = $ORTHOSTRAP{$pair2};
}
}
}
$ORTHOSTRAP{$pair1} = $BOOTSTRAP;
$ORTHOSTRAP{$pair2} = $BOOTSTRAP;
}
}
}
$rc = $dbh->disconnect();
$rc = "";
print STDERR "Read in orthology bootstrap values for $species1 and $species2 genes\n";
#------------------------------------------------------------------#
print STDERR "Reading in trees now...\n";
$dbc = Treefam::DBConnection->new();
# FIND ORTHOLOGS WITHIN EACH FAMILY:
foreach my $treefam_family(@families)
{
print STDERR "Looking at family $treefam_family... \n";
$famh = $dbc->get_FamilyHandle();
if (!(defined($famh->get_by_id($treefam_family))))
{
print "WARNING: there is no family $treefam_family...\n";
goto NEXT_FAMILY;
}
$family = $famh->get_by_id($treefam_family);
if (!(defined($family->ID())))
{
print "WARNING: do not have ID stored for family $treefam_family...\n";
goto NEXT_FAMILY;
}
$AC = $family->ID(); # GET THE FAMILY ID.
if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;}
$no_families++;
print "\n$no_families: reading family $AC...\n";
# GET THE TREE FOR THE FAMILY:
if (!(defined($family->get_tree('clean'))))
#xxxif (!(defined($family->get_tree('full'))))
{
print "WARNING: AC $AC: there is no tree!\n";
goto NEXT_FAMILY;
}
$tree = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE.
#xxx$tree = $family->get_tree('full');
# GET ALL THE TRANSCRIPTS IN THIS FAMILY FROM SPECIES $the_species:
@species1_nodes = &Avril_modules::empty_array(@species1_nodes);
@species2_nodes = &Avril_modules::empty_array(@species2_nodes);
@species1_genes = &Avril_modules::empty_array(@species1_genes);
@species2_genes = &Avril_modules::empty_array(@species2_genes);
if (!(defined($tree->get_leaves)))
{
print "WARNING: AC $AC: no leaves in tree!\n";
goto NEXT_FAMILY;
}
@nodes = $tree->get_leaves;
foreach $node(@nodes)
{
# FIND THE SPECIES FOR THIS TRANSCRIPT:
if (defined($node->get_tag_value('S')))
{
$species = $node->get_tag_value('S');
}
else
{
print "WARNING: AC $AC: do not know species of node\n";
$species = "none";
}
$species =~ tr/[a-z]/[A-Z]/;
if ($species eq $species1)
{
@species1_nodes = (@species1_nodes,$node);
# FIND THE GENE FOR THIS TRANSCRIPT:
$gh = $node->gene;
$gene = $gh->ID(); # eg. ENSMUSG00000022770
if (substr($gene,0,3 eq 'WBG')) { $gene = $node->sequence_id();}
@species1_genes = (@species1_genes,$gene);
}
if ($species eq $species2)
{
@species2_nodes = (@species2_nodes,$node);
# FIND THE GENE FOR THIS TRANSCRIPT:
$gh = $node->gene;
$gene = $gh->ID(); # eg. ENSMUSG00000022770
if (substr($gene,0,3 eq 'WBG')) { $gene = $node->sequence_id();}
@species2_genes = (@species2_genes,$gene);
}
}
# FOR EACH PAIR OF C. ELEGANS AND C. BRIGGSAE GENES IN THE TREE, SEE
# IF THEY ORIGINATED BY A SPECIATION EVENT (ARE ORTHOLOGS):
for ($i = 0; $i <= $#species1_nodes; $i++)
{
$species1_node = $species1_nodes[$i];
$species1_gene = $species1_genes[$i];
for ($j = 0; $j <= $#species2_nodes; $j++)
{
$species2_node = $species2_nodes[$j];
$species2_gene = $species2_genes[$j];
# FIND THE LAST COMMON ANCESTOR NODE OF $species1_node AND $species2_node:
$lca = $tree->get_last_common_ancestor($species1_node,$species2_node);
# FIND THE TAXONOMIC LEVEL OF THE LAST COMMON ANCESTOR NODE:
if (defined($lca->get_tag_value('S')))
{
$species = $lca->get_tag_value('S');
}
else
{
$species = 'none';
print "WARNING: $AC: do not know species of lca\n";
}
$species =~ tr/[a-z]/[A-Z]/;
# GET THE BOOTSTRAP OF THE LAST COMMON ANCESTOR NODE:
if (defined($lca->get_tag_value('B'))) { $bootstrap = $lca->get_tag_value('B');}
else { $bootstrap = 0; }
# CHECK IF THE $lca NODE IS A SPECIATION NODE:
if (defined($lca->get_tag_value('D')))
{
$lca_type = $lca->get_tag_value('D');
}
else
{
print "WARNING: AC $AC: do not know lca type!\n";
$lca_type = "none";
}
if ($lca_type eq 'N') # N IS A SPECIATION NODE.
{
# FIND THE ORTHOLOGY BOOTSTRAP (IF DEFINED) FOR $species1_gene AND $species2_gene:
$pair = $species1_gene."_".$species2_gene;
if ($ORTHOSTRAP{$pair})
{
$orthostrap = $ORTHOSTRAP{$pair};
}
else
{
$orthostrap = "NA";
}
print "family $AC species1_gene $species1_gene species2_gene $species2_gene bootstrap=$bootstrap orthostrap=$orthostrap\n";
}
}
}
NEXT_FAMILY:
}
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment