Created
March 1, 2013 14:10
-
-
Save avrilcoghlan/5064891 to your computer and use it in GitHub Desktop.
Perl script that retrieves all TreeFam trees, and infers orthologs between a pair of species based on the tree, by finding cases where the last common ancestor node of two genes from different species is a speciation node
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script get_orthologs7.pl | |
# Written by Avril Coghlan (alc@sanger.ac.uk) | |
# 29-May-08. | |
# Edited 15-Jul-08. | |
# | |
# This perl script reads in all trees in TreeFam and finds all | |
# orthologs between two input species. This parses tree data rather | |
# than taking data from the mysql 'ortholog' table. This looks for | |
# cases where the last common ancestor node of a gene from species1 | |
# and a gene from species2 is a speciation node. There can be more than | |
# one ortholog inferred from each tree. | |
# | |
# The command-line format is: | |
# % perl <get_orthologs7.pl> <species1> <species2> <version> | |
# where <species1> and <species2> are the UniProt names of the species, | |
# <version> is the version of TreeFam to use. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 3) | |
{ | |
print "Usage of get_orthologs7.pl\n\n"; | |
print "perl get_orthologs7.pl <species1> <species2> <version>\n"; | |
print "where <species1> and <species2> are the UniProt names of the species,\n"; | |
print " <version> is the version of TreeFam to use\n"; | |
print "For example, >perl get_orthologs7.pl CAEEL CAEBR 6\n"; | |
exit; | |
} | |
# READ IN MY PERL MODULES: | |
BEGIN { | |
unshift (@INC, '/nfs/team71/phd/alc/perl/modules'); | |
} | |
#xxxuse lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/"; # xxx edit to point to right place | |
use Avril_modules; | |
use Treefam::DBConnection; | |
use DBI; | |
# FIND THE TWO SPECIES THAT WE ARE INTERESTED IN: | |
$species1 = $ARGV[0]; | |
$species2 = $ARGV[1]; | |
# FIND THE VERSION OF TREEFAM TO USE: | |
$version = $ARGV[2]; | |
$VERBOSE = 1; | |
#------------------------------------------------------------------# | |
# GET A LIST OF ALL FULLY SEQUENCED TREEFAM SPECIES: | |
$database = 'treefam_'.$version; | |
%SEQUENCED = (); | |
%TAXID = (); | |
$num = 0; | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return; | |
$table_w = 'species'; | |
$st = "SELECT TAXNAME, SWCODE, TAX_ID from $table_w WHERE FLAG=\"1\""; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$num++; | |
$TAXNAME = $array[0]; | |
$swcode = $array[1]; # eg. CAEEL | |
$taxid = $array[2]; | |
if ($VERBOSE == 1) { print "$num: species $TAXNAME ($swcode), taxid $taxid\n";} | |
$SEQUENCED{$swcode} = 1; | |
$TAXID{$swcode} = $taxid; | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
if (!($SEQUENCED{$species1})) { print STDERR "ERROR: $species1 is not fully sequenced\n"; exit;} | |
if (!($SEQUENCED{$species2})) { print STDERR "ERROR: $species2 is not fully sequenced\n"; exit;} | |
if (!($TAXID{$species1})) { print STDERR "ERROR: do not know taxid for $species1\n"; exit;} | |
if (!($TAXID{$species2})) { print STDERR "ERROR: do not know taxid for $species2\n"; exit;} | |
print STDERR "Read in fully sequenced species...\n"; | |
#------------------------------------------------------------------# | |
# GET A LIST OF ALL FAMILIES IN TREEFAM, INCLUDING TF1 (TREEFAM-A), TF3 | |
# (TREEFAM-B) AND TF5 (TREEFAM-C) FAMILIES: | |
print STDERR "Finding TreeFam-A families... \n"; | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return; | |
# GET ALL THE TREEFAM-A FAMILIES: | |
$table_w = "familyA"; # GET TREEFAM-A FAMILIES FIRST. | |
$st = "SELECT AC from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
@families = (@families,$AC); | |
if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n"; } | |
} | |
} | |
print STDERR "Read in TreeFam-A families...\n"; | |
print STDERR "Finding TreeFam-B and TreeFam-C families... \n"; | |
# GET ALL THE TREEFAM-B AND TREEFAM-C FAMILIES: | |
$table_w = "familyB"; | |
$st = "SELECT AC from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
# do 'select * from trees WHERE TREE LIKE '%null%' AND TYPE='CLEAN';' in mysql | |
if (!($AC eq 'TF343806' && $database eq 'treefam_5') && | |
!($AC eq 'TF316363' && $database eq 'treefam_6') && | |
!($AC eq 'TF317421' && $database eq 'treefam_6') && | |
!($AC eq 'TF339446' && $database eq 'treefam_6') && | |
!($AC eq 'TF343205' && $database eq 'treefam_6') && | |
!($AC eq 'TF352105' && $database eq 'treefam_6') && | |
!($AC eq 'TF352107' && $database eq 'treefam_6') && | |
!($AC eq 'TF352717' && $database eq 'treefam_6') && | |
!($AC eq 'TF317421' && $database eq 'treefam_6') && | |
!($AC eq 'TF343522' && $database eq 'treefam_6') && | |
!($AC eq 'TF351023' && $database eq 'treefam_6') && | |
!($AC eq 'TF344027' && $database eq 'treefam_6') && | |
!($AC eq 'TF342432' && $database eq 'treefam_6') && | |
!($AC eq 'TF343833' && $database eq 'treefam_6') && | |
!($AC eq 'TF340035' && $database eq 'treefam_6') && | |
!($AC eq 'TF353739' && $database eq 'treefam_6') && | |
!($AC eq 'TF352640' && $database eq 'treefam_6') && | |
!($AC eq 'TF339446' && $database eq 'treefam_6') && | |
!($AC eq 'TF352448' && $database eq 'treefam_6') && | |
!($AC eq 'TF339749' && $database eq 'treefam_6') && | |
!($AC eq 'TF351059' && $database eq 'treefam_6') && | |
!($AC eq 'TF352160' && $database eq 'treefam_6') && | |
!($AC eq 'TF352260' && $database eq 'treefam_6') && | |
!($AC eq 'TF343262' && $database eq 'treefam_6') && | |
!($AC eq 'TF316363' && $database eq 'treefam_6') && | |
!($AC eq 'TF351567' && $database eq 'treefam_6') && | |
!($AC eq 'TF343069' && $database eq 'treefam_6') && | |
!($AC eq 'TF342972' && $database eq 'treefam_6') && | |
!($AC eq 'TF352174' && $database eq 'treefam_6') && | |
!($AC eq 'TF342976' && $database eq 'treefam_6') && | |
!($AC eq 'TF351281' && $database eq 'treefam_6') && | |
!($AC eq 'TF343087' && $database eq 'treefam_6') && | |
!($AC eq 'TF352395' && $database eq 'treefam_6') && | |
!($AC eq 'TF353497' && $database eq 'treefam_6')) | |
# SOMETHING IS WRONG WITH THIS FAMILY IN TREEFAM-5. | |
{ | |
@families = (@families,$AC); | |
} | |
if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n"; } | |
} | |
} | |
print STDERR "Read in TreeFam-B families...\n"; | |
# READ IN TREEFAM-C FAMILIES, IF WE ARE LOOKING AT TREEFAM-5 OR -6: | |
if ($database ne 'treefam_4') | |
{ | |
$table_w = "familyC"; | |
$st = "SELECT AC from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
if (!($AC eq 'TF343806' && $database eq 'treefam_5') && | |
!($AC eq 'TF316363' && $database eq 'treefam_6') && | |
!($AC eq 'TF317421' && $database eq 'treefam_6') && | |
!($AC eq 'TF339446' && $database eq 'treefam_6') && | |
!($AC eq 'TF343205' && $database eq 'treefam_6') && | |
!($AC eq 'TF352105' && $database eq 'treefam_6') && | |
!($AC eq 'TF352107' && $database eq 'treefam_6') && | |
!($AC eq 'TF352717' && $database eq 'treefam_6') && | |
!($AC eq 'TF317421' && $database eq 'treefam_6') && | |
!($AC eq 'TF343522' && $database eq 'treefam_6') && | |
!($AC eq 'TF351023' && $database eq 'treefam_6') && | |
!($AC eq 'TF344027' && $database eq 'treefam_6') && | |
!($AC eq 'TF342432' && $database eq 'treefam_6') && | |
!($AC eq 'TF343833' && $database eq 'treefam_6') && | |
!($AC eq 'TF340035' && $database eq 'treefam_6') && | |
!($AC eq 'TF353739' && $database eq 'treefam_6') && | |
!($AC eq 'TF352640' && $database eq 'treefam_6') && | |
!($AC eq 'TF339446' && $database eq 'treefam_6') && | |
!($AC eq 'TF352448' && $database eq 'treefam_6') && | |
!($AC eq 'TF339749' && $database eq 'treefam_6') && | |
!($AC eq 'TF351059' && $database eq 'treefam_6') && | |
!($AC eq 'TF352160' && $database eq 'treefam_6') && | |
!($AC eq 'TF352260' && $database eq 'treefam_6') && | |
!($AC eq 'TF343262' && $database eq 'treefam_6') && | |
!($AC eq 'TF316363' && $database eq 'treefam_6') && | |
!($AC eq 'TF351567' && $database eq 'treefam_6') && | |
!($AC eq 'TF343069' && $database eq 'treefam_6') && | |
!($AC eq 'TF342972' && $database eq 'treefam_6') && | |
!($AC eq 'TF352174' && $database eq 'treefam_6') && | |
!($AC eq 'TF342976' && $database eq 'treefam_6') && | |
!($AC eq 'TF351281' && $database eq 'treefam_6') && | |
!($AC eq 'TF343087' && $database eq 'treefam_6') && | |
!($AC eq 'TF352395' && $database eq 'treefam_6') && | |
!($AC eq 'TF353497' && $database eq 'treefam_6')) # SOMETHING IS WRONG WITH THIS FAMILY IN TREEFAM-5. x | |
{ | |
@families = (@families,$AC); | |
} | |
if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n"; } | |
} | |
} | |
print STDERR "Read in TreeFam-C families...\n"; | |
} | |
#------------------------------------------------------------------# | |
# FIND THE SPECIES OF EACH TREEFAM GENE: | |
print STDERR "Now reading in all genes from $species1 and $species2...\n"; | |
if (!($TAXID{$species1})) { print STDERR "ERROR: do not know taxid for $species1\n"; exit;} | |
$taxid1 = $TAXID{$species1}; | |
if (!($TAXID{$species2})) { print STDERR "ERROR: do not know taxid for $species2\n"; exit;} | |
$taxid2 = $TAXID{$species2}; | |
# HASH TABLES TO REMEMBER THE SPECIES OF GENES: | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return; | |
# SPECIFY THE TABLE: | |
$table_w = 'genes'; | |
# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX), | |
# ANOTHER COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN | |
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165 | |
$st = "SELECT IDX, GID, TAX_ID from $table_w WHERE (TAX_ID=$taxid1 || TAX_ID=$taxid2)"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
%SPECIES1_ID = (); | |
%SPECIES2_ID = (); | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) { | |
$IDX = $array[0]; # eg., 1 | |
$GID = $array[1]; # eg. T06E6.2a | |
$TAXID = $array[2]; # eg 7165 | |
if ($TAXID{$species1} eq $TAXID || $TAXID{$species2} eq $TAXID) | |
{ | |
if ($TAXID{$species1} eq $TAXID) | |
{ | |
$SPECIES1_ID{$IDX} = $GID; | |
} | |
elsif ($TAXID{$species2} eq $TAXID) | |
{ | |
$SPECIES2_ID{$IDX} = $GID; | |
} | |
else { print STDERR "ERROR: taxid $TAXID $species1 $species2\n"; exit;} | |
} | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
print STDERR "Read in all genes from $species1 and $species2...\n"; | |
#------------------------------------------------------------------# | |
# READ IN ALL THE ORTHOLOGY BOOTSTRAP VALUES FROM THE 'orthologs' TABLE: | |
# NOW FIND ALL $species1-$species2 ORTHOLOGS: | |
print STDERR "Reading orthology bootstrap values for $species1 and $species2...\n"; | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return; | |
$table_w = 'ortholog'; | |
# THE COLUMNS IN THE TABLE ARE IDX1, IDX2, TAXON_ID, BOOTSTRAP: | |
%ORTHOSTRAP = (); | |
$st = "SELECT idx1, idx2, taxon_id, bootstrap from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) { | |
$IDX1 = $array[0]; | |
$IDX2 = $array[1]; | |
$TAXON_ID = $array[2]; | |
$BOOTSTRAP = $array[3]; | |
if (($SPECIES1_ID{$IDX1} && $SPECIES2_ID{$IDX2}) || | |
($SPECIES2_ID{$IDX1} && $SPECIES1_ID{$IDX2})) | |
{ | |
if ($SPECIES1_ID{$IDX1} && $SPECIES2_ID{$IDX2}) | |
{ | |
$ID1 = $SPECIES1_ID{$IDX1}; | |
$ID2 = $SPECIES2_ID{$IDX2}; | |
} | |
elsif ($SPECIES1_ID{$IDX2} && $SPECIES2_ID{$IDX1}) | |
{ | |
$ID1 = $SPECIES1_ID{$IDX2}; | |
$ID2 = $SPECIES2_ID{$IDX1}; | |
} | |
else { print STDERR "ERROR: idx1 $IDX1 idx2 $IDX2 taxon_id $TAXON_ID\n"; exit;} | |
$pair1 = $ID1."_".$ID2; | |
$pair2 = $ID2."_".$ID1; | |
if ($ORTHOSTRAP{$pair1}) | |
{ | |
# THIS SHOULDN'T REALLY HAPPEN BUT IN TREEFAM-4, HAD SEVERAL TRANSCRIPTS OF ONE GENE IN A TREE: | |
if ($ORTHOSTRAP{$pair1} ne $BOOTSTRAP) | |
{ | |
if ($BOOTSTRAP < $ORTHOSTRAP{$pair1}) | |
{ | |
$BOOTSTRAP = $ORTHOSTRAP{$pair1}; | |
} | |
} | |
} | |
if ($ORTHOSTRAP{$pair2}) | |
{ | |
# THIS SHOULDN'T REALLY HAPPEN BUT IN TREEFAM-4, HAD SEVERAL TRANSCRIPTS OF ONE GENE IN A TREE: | |
if ($ORTHOSTRAP{$pair2} ne $BOOTSTRAP) | |
{ | |
if ($BOOTSTRAP < $ORTHOSTRAP{$pair2}) | |
{ | |
$BOOTSTRAP = $ORTHOSTRAP{$pair2}; | |
} | |
} | |
} | |
$ORTHOSTRAP{$pair1} = $BOOTSTRAP; | |
$ORTHOSTRAP{$pair2} = $BOOTSTRAP; | |
} | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
print STDERR "Read in orthology bootstrap values for $species1 and $species2 genes\n"; | |
#------------------------------------------------------------------# | |
print STDERR "Reading in trees now...\n"; | |
$dbc = Treefam::DBConnection->new(); | |
# FIND ORTHOLOGS WITHIN EACH FAMILY: | |
foreach my $treefam_family(@families) | |
{ | |
print STDERR "Looking at family $treefam_family... \n"; | |
$famh = $dbc->get_FamilyHandle(); | |
if (!(defined($famh->get_by_id($treefam_family)))) | |
{ | |
print "WARNING: there is no family $treefam_family...\n"; | |
goto NEXT_FAMILY; | |
} | |
$family = $famh->get_by_id($treefam_family); | |
if (!(defined($family->ID()))) | |
{ | |
print "WARNING: do not have ID stored for family $treefam_family...\n"; | |
goto NEXT_FAMILY; | |
} | |
$AC = $family->ID(); # GET THE FAMILY ID. | |
if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;} | |
$no_families++; | |
print "\n$no_families: reading family $AC...\n"; | |
# GET THE TREE FOR THE FAMILY: | |
if (!(defined($family->get_tree('clean')))) | |
#xxxif (!(defined($family->get_tree('full')))) | |
{ | |
print "WARNING: AC $AC: there is no tree!\n"; | |
goto NEXT_FAMILY; | |
} | |
$tree = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE. | |
#xxx$tree = $family->get_tree('full'); | |
# GET ALL THE TRANSCRIPTS IN THIS FAMILY FROM SPECIES $the_species: | |
@species1_nodes = &Avril_modules::empty_array(@species1_nodes); | |
@species2_nodes = &Avril_modules::empty_array(@species2_nodes); | |
@species1_genes = &Avril_modules::empty_array(@species1_genes); | |
@species2_genes = &Avril_modules::empty_array(@species2_genes); | |
if (!(defined($tree->get_leaves))) | |
{ | |
print "WARNING: AC $AC: no leaves in tree!\n"; | |
goto NEXT_FAMILY; | |
} | |
@nodes = $tree->get_leaves; | |
foreach $node(@nodes) | |
{ | |
# FIND THE SPECIES FOR THIS TRANSCRIPT: | |
if (defined($node->get_tag_value('S'))) | |
{ | |
$species = $node->get_tag_value('S'); | |
} | |
else | |
{ | |
print "WARNING: AC $AC: do not know species of node\n"; | |
$species = "none"; | |
} | |
$species =~ tr/[a-z]/[A-Z]/; | |
if ($species eq $species1) | |
{ | |
@species1_nodes = (@species1_nodes,$node); | |
# FIND THE GENE FOR THIS TRANSCRIPT: | |
$gh = $node->gene; | |
$gene = $gh->ID(); # eg. ENSMUSG00000022770 | |
if (substr($gene,0,3 eq 'WBG')) { $gene = $node->sequence_id();} | |
@species1_genes = (@species1_genes,$gene); | |
} | |
if ($species eq $species2) | |
{ | |
@species2_nodes = (@species2_nodes,$node); | |
# FIND THE GENE FOR THIS TRANSCRIPT: | |
$gh = $node->gene; | |
$gene = $gh->ID(); # eg. ENSMUSG00000022770 | |
if (substr($gene,0,3 eq 'WBG')) { $gene = $node->sequence_id();} | |
@species2_genes = (@species2_genes,$gene); | |
} | |
} | |
# FOR EACH PAIR OF C. ELEGANS AND C. BRIGGSAE GENES IN THE TREE, SEE | |
# IF THEY ORIGINATED BY A SPECIATION EVENT (ARE ORTHOLOGS): | |
for ($i = 0; $i <= $#species1_nodes; $i++) | |
{ | |
$species1_node = $species1_nodes[$i]; | |
$species1_gene = $species1_genes[$i]; | |
for ($j = 0; $j <= $#species2_nodes; $j++) | |
{ | |
$species2_node = $species2_nodes[$j]; | |
$species2_gene = $species2_genes[$j]; | |
# FIND THE LAST COMMON ANCESTOR NODE OF $species1_node AND $species2_node: | |
$lca = $tree->get_last_common_ancestor($species1_node,$species2_node); | |
# FIND THE TAXONOMIC LEVEL OF THE LAST COMMON ANCESTOR NODE: | |
if (defined($lca->get_tag_value('S'))) | |
{ | |
$species = $lca->get_tag_value('S'); | |
} | |
else | |
{ | |
$species = 'none'; | |
print "WARNING: $AC: do not know species of lca\n"; | |
} | |
$species =~ tr/[a-z]/[A-Z]/; | |
# GET THE BOOTSTRAP OF THE LAST COMMON ANCESTOR NODE: | |
if (defined($lca->get_tag_value('B'))) { $bootstrap = $lca->get_tag_value('B');} | |
else { $bootstrap = 0; } | |
# CHECK IF THE $lca NODE IS A SPECIATION NODE: | |
if (defined($lca->get_tag_value('D'))) | |
{ | |
$lca_type = $lca->get_tag_value('D'); | |
} | |
else | |
{ | |
print "WARNING: AC $AC: do not know lca type!\n"; | |
$lca_type = "none"; | |
} | |
if ($lca_type eq 'N') # N IS A SPECIATION NODE. | |
{ | |
# FIND THE ORTHOLOGY BOOTSTRAP (IF DEFINED) FOR $species1_gene AND $species2_gene: | |
$pair = $species1_gene."_".$species2_gene; | |
if ($ORTHOSTRAP{$pair}) | |
{ | |
$orthostrap = $ORTHOSTRAP{$pair}; | |
} | |
else | |
{ | |
$orthostrap = "NA"; | |
} | |
print "family $AC species1_gene $species1_gene species2_gene $species2_gene bootstrap=$bootstrap orthostrap=$orthostrap\n"; | |
} | |
} | |
} | |
NEXT_FAMILY: | |
} | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment