Created
March 1, 2013 15:23
-
-
Save avrilcoghlan/5065347 to your computer and use it in GitHub Desktop.
Perl script that, given a nhx-format tree file for a tree for a TreeFam family, finds Schistosoma mansoni/S. japonicum/Nematostella vectensis paralog pairs, and gives the ancestral taxon in which the duplication giving rise to the paralogs occcurred).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script find_schisto_paralogs.pl | |
# Written by Avril Coghlan (alc@sanger.ac.uk) | |
# 22-Dec-08. | |
# | |
# This perl script reads in a tree and finds S. mansoni/ | |
# S. japonicum/Nematostella vectensis paralog pairs, and gives the | |
# ancestral taxon in which the duplication occurred. | |
# | |
# The command-line format is: | |
# % perl <find_schisto_paralogs.pl> <species> <tree> | |
# <species> is the species we want to find paralogs for, | |
# <tree> is the input nhx format tree file. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 2) | |
{ | |
print "Usage of find_schisto_paralogs.pl\n\n"; | |
print "perl find_schisto_paralogs.pl <species> <tree>\n"; | |
print "where <species> is the species we want to find paralogs for,\n"; | |
print " <tree> is the input nhx format tree file.\n"; | |
print "For example, >perl find_schisto_paralogs.pl SCHMA TF101058.nhx2\n"; | |
exit; | |
} | |
use Treefam::DBConnection; #xxx make sure it points to the right treefam release | |
use DBI; | |
# FIND THE SPECIES WE WANT TO FIND PARALOGS FOR: | |
$the_species = $ARGV[0]; | |
# FIND THE NAME OF THE TREE FILE: | |
$tree_file = $ARGV[1]; | |
#------------------------------------------------------------------# | |
# READ IN THE TREE FILE: | |
$nhx_tree = &read_tree_file($tree_file); | |
@temp = split(/\.nhx2/,$tree_file); | |
$AC = $temp[0]; | |
@temp = split(/\//,$AC); | |
$AC = $temp[$#temp]; | |
#------------------------------------------------------------------# | |
# READ IN THE TREE INTO A TREEFAM tree OBJECT: | |
$dbc = Treefam::DBConnection->new(); | |
$tree = Treefam::Tree->new($dbc,undef,undef,$nhx_tree); | |
# FIND PARALOGS WITHIN THE FAMILY: | |
@leaves = $tree->get_leaves; | |
foreach $leaf(@leaves) | |
{ | |
# FIND THE SPECIES FOR THIS TRANSCRIPT: | |
if (defined($leaf->get_tag_value('S'))) | |
{ | |
$species = $leaf->get_tag_value('S'); | |
} | |
else | |
{ | |
$species = "none"; | |
} | |
$species =~ tr/[a-z]/[A-Z]/; | |
if ($species eq $the_species) # IF IT IS A GENE FROM SPECIES $the_species. | |
{ | |
if (defined($leaf->name())) | |
{ | |
$gene = $leaf->name(); | |
} | |
else | |
{ | |
$gene = "none"; | |
} | |
@species_genes = (@species_genes,$gene); | |
@species_nodes = (@species_nodes,$leaf); | |
} | |
} | |
# FOR EACH PAIR OF $the_species GENES IN THE TREE, SEE WHEN THE DUPLICATION OCCURRED: | |
for ($i = 0; $i <= $#species_genes; $i++) | |
{ | |
$species_gene_i = $species_genes[$i]; | |
$species_node_i = $species_nodes[$i]; | |
for ($j = $i+1; $j <= $#species_genes; $j++) | |
{ | |
$species_gene_j = $species_genes[$j]; | |
$species_node_j = $species_nodes[$j]; | |
# FIND THE LAST COMMON ANCESTOR NODE OF $species_node_j AND $species_node_i: | |
$lca = $tree->get_last_common_ancestor($species_node_j,$species_node_i); | |
if (defined($lca->get_tag_value('S'))) | |
{ | |
$lca_species = $lca->get_tag_value('S'); | |
} | |
else | |
{ | |
$lca_species = "none"; | |
} | |
#---------------------------------------------------------------------------# | |
# PRINT OUT INFORMATION FOR THIS PAIR OF GENES: | |
print "$AC $the_species $species_gene_i $species_gene_j $lca_species\n"; | |
} | |
} | |
print STDERR "FINISHED\n"; | |
#------------------------------------------------------------------# | |
# READ IN THE TREE FILE: | |
sub read_tree_file | |
{ | |
my $tree_file = $_[0]; | |
my $nhx_tree = ""; | |
my $line; | |
open(TREE,"$tree_file") || die "ERROR: cannot open $tree_file\n"; | |
while(<TREE>) | |
{ | |
$line = $_; | |
chomp $line; | |
$nhx_tree = $nhx_tree.$line; | |
} | |
close(TREE); | |
return($nhx_tree); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment