Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 15:23
Show Gist options
  • Save avrilcoghlan/5065347 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065347 to your computer and use it in GitHub Desktop.
Perl script that, given a nhx-format tree file for a tree for a TreeFam family, finds Schistosoma mansoni/S. japonicum/Nematostella vectensis paralog pairs, and gives the ancestral taxon in which the duplication giving rise to the paralogs occcurred).
#!/usr/local/bin/perl
#
# Perl script find_schisto_paralogs.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 22-Dec-08.
#
# This perl script reads in a tree and finds S. mansoni/
# S. japonicum/Nematostella vectensis paralog pairs, and gives the
# ancestral taxon in which the duplication occurred.
#
# The command-line format is:
# % perl <find_schisto_paralogs.pl> <species> <tree>
# <species> is the species we want to find paralogs for,
# <tree> is the input nhx format tree file.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 2)
{
print "Usage of find_schisto_paralogs.pl\n\n";
print "perl find_schisto_paralogs.pl <species> <tree>\n";
print "where <species> is the species we want to find paralogs for,\n";
print " <tree> is the input nhx format tree file.\n";
print "For example, >perl find_schisto_paralogs.pl SCHMA TF101058.nhx2\n";
exit;
}
use Treefam::DBConnection; #xxx make sure it points to the right treefam release
use DBI;
# FIND THE SPECIES WE WANT TO FIND PARALOGS FOR:
$the_species = $ARGV[0];
# FIND THE NAME OF THE TREE FILE:
$tree_file = $ARGV[1];
#------------------------------------------------------------------#
# READ IN THE TREE FILE:
$nhx_tree = &read_tree_file($tree_file);
@temp = split(/\.nhx2/,$tree_file);
$AC = $temp[0];
@temp = split(/\//,$AC);
$AC = $temp[$#temp];
#------------------------------------------------------------------#
# READ IN THE TREE INTO A TREEFAM tree OBJECT:
$dbc = Treefam::DBConnection->new();
$tree = Treefam::Tree->new($dbc,undef,undef,$nhx_tree);
# FIND PARALOGS WITHIN THE FAMILY:
@leaves = $tree->get_leaves;
foreach $leaf(@leaves)
{
# FIND THE SPECIES FOR THIS TRANSCRIPT:
if (defined($leaf->get_tag_value('S')))
{
$species = $leaf->get_tag_value('S');
}
else
{
$species = "none";
}
$species =~ tr/[a-z]/[A-Z]/;
if ($species eq $the_species) # IF IT IS A GENE FROM SPECIES $the_species.
{
if (defined($leaf->name()))
{
$gene = $leaf->name();
}
else
{
$gene = "none";
}
@species_genes = (@species_genes,$gene);
@species_nodes = (@species_nodes,$leaf);
}
}
# FOR EACH PAIR OF $the_species GENES IN THE TREE, SEE WHEN THE DUPLICATION OCCURRED:
for ($i = 0; $i <= $#species_genes; $i++)
{
$species_gene_i = $species_genes[$i];
$species_node_i = $species_nodes[$i];
for ($j = $i+1; $j <= $#species_genes; $j++)
{
$species_gene_j = $species_genes[$j];
$species_node_j = $species_nodes[$j];
# FIND THE LAST COMMON ANCESTOR NODE OF $species_node_j AND $species_node_i:
$lca = $tree->get_last_common_ancestor($species_node_j,$species_node_i);
if (defined($lca->get_tag_value('S')))
{
$lca_species = $lca->get_tag_value('S');
}
else
{
$lca_species = "none";
}
#---------------------------------------------------------------------------#
# PRINT OUT INFORMATION FOR THIS PAIR OF GENES:
print "$AC $the_species $species_gene_i $species_gene_j $lca_species\n";
}
}
print STDERR "FINISHED\n";
#------------------------------------------------------------------#
# READ IN THE TREE FILE:
sub read_tree_file
{
my $tree_file = $_[0];
my $nhx_tree = "";
my $line;
open(TREE,"$tree_file") || die "ERROR: cannot open $tree_file\n";
while(<TREE>)
{
$line = $_;
chomp $line;
$nhx_tree = $nhx_tree.$line;
}
close(TREE);
return($nhx_tree);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment