Created
March 1, 2013 16:33
-
-
Save avrilcoghlan/5065850 to your computer and use it in GitHub Desktop.
Perl script that prints out all the gene losses identified in a particular TreeFam family based on the tree for the family.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_4_losses.pl | |
# Written by Avril Coghlan (alc@sanger.ac.uk) | |
# 24-Oct-07. | |
# | |
# For the TreeFam project. | |
# | |
# This perl script prints out the gene losses in a certain | |
# TreeFam-4 family. | |
# | |
# The command-line format is: | |
# % perl <treefam_4_losses.pl> family | |
# where family is the family identifier. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of treefam_4_losses.pl\n\n"; | |
print "perl treefam_4_losses.pl <family>\n"; | |
print "where <family> is the family identifier.\n"; | |
print "For example, >perl -w treefam_4_losses.pl TF101000\n"; | |
exit; | |
} | |
# READ IN MY PERL MODULES: | |
BEGIN { | |
unshift (@INC, '/nfs/team71/phd/alc/perl/modules'); | |
} | |
use Avril_modules; | |
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/"; | |
use DBI; | |
use Treefam::DBConnection; | |
$VERBOSE = 1; | |
$VERBOSE2 = 1; | |
# FIND THE NAME OF THE INPUT FAMILY: | |
$family_id = $ARGV[0]; | |
#------------------------------------------------------------------# | |
# GET ALL THE GENE LOSSES IN THIS FAMILY: | |
$dbc = Treefam::DBConnection->new(); | |
$famh = $dbc->get_FamilyHandle(); | |
$family = $famh->get_by_id($family_id); | |
if (!(defined($family))) | |
{ | |
print "Do not have $family_id in the database...\n"; | |
exit; | |
} | |
my $tree = $family->get_tree('clean'); | |
# FIRST PRINT ALL THE GENES IN THE FAMILY: | |
my @leaves = $tree->get_leaves; | |
foreach my $leaf(@leaves) | |
{ | |
if (defined($leaf->sequence_id())) # FOR SOME LEAVES, WE DON'T SEE TO HAVE | |
# A SEQUENCE ID, eg. SEE RICE GENE IN http://www.treefam.org/cgi-bin/TFinfo.pl?ac=TF101025 | |
{ | |
my $id = $leaf->sequence_id(); | |
# GET THE ENSEMBL/ORIGINAL GENE NAME: | |
my $gene_handle = $dbc->get_GeneHandle(); | |
if (defined($gene_handle->get_by_sequence_id($id))) { $gene = $gene_handle->get_by_sequence_id($id);} | |
else { print STDERR "ERROR: cannot get gene for id $id\n"; exit; } | |
if (defined($gene->ID)) { $gene_id = $gene->ID;} | |
else { $gene_id = "unknown";} | |
print "HAVE GENE $gene_id\n"; | |
} | |
} | |
# GET THE NAMES OF ALL THE GENES IN THIS TREE: | |
foreach my $leaf(@leaves) | |
{ | |
my @tags = $leaf->get_all_tags(); | |
my $is_gene_loss = 0; | |
my $loss = "none"; | |
foreach my $tag(@tags) | |
{ | |
# THE TAGS ARE: | |
# G: GENE ID, eg. SPBC12D12.06 OR ENSG00000112237 | |
# O: TREEFAM ID. eg. SPBC12D12.06 OR ENST00000326298.2 | |
# S: SPECIES, eg. SCHP0 OR HUMAN | |
# E: GENE LOSS | |
# B: BOOTSTRAP VALUE | |
for my $value ($leaf->get_tag_value($tag)) | |
{ | |
if ($tag eq 'E') # IT IS A GENE LOSS. | |
{ | |
$loss = $value; | |
$is_gene_loss = 1; | |
my @temp = split(/\$\-/,$loss); | |
$loss = $temp[1]; | |
# JUST LOOKING AT LOSSES OF YEAST AND DROSOPHILA GENES. | |
if ($VERBOSE2 == 1) | |
{ | |
print "___ $family_id: ID ",$leaf->id(),"\n"; # THIS IS SYMBOL."_".SPECIES, eg. CCNC_HUMAN | |
} | |
print "___ GENE LOSS $loss (AC $family_id)\n"; | |
} | |
} | |
} | |
# IF THIS NODE $leaf->id() HAS A GENE LOSS ASSOCIATED WITH IT, CHECK WHETHER | |
# THE CLADE THAT IT IS IN IS GOOD EVIDENCE FOR THE LOSS: | |
my $parent = 'none'; | |
if ($is_gene_loss == 1) | |
{ | |
#---------------------------------------------------------# | |
FIND_PARENT: | |
# FIND THE PARENT NODE OF NODE $leaf: | |
if ($parent eq 'none') | |
{ | |
if (defined($leaf->parent)) { $parent = $leaf->parent; } | |
else { goto ALL_PARENTS_FOUND; } | |
} | |
else # GET THE PARENT NODE OF NODE $parent: | |
{ | |
if (defined($parent->parent)) { $parent = $parent->parent; } | |
else { goto ALL_PARENTS_FOUND; } | |
} | |
if ($VERBOSE == 1) { print "___ PARENT OF CURRENT NODE is", $parent->internalID,"\n";} | |
# FIND THE DESCENDENTS OF NODE $parent, AND CHECK IF THE CLADE OF DESCENDENTS | |
# IS GOOD EVIDENCE FOR THE GENE LOSS $loss: | |
my %SPECIES = (); # HASH TABLE TO RECORD SPECIES IN THIS CLADE. | |
for my $descendent ($parent->children()) | |
{ | |
if (defined($descendent->sequence_id())) # IF THIS IS A LEAF. | |
{ | |
my $id = $descendent->sequence_id(); | |
# GET THE ENSEMBL/ORIGINAL GENE NAME: | |
my $gene_handle= $dbc->get_GeneHandle(); | |
if (defined($gene_handle->get_by_sequence_id($id))) { $gene = $gene_handle->get_by_sequence_id($id);} | |
else { print STDERR "ERROR: cannot get gene for id $id.\n"; exit; } | |
if (defined($gene->ID)) { $gene_id = $gene->ID;} | |
else { $gene_id = "unknown";} | |
if ($VERBOSE == 1) { print "___ WHICH HAS DESCENDENT $gene_id\n";} | |
my @temp = split(/_/,$id); | |
my $species = $temp[$#temp]; | |
$SPECIES{$species} = 1; | |
} | |
} | |
# CHECK WHETHER THE CLADE OF DESCENDENTS IS GOOD EVIDENCE FOR THE GENE LOSS $loss: | |
my $evidence_for_loss= 1; | |
if ($VERBOSE2 == 1) { print "___ EVIDENCE FOR LOSS IN $loss: $evidence_for_loss\n";} | |
if ($evidence_for_loss == 1) { goto ALL_PARENTS_FOUND;} | |
# FIND THE PARENT OF NODE $parent: | |
goto FIND_PARENT; | |
#---------------------------------------------------------# | |
ALL_PARENTS_FOUND: | |
if ($VERBOSE2 == 1) { print "___ ALL PARENTS FOUND\n\n";} | |
#---------------------------------------------------------# | |
} | |
NEXT_LEAF: | |
} | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment