Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 16:33
Show Gist options
  • Save avrilcoghlan/5065850 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065850 to your computer and use it in GitHub Desktop.
Perl script that prints out all the gene losses identified in a particular TreeFam family based on the tree for the family.
#!/usr/local/bin/perl
#
# Perl script treefam_4_losses.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 24-Oct-07.
#
# For the TreeFam project.
#
# This perl script prints out the gene losses in a certain
# TreeFam-4 family.
#
# The command-line format is:
# % perl <treefam_4_losses.pl> family
# where family is the family identifier.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of treefam_4_losses.pl\n\n";
print "perl treefam_4_losses.pl <family>\n";
print "where <family> is the family identifier.\n";
print "For example, >perl -w treefam_4_losses.pl TF101000\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}
use Avril_modules;
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/";
use DBI;
use Treefam::DBConnection;
$VERBOSE = 1;
$VERBOSE2 = 1;
# FIND THE NAME OF THE INPUT FAMILY:
$family_id = $ARGV[0];
#------------------------------------------------------------------#
# GET ALL THE GENE LOSSES IN THIS FAMILY:
$dbc = Treefam::DBConnection->new();
$famh = $dbc->get_FamilyHandle();
$family = $famh->get_by_id($family_id);
if (!(defined($family)))
{
print "Do not have $family_id in the database...\n";
exit;
}
my $tree = $family->get_tree('clean');
# FIRST PRINT ALL THE GENES IN THE FAMILY:
my @leaves = $tree->get_leaves;
foreach my $leaf(@leaves)
{
if (defined($leaf->sequence_id())) # FOR SOME LEAVES, WE DON'T SEE TO HAVE
# A SEQUENCE ID, eg. SEE RICE GENE IN http://www.treefam.org/cgi-bin/TFinfo.pl?ac=TF101025
{
my $id = $leaf->sequence_id();
# GET THE ENSEMBL/ORIGINAL GENE NAME:
my $gene_handle = $dbc->get_GeneHandle();
if (defined($gene_handle->get_by_sequence_id($id))) { $gene = $gene_handle->get_by_sequence_id($id);}
else { print STDERR "ERROR: cannot get gene for id $id\n"; exit; }
if (defined($gene->ID)) { $gene_id = $gene->ID;}
else { $gene_id = "unknown";}
print "HAVE GENE $gene_id\n";
}
}
# GET THE NAMES OF ALL THE GENES IN THIS TREE:
foreach my $leaf(@leaves)
{
my @tags = $leaf->get_all_tags();
my $is_gene_loss = 0;
my $loss = "none";
foreach my $tag(@tags)
{
# THE TAGS ARE:
# G: GENE ID, eg. SPBC12D12.06 OR ENSG00000112237
# O: TREEFAM ID. eg. SPBC12D12.06 OR ENST00000326298.2
# S: SPECIES, eg. SCHP0 OR HUMAN
# E: GENE LOSS
# B: BOOTSTRAP VALUE
for my $value ($leaf->get_tag_value($tag))
{
if ($tag eq 'E') # IT IS A GENE LOSS.
{
$loss = $value;
$is_gene_loss = 1;
my @temp = split(/\$\-/,$loss);
$loss = $temp[1];
# JUST LOOKING AT LOSSES OF YEAST AND DROSOPHILA GENES.
if ($VERBOSE2 == 1)
{
print "___ $family_id: ID ",$leaf->id(),"\n"; # THIS IS SYMBOL."_".SPECIES, eg. CCNC_HUMAN
}
print "___ GENE LOSS $loss (AC $family_id)\n";
}
}
}
# IF THIS NODE $leaf->id() HAS A GENE LOSS ASSOCIATED WITH IT, CHECK WHETHER
# THE CLADE THAT IT IS IN IS GOOD EVIDENCE FOR THE LOSS:
my $parent = 'none';
if ($is_gene_loss == 1)
{
#---------------------------------------------------------#
FIND_PARENT:
# FIND THE PARENT NODE OF NODE $leaf:
if ($parent eq 'none')
{
if (defined($leaf->parent)) { $parent = $leaf->parent; }
else { goto ALL_PARENTS_FOUND; }
}
else # GET THE PARENT NODE OF NODE $parent:
{
if (defined($parent->parent)) { $parent = $parent->parent; }
else { goto ALL_PARENTS_FOUND; }
}
if ($VERBOSE == 1) { print "___ PARENT OF CURRENT NODE is", $parent->internalID,"\n";}
# FIND THE DESCENDENTS OF NODE $parent, AND CHECK IF THE CLADE OF DESCENDENTS
# IS GOOD EVIDENCE FOR THE GENE LOSS $loss:
my %SPECIES = (); # HASH TABLE TO RECORD SPECIES IN THIS CLADE.
for my $descendent ($parent->children())
{
if (defined($descendent->sequence_id())) # IF THIS IS A LEAF.
{
my $id = $descendent->sequence_id();
# GET THE ENSEMBL/ORIGINAL GENE NAME:
my $gene_handle= $dbc->get_GeneHandle();
if (defined($gene_handle->get_by_sequence_id($id))) { $gene = $gene_handle->get_by_sequence_id($id);}
else { print STDERR "ERROR: cannot get gene for id $id.\n"; exit; }
if (defined($gene->ID)) { $gene_id = $gene->ID;}
else { $gene_id = "unknown";}
if ($VERBOSE == 1) { print "___ WHICH HAS DESCENDENT $gene_id\n";}
my @temp = split(/_/,$id);
my $species = $temp[$#temp];
$SPECIES{$species} = 1;
}
}
# CHECK WHETHER THE CLADE OF DESCENDENTS IS GOOD EVIDENCE FOR THE GENE LOSS $loss:
my $evidence_for_loss= 1;
if ($VERBOSE2 == 1) { print "___ EVIDENCE FOR LOSS IN $loss: $evidence_for_loss\n";}
if ($evidence_for_loss == 1) { goto ALL_PARENTS_FOUND;}
# FIND THE PARENT OF NODE $parent:
goto FIND_PARENT;
#---------------------------------------------------------#
ALL_PARENTS_FOUND:
if ($VERBOSE2 == 1) { print "___ ALL PARENTS FOUND\n\n";}
#---------------------------------------------------------#
}
NEXT_LEAF:
}
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment