Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 13:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save avrilcoghlan/5064629 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5064629 to your computer and use it in GitHub Desktop.
Perl script to identify gene losses in human since divergence from chimp, based on TreeFam trees
#!/usr/local/bin/perl
#
# Perl script treefam_genelosses.pl
# Written by Avril Coghlan (alc@sanger.ac.uk).
# 28-Aug-06.
#
# For the TreeFam project.
#
# This perl script connects to the MYSQL database of
# TreeFam families and prints out a list of gene losses from
# fully sequenced genomes. This script uses Jean-Karim's TreeFam
# API. Finds losses in TreeFam-A clean trees (not seed, as misses fully sequenced genomes,
# and not full trees, as are sometimes wrong). xxx use TreeFam-B too?
#
# The command-line format is:
# % perl <treefam_genelosses.pl>
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 0)
{
print "Usage of treefam_genelosses.pl\n\n";
print "perl -w treefam_genelosses.pl\n";
print "For example, >perl -w treefam_genelosses.pl\n";
exit;
}
# DECLARE MYSQL USERNAME AND HOST:
use strict;
use Treefam::DBConnection;
my $VERBOSE = 0; # IF $VERBOSE==1, PRINT OUT EXTRA DETAILS.
my $dbc = Treefam::DBConnection->new();
#------------------------------------------------------------------#
# GET ALL THE TREEFAM-A FAMILIES IN THE CURRENT RELEASE:
my $famh = $dbc->get_FamilyHandle(); # GET A FAMILY HANDLE.
my @families = $famh->get_all_by_type('A'); # GET ALL TREEFAM-A FAMILIES.
my $no_families = 0;
foreach my $family(@families)
{
my $AC = $family->ID(); # GET THE FAMILY ID.
$no_families++;
print "\n$no_families: reading family $AC...\n";
my $tree = $family->tree('clean'); # GET THE TREEFAM-A CLEAN TREE. xxx look at treefam-b too?
# GET THE NAMES OF ALL THE GENES IN THIS TREE:
my @leaves = $tree->get_leaf_nodes;
foreach my $leaf(@leaves)
{
my @tags = $leaf->get_all_tags();
my $is_gene_loss = 0;
my $loss = "none";
foreach my $tag(@tags)
{
# THE TAGS ARE:
# G: GENE ID, eg. SPBC12D12.06 OR ENSG00000112237
# O: TREEFAM ID. eg. SPBC12D12.06 OR ENST00000326298.2
# S: SPECIES, eg. SCHP0 OR HUMAN
# E: GENE LOSS
# B: BOOTSTRAP VALUE
for my $value ($leaf->get_tag_values($tag))
{
if ($tag eq 'E')
{
$loss = $value;
$is_gene_loss = 1;
my @temp = split(/\$\-/,$loss);
$loss = $temp[1];
if ($loss ne 'HUMAN' && $loss ne 'Homo') { goto NEXT_LEAF;} # xxx JUST LOOKING AT LOSES OF HUMAN GENES WHERE THERE IS A CHIMP GENE.
# NOTE THAT IF CHIMP AND HUMAN WERE LOST, THEN $loss WOULD BE 'HUMAN,PANTR'.
print "___ $AC: ID ",$leaf->id(),"\n"; # THIS IS SYMBOL."_".SPECIES, eg. CCNC_HUMAN
print "___ GENE LOSS $loss (AC $AC)\n";
}
}
}
# IF THIS NODE $leaf->id() HAS A GENE LOSS ASSOCIATED WITH IT, CHECK WHETHER
# THE CLADE THAT IT IS IN IS GOOD EVIDENCE FOR THE LOSS:
my $parent = 'none';
if ($is_gene_loss == 1)
{
#---------------------------------------------------------#
FIND_PARENT:
# FIND THE PARENT NODE OF NODE $leaf:
if ($parent eq 'none')
{
if (defined($leaf->ancestor)) { $parent = $leaf->ancestor; }
else { goto ALL_PARENTS_FOUND; }
}
else # GET THE PARENT NODE OF NODE $parent:
{
if (defined($parent->ancestor)){ $parent = $parent->ancestor;}
else { goto ALL_PARENTS_FOUND; }
}
if ($VERBOSE == 1) { print "___ PARENT OF CURRENT NODE is", $parent->internal_id(),"\n";}
# FIND THE DESCENDENTS OF NODE $parent, AND CHECK IF THE CLADE OF DESCENDENTS
# IS GOOD EVIDENCE FOR THE GENE LOSS $loss:
my %SPECIES = (); # HASH TABLE TO RECORD SPECIES IN THIS CLADE.
for my $descendent ($parent->get_all_Descendents())
{
if (defined($descendent->id())) # IF THIS IS A LEAF.
{
my $id = $descendent->id();
if ($VERBOSE == 1) { print "___ WHICH HAS DESCENDENT $id\n";}
my @temp = split(/_/,$id);
my $species = $temp[$#temp];
$SPECIES{$species} = 1;
}
}
# CHECK WHETHER THE CLADE OF DESCENDENTS IS GOOD EVIDENCE FOR THE GENE LOSS $loss:
my $evidence_for_loss= &check_evidence_for_loss($loss,\%SPECIES);
if ($VERBOSE == 1) { print "___ EVIDENCE FOR LOSS IN $loss: $evidence_for_loss\n";}
if ($evidence_for_loss == 1) { goto ALL_PARENTS_FOUND;}
# FIND THE PARENT OF NODE $parent:
goto FIND_PARENT;
#---------------------------------------------------------#
ALL_PARENTS_FOUND:
if ($VERBOSE == 1) { print "___ ALL PARENTS FOUND\n\n";}
# IF THE CLADE OF DESCENDENTS IS GOOD EVIDENCE FOR GENE LOSS $loss, CHECK IF THERE
# ARE ANY OTHER LOSSES IN THAT CLADE THAT MAY MAKE $loss LESS BELIEVABLE:
print "___ EVIDENCE FOR LOSS IN $loss: $evidence_for_loss\n";
if ($evidence_for_loss == 1)
{
for my $descendent ($parent->get_all_Descendents())
{
if (defined($descendent->id())) # IF THIS IS A LEAF.
{
my $id = $descendent->id();
my ($loss2) = $descendent->get_tag_values('E');
if (defined($loss2))
{
my @temp2 = split(/\$-/,$loss2);
my $species2= $temp2[1];
print "descendent $id -> loss $loss2\n";
if (($loss eq 'HUMAN' || $loss eq 'Homo') && ($species2 =~ /PANTR/ || $species2 =~ /Pan/)) # xxx add other cases
{
$evidence_for_loss = 0;
}
}
}
}
}
print "___ FINAL EVIDENCE FOR LOSS IN $loss: now $evidence_for_loss\n";
#---------------------------------------------------------#
}
NEXT_LEAF: # xxx
}
}
print "\n\n";
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# SUBROUTINE TO CHECK WHETHER A CLADE OF GENES IS GOOD EVIDENCE FOR A GENE LOSS:
sub check_evidence_for_loss
{
my $loss = $_[0]; # THE SPECIES IN WHICH THE PUTATIVE GENE LOSS OCCURRED.
my $SPECIES = $_[1]; # POINTER TO HASH TABLE OF SPECIES IN THE CLADE THAT SPECIES $species IS MISSING FROM.
my $evidence = 1; # SAYS WHETHER THE EVIDENCE FOR GENE LOSS IS STRONG.
my @all = ('HUMAN','PANTR','MACMU','BOVIN','CANFA','MOUSE','RAT','MONDO',
'CHICK','XENTR','TETNG','FUGRU','BRARE','CIOIN','DROME','DROPS',
'CAEEL','CAEBR','CAERE','APIME','ANOGA','YEAST','SCHPO','ARATH','SCHMA');
# ALL THE FULLY SEQUENCED SPECIES.
my @expect; # THE SPECIES THAT WE EXPECT TO SEE IN THE CLADE.
my $i; #
my %EXPECT = (); #
my $expecti; #
my @expecti; #
my $j; #
my $found_expecti = 0; #
my $loss2; #
# CHECK WHETHER THERE IS STRONG EVIDENCE FOR A GENE LOSS: # xxx ADD MORE CASES
# FOR EXAMPLE, WE CAN BE SURE OF A HUMAN GENE LOSS IF THE CLADE CONTAINS AT LEAST
# ONE CHIMP OR ONE MACAQUE, AT LEAST ONE RAT OR ONE MOUSE, AT LEAST ONE COW, AND
# AT LEAST ONE DOG GENE:
if ($loss eq 'HUMAN') { @expect = ('PANTR','MACMU','MOUSE/RAT','BOVIN/CANFA'); }
# CHECK THAT WE SEE ALL THE SPECIES THAT WE EXPECT IN THE CLADE, AND NO OTHER SPECIES:
for ($i = 0; $i <= $#expect; $i++)
{
$expecti = $expect[$i];
@expecti = split(/\//,$expecti);
$found_expecti = 0;
# CHECK WHETHER WE SEE AT LEAST ONE OF PANTR/MACMU FOR EXAMPLE:
for ($j = 0; $j <= $#expecti; $j++)
{
$EXPECT{$expecti[$j]} = 1;
if ($SPECIES->{$expecti[$j]}) { $found_expecti = 1;}
}
if ($found_expecti == 0) { $evidence = 0;} # WE DON'T FIND A SPECIES THAT WE EXPECT TO SEE IN THE CLADE.
}
# CHECK IF WE SEE ANY SPECIES IN THE CLADE THAT WE DON'T EXPECT TO SEE:
for ($i = 0; $i <= $#all; $i++)
{
if (!($EXPECT{$all[$i]}) && $all[$i] ne $loss)
{
if ($SPECIES->{$all[$i]}) { $evidence = 0;} # WE FIND A SPECIES THAT WE DON'T EXPECT TO SEE IN THE CLADE.
}
}
return($evidence);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment