Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 14:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5064924 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5064924 to your computer and use it in GitHub Desktop.
Perl script that identifies singleton genes in a species, by finding genes from that species that appear in TreeFam families that do not have any other genes from that species
#! /usr/bin/perl
# Avril Coghlan
# 24-Feb-09
# Perl script to identify singleton genes in a species, defined by genes
# that do not appear in a TreeFam family with any gene from the same species.
use DBI;
use lib "/home/bcri/acoghlan/perlmodulesAvril/"; # edit to point to the TreeFam perl api xxx
use Treefam::DBConnection;
$the_species = $ARGV[0]; # the species name, eg. CAEEL
$release = $ARGV[1]; # the treefam release to use
$VERBOSE = 1;
# get all the TreeFam families that contain C. elegans genes:
$dbc = Treefam::DBConnection->new();
$famh = $dbc->get_FamilyHandle();
my @families = $famh->get_all_by_species($the_species);
%SINGLETON = ();
for ($i = 0; $i <= $#families; $i++)
{
$family = $families[$i];
# Note: it seems to be necessary to go around a quite circular route to get the
# treefam family object in treefam-7 (this wasn't necessary in treefam-6):
if ($family eq '') { print STDERR "WARNING: family $family\n"; goto NEXT_FAMILY;}
if ($VERBOSE == 1) { print "1. Looking at family $family...\n";}
if (defined($family->AC())) { $family = $family->AC(); }
if ($VERBOSE == 1) { print "2. Looking at family $family...\n";}
# GET THE FAMILY FOR THIS TREE:
$famh = $dbc->get_FamilyHandle();
$family = $famh->get_by_id($family);
# GET THE TREE FOR THE FAMILY:
if (!(defined($family->get_tree('clean'))))
{
print "WARNING: AC $AC: there is no clean tree!\n";
goto NEXT_FAMILY;
}
$tree = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE.
if (!(defined($tree->get_leaves)))
{
print "WARNING: AC $AC: no leaves in tree!\n";
goto NEXT_FAMILY;
}
$num_species_genes = 0;
$species_genes = "";
@nodes = $tree->get_leaves;
foreach $node(@nodes)
{
# FIND THE SPECIES FOR THIS TRANSCRIPT:
if (defined($node->get_tag_value('S')))
{
$species = $node->get_tag_value('S');
}
else
{
print "WARNING: AC $AC: do not know species of node\n";
$species = "none";
}
$species =~ tr/[a-z]/[A-Z]/;
if ($species eq $the_species)
{
$num_species_genes++;
# FIND THE GENE FOR THIS TRANSCRIPT:
if ($the_species eq 'CAEEL' && $release eq '6')
{
$gene = $node->sequence_id(); # eg. T24D1.1b, note $node->gene gives the wbg gene name
$gene =~ tr/[a-z]/[A-Z]/;
$last_letter = substr($gene,length($gene)-1,1);
if ($last_letter =~ /[A-Z]/) { chop($gene);}
}
elsif ($the_species eq 'CAEEL' && $release eq '7')
{
$gene = $node->gene();
$gene = $gene->ID();
}
elsif ($the_species eq 'CAEEL' && $release eq '5')
{
$gene = $node->gene(); # eg. MTCE.3 or T24D1.1
$gene = $gene->ID();
}
elsif ($the_species eq 'CAEEL' && $release eq '4')
{
$gene = $node->gene(); # eg. MTCE.3 or T24D1.1
$gene = $gene->ID();
}
elsif (($the_species eq 'HUMAN' || $the_species eq 'MOUSE') && $release eq '7')
{
$gene = $node->gene(); # eg. ENSG00000215614
$gene = $gene->ID();
}
elsif (($the_species eq 'HUMAN' || $the_species eq 'MOUSE') && $release eq '6')
{
$gene = $node->gene(); # eg. ENSG00000215614
$gene = $gene->ID();
}
elsif (($the_species eq 'HUMAN' || $the_species eq 'MOUSE') && $release eq '5')
{
$gene = $node->gene(); # eg. ENSG00000215614
$gene = $gene->ID();
}
elsif (($the_species eq 'HUMAN' || $the_species eq 'MOUSE') && $release eq '4')
{
$gene = $node->gene(); # eg. ENSG00000215614
$gene = $gene->ID();
}
elsif ($the_species eq 'DROME' && $release eq '7')
{
$gene = $node->gene();
$gene = $gene->ID();
}
else
{
print STDERR "ERROR: the_species $the_species release $release\n";
exit;
}
$species_genes = $species_genes.",".$gene;
}
}
if ($num_species_genes == 1)
{
$gene = substr($species_genes,1,length($species_genes)-1);
if (!($SINGLETON{$gene})) { $SINGLETON{$gene} = "yes";}
}
elsif ($num_species_genes >= 2)
{
$species_genes = substr($species_genes,1,length($species_genes)-1);
@species_genes = split(/\,/,$species_genes);
for ($j = 0; $j <= $#species_genes; $j++)
{
$gene = $species_genes[$j];
$SINGLETON{$gene} = "no";
}
}
NEXT_FAMILY:
}
# PRINT OUT ALL THE SINGLETON GENES:
foreach $gene (keys %SINGLETON)
{
if ($SINGLETON{$gene} eq 'yes')
{
print "$gene\n";
}
}
print STDERR "FINISHED\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment