Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 13:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5064651 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5064651 to your computer and use it in GitHub Desktop.
Perl script to find genes that appear in more than one TreeFam-A seed tree
#!/usr/local/bin/perl
#
# Perl script treefam_overlaps2.pl
# Written by Avril Coghlan (alc@sanger.ac.uk).
# 25-Aug-06.
#
# For the TreeFam project.
#
# This perl script connects to the MYSQL database of
# TreeFam families and prints out a list of the genes that
# appear in more than one TreeFam-A seed tree. This script
# uses Jean-Karim's TreeFam API.
#
# The command-line format is:
# % perl <treefam_overlaps2.pl>
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 0)
{
print "Usage of treefam_overlaps2.pl\n\n";
print "perl -w treefam_overlaps2.pl\n";
print "For example, >perl -w treefam_overlaps2.pl\n";
exit;
}
# DECLARE MYSQL USERNAME AND HOST:
use strict;
use Treefam::DBConnection;
my $VERBOSE = 0; # IF $VERBOSE==1, PRINT OUT EXTRA DETAILS.
my $dbc = Treefam::DBConnection->new();
#------------------------------------------------------------------#
my %FAMILY = (); # HASH TABLE TO KEEP A RECORD OF THE TREEFAM-A FAMILIES THAT A GENE IS IN.
# GET ALL THE TREEFAM-A FAMILIES IN THE CURRENT RELEASE:
my $famh = $dbc->get_FamilyHandle(); # GET A FAMILY HANDLE.
my @families = $famh->get_all_by_type('A'); # GET ALL TREEFAM-A FAMILIES.
my $no_families = 0;
foreach my $family(@families)
{
if ($VERBOSE == 1) { print $family->ID,"\n";}
my $AC = $family->ID(); # GET THE FAMILY ID.
$no_families++;
print STDERR "$no_families: reading family $AC...\n";
my $tree = $family->tree('seed'); # GET THE TREEFAM-A SEED TREE.
# GET THE NAMES OF ALL THE GENES IN THIS TREE:
my @leaves = $tree->get_leaf_nodes;
foreach my $leaf(@leaves)
{
my @tags = $leaf->get_all_tags();
if ($VERBOSE == 1) { print "ID: ",$leaf->id(),"\t"; } # THIS IS THE SYMBOL."_".SPECIES, eg. srb11_SCHP0 OR CCNC_HUMAN
foreach my $tag(@tags)
{
# THE TAGS ARE:
# G: GENE ID, eg. SPBC12D12.06 OR ENSG00000112237
# O: TREEFAM ID. eg. SPBC12D12.06 OR ENST00000326298.2
# S: SPECIES, eg. SCHP0 OR HUMAN
for my $value ($leaf->get_tag_values($tag))
{
if ($tag eq 'G')
{
my $GID = $value;
if ($VERBOSE == 1) { print "gene $GID (AC $AC)\n";}
# REMEMBER THE FAMILIES THAT THIS GENE IS IN:
if (!($FAMILY{$GID})) { $FAMILY{$GID} = $AC; }
else { $FAMILY{$GID} = $FAMILY{$GID}.",".$AC;}
}
}
}
}
}
if ($VERBOSE == 1) { print "\n\n";}
#------------------------------------------------------------------#
# PRINT OUT A LIST OF THE GENES THAT APPEAR IN TREEFAM-A SEED FAMILIES, AND THE
# FAMILIES THAT THEY APPEAR IN:
print "GENE NUMBER_OF_FAMILIES FAMILIES\n";
my $no_overlapping_families = 0;
my $no_genes = 0;
my %SEEN = ();
foreach my $GID (keys %FAMILY)
{
my $family = $FAMILY{$GID};
my @family = split(/\,/,$family); # THIS IS A LIST OF THE FAMILIES THAT A GENE APPEARS IN.
my $no_families = $#family + 1; # THIS IS THE NUMBER OF FAMILIES THAT A GENE APPEARS IN.
if ($no_families > 1)
{
if ($SEEN{$GID}) { print STDERR "ERROR: already counted gene $GID.\n"; exit;}
$SEEN{$GID} = 1;
$no_genes++;
print "$GID $no_families $family\n";
# CHECK IF WE HAVE SEEN THESE FAMILIES BEFORE:
for (my $i = 0; $i < $no_families; $i++)
{
my $this_family = $family[$i];
if (!($SEEN{$this_family}))
{
$no_overlapping_families++;
$SEEN{$this_family} = 1;
}
}
}
}
print "There are $no_genes genes in more than one TreeFam-A seed family.\n";
print "They are in $no_overlapping_families different TreeFam-A seed families.\n";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment