Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 15:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065204 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065204 to your computer and use it in GitHub Desktop.
Perl script that prints out the total number of genes in families, and the total number of families, in a particular TreeFam release
#!/usr/local/bin/perl
#
# Perl script treefam_release2.pl
# Written by Avril Coghlan (a.coghlan@ucc.ie)
# 10-Feb-09.
#
# This perl script prints out the total number of genes in families,
# and the total number of families, in a particular TreeFam release.
#
# The command-line format is:
# % perl <treefam_release2.pl> <release>
# where <release> is the release of the TreeFam database to use.
#
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of treefam_release2.pl\n\n";
print "perl treefam_release2.pl <release>\n";
print "where <release> is the release of the TreeFam database to use.\n";
print "For example, >perl -w treefam_release2.pl 7\n";
exit;
}
#------------------------------------------------------------------#
# DECLARE MYSQL USERNAME AND HOST:
use DBI;
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE:
$release = $ARGV[0];
#------------------------------------------------------------------#
# FIND THE TOTAL NUMBER OF GENES IN FAMILIES IN THIS TREEFAM RELEASE:
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
$dbh = DBI->connect("$database", 'anonymous', '') || return;
&count_genes_in_families($dbh);
# NOW DISCONNECT FROM THE DATABASE:
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# FIND THE TOTAL NUMBER OF GENES IN FAMILIES IN THIS TREEFAM RELEASE:
sub count_genes_in_families
{
my $dbh = $_[0];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $ID;
my $GID;
my %GID = ();
my %SEEN = ();
my $num_genes = 0;
my %SEEN_FAMILY = ();
my $num_families = 0;
# READ IN THE GENE IDS FOR TRANSCRIPT IDS:
$table_w = 'genes';
$st = "SELECT ID, GID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$ID = $array[0];
$GID = $array[1];
if ($GID{$ID}) { print STDERR "ERROR: already have GID for ID $ID\n"; exit;}
$GID{$ID} = $GID;
}
}
# READ IN THE GENES IN FAMILIES:
$table_w = 'fam_genes';
$st = "SELECT ID, AC from $table_w WHERE FLAG=\'FULL\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$ID = $array[0];
$AC = $array[1];
if ($GID{$ID}) # FOR SOME IDs IN THE fam_genes TABLE, THERE IS NO GID DEFINED IN THE
# genes TABLE. THIS OCCURS FOR TRANSCRIPTS IN SEED TREEFAM-A FAMILIES
# WHICH CAME FROM VERY OLD GENE SETS:
{
$GID = $GID{$ID};
if (!($SEEN{$GID}))
{
$SEEN{$GID} = 1;
$num_genes++;
}
}
if (!($SEEN_FAMILY{$AC}))
{
$SEEN_FAMILY{$AC} = 1;
$num_families++;
}
}
}
print STDERR "Read in $num_genes genes in $num_families families\n";
print "Read in $num_genes genes in $num_families families\n";
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment