Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 14:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065069 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065069 to your computer and use it in GitHub Desktop.
Perl script that finds cases where a TreeFam family is listed in the TreeFam mysql database, but has no genes listed in the 'fam_genes' table.
#!/usr/local/bin/perl
#
# Perl script treefam_QC3.pl
# Written by Avril Coghlan (a.coghlan@ucc.ie)
# 3-Feb-09.
#
# This perl script finds cases where there is a TreeFam family listed in the
# 'familyA' or 'familyB' or 'familyC' tables, but it has no genes listed in
# the 'fam_genes' table.
#
# The command-line format is:
# % perl <treefam_QC3.pl> <release>
# where <release> is the release of the TreeFam database to use.
#
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of treefam_QC3.pl\n\n";
print "perl treefam_QC3.pl <release>\n";
print "where <release> is the release of the TreeFam database to use.\n";
print "For example, >perl -w treefam_QC3.pl 7\n";
exit;
}
#------------------------------------------------------------------#
# DECLARE MYSQL USERNAME AND HOST:
use DBI;
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE:
$release = $ARGV[0];
#------------------------------------------------------------------#
# READ IN THE FAMILIIES THAT APPEAR IN THE 'fam_genes' TABLE:
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
$dbh = DBI->connect("$database", 'anonymous', '') || return;
$FAMILY = &read_genes_in_families($dbh);
# FIND IF THERE ARE FAMILIES IN THE FAMILYA/FAMILYB/FAMILYC TABLES,
# THAT DO NOT APPEAR IN THE fam_genes TABLE:
&read_all_families($dbh,$FAMILY);
# NOW DISCONNECT FROM THE DATABASE:
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# FIND IF THERE ARE FAMILIES IN THE FAMILYA/FAMILYB/FAMILYC TABLES,
# THAT DO NOT APPEAR IN THE fam_genes TABLE:
sub read_all_families
{
my $dbh = $_[0];
my $FAMILY = $_[1];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $i;
my %SEEN;
my $key;
my @temp;
for ($i = 1; $i <= 3; $i++)
{
if ($i == 1) { $table_w = 'familyA';}
elsif ($i == 2) { $table_w = 'familyB';}
elsif ($i == 3) { $table_w = 'familyC';}
$st = "SELECT AC from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
$SEEN{$AC} = 1;
# CHECK IF GENES IN THE SEED FAMILY FOR $AC APPEAR IN THE
# fam_genes TABLE. NOTE THAT THE GENES IN THE SEED FAMILY ARE
# ONLY STORED FOR TREEFAM-A (TF1XXXXX) FAMILIES:
if (substr($AC,0,3) eq 'TF1')
{
if (!($FAMILY->{$AC."=SEED"}))
{
print "WARNING: the fam_genes table doesn't contain genes in the $AC SEED family\n";
}
}
# CHECK IF GENES IN THE FULL FAMILY FOR $AC APPEAR IN THE
# fam_genes TABLE:
if (!($FAMILY->{$AC."=FULL"}))
{
print "WARNING: the fam_genes table doesn't contain genes in the $AC FULL family\n";
}
}
}
}
# CHECK IF THERE ARE GENES FOR FAMILIES THAT APPEAR IN THE fam_genes
# TABLE, BUT WHERE THE FAMILY DOESN'T APPEAR IN THE familyA OR familyB
# OR familyC TABLES:
foreach $key (keys %{$FAMILY})
{
@temp = split(/=/,$key);
$AC = $temp[0];
if (!($SEEN{$AC}))
{
print "WARNING: family $AC appears in the fam_genes table but not in the familyA/familyB/familyC tables\n";
}
}
}
#------------------------------------------------------------------#
# READ THE GENES THAT WERE PUT INTO FAMILIES:
sub read_genes_in_families
{
my $dbh = $_[0];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $AC;
my $FLAG;
my %FAMILY = ();
my $ID;
$table_w = 'fam_genes';
$st = "SELECT AC, FLAG, ID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
$FLAG = $array[1];
$ID = $array[2];
# SOMETIMES $ID CAN BE '_null_;;' (eg. see TF328536 in TREEFAM-7):
if (!($ID =~ /null/) && !($ID =~ /NULL/))
{
# THIS TABLE 'fam_genes' HAS 'FULL'/'SEED' FAMILIES:
$FAMILY{$AC."=".$FLAG} = 1;
}
}
}
print STDERR "Read in genes in each family\n";
return(\%FAMILY);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment