Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 14:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065096 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065096 to your computer and use it in GitHub Desktop.
Perl script that finds cases where a full gene set for a species was loaded into the TreeFam mysql database, but no genes from that species were added to families.
#!/usr/local/bin/perl
#
# Perl script treefam_QC5.pl
# Written by Avril Coghlan (a.coghlan@ucc.ie)
# 4-Feb-09.
# Updated 26-June-09 to work with TreeFam release 8.
#
# This perl script finds cases where a full gene set for a species was
# loaded into the TreeFam mysql database, but no genes from that species
# were added to families.
#
# The command-line format is:
# % perl <treefam_QC5.pl> <release>
# where <release> is the release of the TreeFam database to use.
#
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of treefam_QC5.pl\n\n";
print "perl treefam_QC5.pl <release>\n";
print "where <release> is the release of the TreeFam database to use.\n";
print "For example, >perl -w treefam_QC5.pl 7\n";
exit;
}
#------------------------------------------------------------------#
# DECLARE MYSQL USERNAME AND HOST:
use DBI;
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE:
$release = $ARGV[0];
#------------------------------------------------------------------#
# READ THE LIST OF FULLY SEQUENCED SPECIES THAT WERE LOADED INTO THE
# TREEFAM MYSQL DATABASE:
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
$dbh = DBI->connect("$database", 'anonymous', '') || return;
($SPECIES,$SWCODE) = &read_fully_sequenced_species($dbh);
# FOR EACH OF THE FULLY SEQUENCED SPECIES LOADED INTO THE TREEFAM MYSQL
# DATABASE, SEE HOW MANY OF ITS GENES WERE PUT INTO FAMILIES:
foreach $TAX_ID (keys %{$SPECIES})
{
$taxname = $SPECIES->{$TAX_ID}; # eg. Aedes aegypti
if (!($SWCODE->{$TAX_ID})) { print STDERR "ERROR: do not know swcode for $TAX_ID\n"; exit;}
$swcode = $SWCODE->{$TAX_ID};
# CHECK IF THE GENES FROM THIS SPECIES WERE PUT INTO FAMILIES:
($num_genes,$num_genes_in_families) = &read_genes_in_families($dbh,$TAX_ID,$taxname,$swcode,$release);
if ($num_genes > 0)
{
$pc_genes_in_families = $num_genes_in_families*100/$num_genes;
}
else
{
$pc_genes_in_families = 0;
}
if ($pc_genes_in_families < 70)
{
print "WARNING: of $num_genes $taxname genes, $pc_genes_in_families % ($num_genes_in_families genes) are put into families\n";
}
}
# NOW DISCONNECT FROM THE DATABASE:
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# READ THE LIST OF FULLY SEQUENCED SPECIES THAT WERE LOADED INTO THE
# TREEFAM MYSQL DATABASE:
sub read_fully_sequenced_species
{
my $dbh = $_[0];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my %SPECIES = ();
my $tax_id;
my $taxname;
my $swcode;
my %SWCODE = ();
$table_w = 'species';
$st = "SELECT TAX_ID, TAXNAME, SWCODE from $table_w where FLAG=1";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$TAX_ID = $array[0]; # eg. 7159
$taxname = $array[1]; # eg. Aedes aegypti
$swcode = $array[2];
if ($SPECIES{$TAX_ID})
{
if ($SPECIES{$TAX_ID} ne $taxname)
{
print STDERR "ERROR: TAX_ID $TAX_ID taxname $taxname SPECIES(TAX_ID) is $SPECIES{$TAX_ID}\n";
exit;
}
}
else
{
$SPECIES{$TAX_ID} = $taxname;
}
if ($SWCODE{$TAX_ID})
{
if ($SWCODE{$TAX_ID} ne $swcode)
{
print STDERR "ERROR: TAX_ID $TAX_ID swcode $swcode SWCODE(TAX_ID) is $SWCODE{$TAX_ID}\n";
exit;
}
}
else
{
$SWCODE{$TAX_ID} = $swcode;
}
}
}
print STDERR "Read in fully sequenced species\n";
return(\%SPECIES,\%SWCODE);
}
#------------------------------------------------------------------#
# FOR EACH OF THE FULLY SEQUENCED SPECIES LOADED INTO THE TREEFAM MYSQL
# DATABASE, SEE HOW MANY OF ITS GENES WERE PUT INTO FAMILIES:
sub read_genes_in_families
{
my $dbh = $_[0];
my $TAX_ID = $_[1];
my $taxname = $_[2];
my $swcode = $_[3];
my $release = $_[4];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $num_genes = 0;
my %SEEN_GENE = ();
my @ids;
my $i;
my $id;
my $num_genes_in_families = 0;
my %SEEN_GENE_IN_FAMILY = ();
my %GID = ();
# FIRST READ IN ALL THE GENES FROM THIS SPECIES IN THE 'genes' TABLE:
$table_w = 'genes';
$st = "SELECT ID, GID from $table_w WHERE TAX_ID=$TAX_ID";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$ID = $array[0];
@ids = (@ids,$ID);
$GID = $array[1];
if (!($SEEN_GENE{$GID}))
{
$SEEN_GENE{$GID} = 1;
$num_genes++;
}
if ($GID{$ID})
{
print STDERR "ERROR: already have GID $GID{$ID} for $ID (GID $GID)\n";
exit;
}
else
{
$GID{$ID} = $GID;
}
}
}
print STDERR "There are $num_genes genes from $taxname in the mysql \'genes\' table\n";
# NOW CHECK IF EACH OF THE GENES WAS PUT INTO A FAMILY:
for ($i = 0; $i <= $#ids; $i++)
{
$id = $ids[$i];
if (!($GID{$id})) { print STDERR "ERROR: do not know GID for $id\n"; exit;}
$GID = $GID{$id};
# IF IT IS RELEASE 8:
if ($release eq '8') { $id = $id."_".$swcode;}
$table_w = 'fam_genes';
$st = "SELECT AC, ID from $table_w where ID=\'$id\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
$ID = $array[1];
if ($ID ne $id) { print STDERR "ERROR: id $id ID $ID AC $AC GID $GID\n"; exit;}
if (!($SEEN_GENE_IN_FAMILY{$GID}))
{
$SEEN_GENE_IN_FAMILY{$GID} = 1;
$num_genes_in_families++;
}
}
}
}
print STDERR "There are $num_genes_in_families genes from $taxname in the mysql \'fam_genes\' table\n";
return($num_genes,$num_genes_in_families);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment