Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 14:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065052 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065052 to your computer and use it in GitHub Desktop.
Perl script that finds TreeFam proteins that have a strong match to a family in the TreeFam mysql hmmer_matches table, but where the gene was not added to the 'fam_genes' table for the family or to any family.
#!/usr/local/bin/perl
#
# Perl script treefam_QC1.pl
# Written by Avril Coghlan (a.coghlan:ucc.ie)
# 9-Dec-08.
# Edited 9-Jan-09.
#
# This perl script finds TreeFam proteins that have a strong match to a family
# in the treefam mysql hmmer_matches table, but where the gene was not added to the
# 'fam_genes' table for the family or to any family.
#
# The command-line format is:
# % perl <treefam_QC1.pl> <release>
# where <release> is the release of the TreeFam database to use.
#
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of treefam_QC1.pl\n\n";
print "perl treefam_QC1.pl <release>\n";
print "where <release> is the release of the TreeFam database to use.\n";
print "For example, >perl -w treefam_QC1.pl 7\n";
exit;
}
#------------------------------------------------------------------#
# DECLARE MYSQL USERNAME AND HOST:
use DBI;
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE:
$release = $ARGV[0];
#------------------------------------------------------------------#
# READ IN THE PROTEINS CORRESPONDING TO EACH GENE:
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
$dbh = DBI->connect("$database", 'anonymous', '') || return;
$GENE = &read_proteins_for_genes($dbh);
# GET A LIST OF ALL GENES THAT WERE PUT INTO FAMILIES:
$FAMILY = &read_genes_in_families($dbh,$GENE);
# READ IN THE HMMER MATCHES BETWEEN GENES AND HMMS FOR FAMILIES, AND
# CHECK THAT GENES WITH STRONG HMMER MATCHES TO A FAMILY APPEAR IN THAT
# FAMILY IN THE 'fam_genes' TABLE:
&read_hmmer_matches_to_families($dbh,$FAMILY,$GENE);
# NOW DISCONNECT FROM THE DATABASE:
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# READ IN THE PROTEINS CORRESPONDING TO EACH GENE:
sub read_proteins_for_genes
{
my $dbh = $_[0];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $GID;
my $ID;
my %GENE = ();
$table_w = 'genes';
$st = "SELECT GID, ID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$GID = $array[0];
$ID = $array[1];
if ($GENE{$ID})
{
print STDERR "WARNING: have multiple gids for $ID\n";
$GENE{$ID} = $GID;
}
else
{
$GENE{$ID} = $GID;
}
}
}
print STDERR "Read in gene id for each protein id\n";
return(\%GENE);
}
#------------------------------------------------------------------#
# READ IN THE HMMER MATCHES BETWEEN GENES AND HMMS FOR FAMILIES, AND
# CHECK THAT GENES WITH STRONG HMMER MATCHES TO A FAMILY APPEAR IN THAT
# FAMILY IN THE 'fam_genes' TABLE:
sub read_hmmer_matches_to_families
{
my $dbh = $_[0];
my $FAMILY = $_[1];
my $GENE = $_[2];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $AC;
my $ID;
my $SCORE;
my $EVALUE;
my $family;
my $evalue_cutoff = 1e-10; # xxx
my $GID;
$table_w = 'hmmer_matches';
$st = "SELECT AC, ID, SCORE, EVALUE from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
$ID = $array[1];
$SCORE = $array[2];
$EVALUE = $array[3];
# FIND THE GENE ID FOR PROTEIN $ID:
if (!($GENE->{$ID}))
{
print STDERR "WARNING: do not know GID for protein $ID\n";
}
else
{
$GID = $GENE->{$ID};
# CHECK IF THE GENE $ID WAS NOT PUT INTO A FAMILY:
if (!($FAMILY->{$GID}))
{
# IF THE EVALUE IS VERY SIGNIFICANT FOR THE HMMER MATCH,
# WE SHOULD PROBABLY HAVE PUT THIS GENE INTO FAMILY $AC:
if ($EVALUE <= $evalue_cutoff)
{
print STDERR "WARNING: Should have put $ID (gene $GID) into family $AC (hmmer e-value $EVALUE, score $SCORE)\n";
}
}
}
}
}
}
#------------------------------------------------------------------#
# READ THE GENES THAT WERE PUT INTO FAMILIES:
sub read_genes_in_families
{
my $dbh = $_[0];
my $GENE = $_[1];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $FLAG;
my $AC;
my %FAMILY = ();
my $GID;
my @temp;
my $i;
$table_w = 'fam_genes';
$st = "SELECT AC, FLAG, ID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
$FLAG = $array[1];
$ID = $array[2]; # eg. AAEL009572-RA.1_AEDAE in TreeFam-8
@temp = split(/_/,$ID);
$ID = "";
for ($i = 0; $i <= $#temp-1; $i++) { $ID = $ID."_".$temp[$i];}
$ID = substr($ID,1,length($ID)-1);
# THIS TABLE 'fam_genes' HAS 'FULL'/'SEED' FAMILIES:
if ($FLAG eq 'FULL') # IF THIS IS A 'FULL' TREEFAM FAMILY (NOT A 'SEED' FAMILY).
{
# FIND THE GENE ID CORRESPONDING TO THIS PROTEIN ID:
if (!($GENE->{$ID})) { print STDERR "WARNING: do not know gene for $ID AC $AC FLAG $FLAG\n"; }
else
{
$GID = $GENE->{$ID};
if ($FAMILY{$GID})
{
$FAMILY{$GID} = $FAMILY{$GID}.",".$AC;
}
else
{
$FAMILY{$GID}= $AC;
}
}
}
}
}
print STDERR "Read in genes in each family\n";
return(\%FAMILY);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment