Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Perl script that finds TreeFam proteins that were added to a particular family, but actually have a stronger hmmer match to a different family.
#!/usr/local/bin/perl
#
# Perl script treefam_QC10.pl
# Written by Avril Coghlan (a.coghlan@ucc.ie)
# 4-Feb-09.
#
# This perl script finds TreeFam proteins were added to a particular
# family, but actually have a stronger hmmer match to a different family.
#
# The command-line format is:
# % perl <treefam_QC10.pl> <release>
# where <release> is the release of the TreeFam database to use.
#
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of treefam_QC10.pl\n\n";
print "perl treefam_QC10.pl <release>\n";
print "where <release> is the release of the TreeFam database to use.\n";
print "For example, >perl -w treefam_QC10.pl 7\n";
exit;
}
#------------------------------------------------------------------#
# DECLARE MYSQL USERNAME AND HOST:
use DBI;
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE:
$release = $ARGV[0];
#------------------------------------------------------------------#
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
$dbh = DBI->connect("$database", 'anonymous', '') || return;
# READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS AND HMMS FOR FAMILIES:
($HMM,$HMMEVAL,$HMMEVAL2,$HMMSCORE,$HMMSCORE2) = &read_hmmer_matches_to_families($dbh);
# GET A LIST OF ALL TRANSCRIPTS THAT WERE PUT INTO FAMILIES, AND CHECK
# THAT IF A TRANSCRIPT WAS PUT INTO A FAMILY, THAT IT DOESN'T HAVE
# A STRONGER HMMER MATCH TO ANOTHER FAMILY:
&read_transcripts_in_families($dbh,$HMM,$HMMEVAL,$HMMEVAL2,$HMMSCORE,$HMMSCORE2);
# NOW DISCONNECT FROM THE DATABASE:
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS AND HMMS FOR FAMILIES:
sub read_hmmer_matches_to_families
{
my $dbh = $_[0];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $AC;
my $ID;
my $SCORE;
my $EVALUE;
my %HMM = ();
my %HMMEVAL = ();
my %HMMSCORE = ();
my $hmmeval;
my $hmmscore;
my %HMMEVAL2 = ();
my %HMMSCORE2 = ();
$table_w = 'hmmer_matches';
$st = "SELECT AC, ID, SCORE, EVALUE from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
$ID = $array[1];
$SCORE = $array[2];
$EVALUE = $array[3];
# RECORD THE BEST HMMER MATCH FOR THIS TRANSCRIPT:
if (!($HMM{$ID}))
{
$HMM{$ID} = $AC;
if ($EVALUE == 0) { $EVALUE = "zero";}
if ($SCORE == 0) { $SCORE = "zero";}
$HMMEVAL{$ID} = $EVALUE;
$HMMSCORE{$ID} = $SCORE;
$HMMEVAL2{$ID."_".$AC} = $EVALUE;
$HMMSCORE2{$ID."_".$AC} = $SCORE;
}
else
{
if (!($HMMEVAL{$ID})) { print STDERR "ERROR: do not know hmmeval for $ID HMM $HMM{$ID}\n"; exit;}
$hmmeval = $HMMEVAL{$ID};
if (!($HMMSCORE{$ID})) { print STDERR "ERROR: do not know hmmscore for $ID HMM $HMM{$ID}\n"; exit;}
$hmmscore = $HMMSCORE{$ID};
if ($hmmeval eq 'zero') { $hmmeval = 0;}
if ($hmmscore eq 'zero') { $hmmscore = 0;}
if ($EVALUE < $hmmeval || ($EVALUE == $hmmeval && $SCORE > $hmmscore))
{
$HMM{$ID} = $AC;
if ($EVALUE == 0) { $EVALUE = "zero";}
if ($SCORE == 0) { $SCORE = "zero";}
$HMMEVAL{$ID} = $EVALUE;
$HMMSCORE{$ID} = $SCORE;
$HMMEVAL2{$ID."_".$AC} = $EVALUE;
$HMMSCORE2{$ID."_".$AC} = $SCORE;
}
else
{
if ($EVALUE == 0) { $EVALUE = "zero";}
if ($SCORE == 0) { $SCORE = "zero";}
$HMMEVAL2{$ID."_".$AC} = $EVALUE;
$HMMSCORE2{$ID."_".$AC} = $SCORE;
}
}
}
}
print STDERR "Read in HMM matches for transcripts\n";
print "Read in HMM matches for transcripts\n";
return(\%HMM,\%HMMEVAL,\%HMMEVAL2,\%HMMSCORE,\%HMMSCORE2);
}
#------------------------------------------------------------------#
# READ THE TRANSCRIPTS THAT WERE PUT INTO FAMILIES:
sub read_transcripts_in_families
{
my $dbh = $_[0];
my $HMM = $_[1];
my $HMMEVAL = $_[2];
my $HMMEVAL2 = $_[3];
my $HMMSCORE = $_[4];
my $HMMSCORE2 = $_[5];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $FLAG;
my $AC;
my $AVeval;
my $ACscore;
my $ID;
my $family;
my $familyeval;
my $familyscore;
$table_w = 'fam_genes';
$st = "SELECT AC, FLAG, ID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
$FLAG = $array[1];
$ID = $array[2];
# THIS TABLE 'fam_genes' HAS 'FULL'/'SEED' FAMILIES:
if ($FLAG eq 'FULL') # IF THIS IS A 'FULL' TREEFAM FAMILY (NOT A 'SEED' FAMILY).
{
# CHECK THAT THIS IS THE FAMILY THAT $ID HAS THE STRONGEST
# HMMER MATCH TO:
if ($HMM->{$ID})
{
$family = $HMM->{$ID};
if (!($HMMEVAL->{$ID})) { print STDERR "ERROR: do not know evalue for $ID\n"; exit;}
if (!($HMMSCORE->{$ID})) { print STDERR "ERROR: do not know score for $ID\n"; exit;}
$familyeval = $HMMEVAL->{$ID};
$familyscore = $HMMSCORE->{$ID};
if ($familyeval eq 'zero') { $familyeval = 0;}
if ($familyscore eq 'zero') { $familyscore = 0;}
if (!($HMMEVAL2->{$ID."_".$AC})) { print STDERR "ERROR: do not know evalue for $ID $AC\n"; exit;}
if (!($HMMSCORE2->{$ID."_".$AC})) { print STDERR "ERROR: do not know score for $ID $AC\n"; exit;}
$ACeval = $HMMEVAL2->{$ID."_".$AC};
$ACscore = $HMMSCORE2->{$ID."_".$AC};
if ($ACeval eq 'zero') { $ACeval = 0;}
if ($ACscore eq 'zero') { $ACscore = 0;}
if ($family ne $AC)
{
print "WARNING: $ID was added to family $AC (eval $ACeval, score $ACscore) but has stronger match to $family (eval $familyeval, score $familyscore)\n";
}
}
else
{
print "WARNING: $ID was added to family $AC but has no hmmer match to any family\n";
}
}
}
}
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment