Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 14:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065139 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065139 to your computer and use it in GitHub Desktop.
Perl script that finds cases where a transcript listed in the 'genes' table of the TreeFam mysql database lacks any amino acid sequence in the 'aa_seq' table, or lacks a DNA sequence in the 'nt_seq' table.
#!/usr/local/bin/perl
#
# Perl script treefam_QC8.pl
# Written by Avril Coghlan (a.coghlan@ucc.ie)
# 4-Feb-09.
#
# This perl script finds cases where a transript listed in the 'genes' table
# is missing an amino acid sequence in the 'aa_seq' table or missing a
# DNA sequence in the 'nt_seq' table.
#
# The command-line format is:
# % perl <treefam_QC8.pl> <release>
# where <release> is the release of the TreeFam database to use.
#
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of treefam_QC8.pl\n\n";
print "perl treefam_QC8.pl <release>\n";
print "where <release> is the release of the TreeFam database to use.\n";
print "For example, >perl -w treefam_QC8.pl 7\n";
exit;
}
#------------------------------------------------------------------#
# DECLARE MYSQL USERNAME AND HOST:
use DBI;
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE:
$release = $ARGV[0];
$VERBOSE = 1;
#------------------------------------------------------------------#
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
$dbh = DBI->connect("$database", 'anonymous', '') || return;
# READ IN ALL THE TRANSCRIPTS THAT HAVE AMINO ACID SEQUENCES IN THE
# aa_seq TABLE:
$AASEQ = &read_aaseq($dbh);
# READ IN ALL THE TRANSCRIPTS THAT HAVE DNA SEQUENCES IN THE
# nt_seq TABLE:
$DNASEQ = &read_dnaseq($dbh);
# CHECK THAT ALL OF THE GENES IN THE genes TABLE HAVE DNA AND AMINO
# ACID SEQUENCES:
&read_genes($dbh,$AASEQ,$DNASEQ);
# NOW DISCONNECT FROM THE DATABASE:
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# READ IN ALL THE TRANSCRIPTS THAT HAVE DNA SEQUENCES IN THE
# nt_seq TABLE:
sub read_dnaseq
{
my $dbh = $_[0];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $ID;
my %DNASEQ = ();
$table_w = 'nt_seq';
$st = "SELECT ID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$ID = $array[0];
$DNASEQ{$ID} = 1;
}
}
print STDERR "Read in transcripts with DNA sequences\n";
print "Read in transcripts with DNA sequences\n";
return(\%DNASEQ);
}
#------------------------------------------------------------------#
# READ IN ALL THE TRANSCRIPTS THAT HAVE AMINO ACID SEQUENCES IN THE
# aa_seq TABLE:
sub read_aaseq
{
my $dbh = $_[0];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $ID;
my %AASEQ = ();
$table_w = 'aa_seq';
$st = "SELECT ID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$ID = $array[0];
$AASEQ{$ID} = 1;
}
}
print STDERR "Read in transcripts with amino acid sequences\n";
print "Read in transcripts with amino acid sequences\n";
return(\%AASEQ);
}
#------------------------------------------------------------------#
# CHECK THAT ALL OF THE TRANSCRIPTS IN THE genes TABLE HAVE DNA AND
# AMINO ACID SEQUENCES:
sub read_genes
{
my $dbh = $_[0];
my $AASEQ = $_[1];
my $DNASEQ = $_[2];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $ID;
$table_w = 'genes';
$st = "SELECT ID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$ID = $array[0];
if (!($AASEQ->{$ID}))
{
print "WARNING: ID $ID appears in the genes table, but has no amino acid sequence in the aa_seq table\n";
}
if (!($DNASEQ->{$ID}))
{
print "WARNING: ID $ID appears in the genes table, but has no DNA sequence in the nt_seq table\n";
}
}
}
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment