Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 14:49
Show Gist options
  • Save avrilcoghlan/5065124 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065124 to your computer and use it in GitHub Desktop.
Perl script that finds cases where different alternative spliceforms of the same gene do not have unique transcript ids in the 'genes' table of the TreeFam mysql database
#!/usr/local/bin/perl
#
# Perl script treefam_QC7.pl
# Written by Avril Coghlan (a.coghlan@ucc.ie)
# 4-Feb-09.
#
# This perl script finds cases where more different alternative splices of
# the same gene do not have unique transcript ids in the 'genes' table.
#
# The command-line format is:
# % perl <treefam_QC7.pl> <release>
# where <release> is the release of the TreeFam database to use.
#
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of treefam_QC7.pl\n\n";
print "perl treefam_QC7.pl <release>\n";
print "where <release> is the release of the TreeFam database to use.\n";
print "For example, >perl -w treefam_QC7.pl 7\n";
exit;
}
#------------------------------------------------------------------#
# DECLARE MYSQL USERNAME AND HOST:
use DBI;
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE:
$release = $ARGV[0];
$VERBOSE = 1;
#------------------------------------------------------------------#
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
$dbh = DBI->connect("$database", 'anonymous', '') || return;
# READ IN THE GENE ID FOR EVERY TRANSCRIPT ID, AND CHECK THAT EACH
# TRANSCRIPT HAS A UNIQUE TRANSCRIPT ID:
&read_geneids($dbh);
# NOW DISCONNECT FROM THE DATABASE:
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# READ THE GENE ID FOR EVERY TRANSCRIPT ID, AND CHECK THAT EACH TRANSCRIPT
# HAS A UNIQUE TRANSCRIPT ID:
sub read_geneids
{
my $dbh = $_[0];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $ID;
my $GID;
my %SEEN = ();
$table_w = 'genes';
$st = "SELECT ID, GID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$ID = $array[0];
$GID = $array[1];
if (!($SEEN{$ID}))
{
$SEEN{$ID} = 1;
}
else
{
print "WARNING: id $ID (gene $GID) is not unique in the genes table\n";
}
}
}
print STDERR "Read in gene ids\n";
print "Read in gene ids\n";
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment