Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 14:56
Perl script that finds cases where different alternative splices of the same gene were put into different families, but those alternative splice forms overlap a lot at the DNA level
#!/usr/local/bin/perl
#
# Perl script treefam_QC11.pl
# Written by Avril Coghlan (a.coghlan@ucc.ie)
# 5-Feb-09.
#
# This perl script finds cases where different alternative splices of the
# same gene were put into different families, but those alternative splice
# forms overlap a lot at the DNA level.
#
# The command-line format is:
# % perl <treefam_QC11.pl> <release>
# where <release> is the release of the TreeFam database to use.
#
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of treefam_QC11.pl\n\n";
print "perl treefam_QC11.pl <release>\n";
print "where <release> is the release of the TreeFam database to use.\n";
print "For example, >perl -w treefam_QC11.pl 7\n";
exit;
}
#------------------------------------------------------------------#
# DECLARE MYSQL USERNAME AND HOST:
use DBI;
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE:
$release = $ARGV[0];
$VERBOSE = 1;
#------------------------------------------------------------------#
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
$dbh = DBI->connect("$database", 'anonymous', '') || return;
# READ IN THE GENE ID FOR EVERY TRANSCRIPT ID:
$GID = &read_geneids($dbh);
# CHECK IF THERE ARE CASES WHERE DIFFERENT TRANSCRIPTS OF THE SAME
# GENE WERE ADDED TO DIFFERENT FAMILIES, WHERE THOSE TRANSCRIPTS
# OVERLAP A LOT AT THE DNA LEVEL:
&read_genes_in_families($dbh,$GID);
# NOW DISCONNECT FROM THE DATABASE:
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# READ THE GENE ID FOR EVERY TRANSCRIPT ID:
sub read_geneids
{
my $dbh = $_[0];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $ID;
my $GID;
my %GID = ();
$table_w = 'genes';
$st = "SELECT ID, GID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$ID = $array[0];
$GID = $array[1];
if (!($GID{$ID}))
{
$GID{$ID} = $GID;
}
else
{
if ($GID{$ID} ne $GID) { print STDERR "ERROR: id $ID GID $GID GID(ID) $GID{$ID}\n"; exit;}
}
}
}
print STDERR "Read in gene ids\n";
return(\%GID);
}
#------------------------------------------------------------------#
# CHECK IF TWO TRANSCRIPTS OVERLAP MUCH AT THE DNA LEVEL:
sub check_if_overlap
{
my $dbh = $_[0];
my $id1 = $_[1];
my $id2 = $_[2];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $i;
my $id;
my $START;
my $STOP;
my $start1 = "none";
my $start2 = "none";
my $stop1 = "none";
my $stop2 = "none";
my @start1;
my @start2;
my @stop1;
my @stop2;
my $overlap = 0;
my $start_id1;
my $stop_id1;
my $start_id2;
my $stop_id2;
my $j;
my $length;
$table_w = 'map';
for ($i = 1; $i <= 2; $i++)
{
if ($i == 1) { $id = $id1;}
elsif ($i == 2) { $id = $id2;}
$st = "SELECT START,STOP from $table_w WHERE ID=\'$id\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$START = $array[0];
$STOP = $array[1];
if ($i == 1) { $start1 = $START; $stop1 = $STOP;}
elsif ($i == 2) { $start2 = $START; $stop2 = $STOP;}
}
}
}
if ($start1 eq 'none' || $stop1 eq 'none') { print STDERR "ERROR: do not know coords for $id1\n"; exit;}
if ($start2 eq 'none' || $stop2 eq 'none') { print STDERR "ERROR: do not know coords for $id2\n"; exit;}
@start1 = split(/\,/,$start1);
@start2 = split(/\,/,$start2);
@stop1 = split(/\,/,$stop1);
@stop2 = split(/\,/,$stop2);
for ($i = 0; $i <= $start1; $i++)
{
$start_id1 = $start1[$i];
$stop_id1 = $stop1[$i];
# SEE IF THIS EXON OVERLAPS WITH ANY EXON IN $id2:
for ($j = 0; $j <= $start2; $j++)
{
$start_id2 = $start2[$j];
$stop_id2 = $stop2[$j];
if (($start_id1 >= $start_id2 && $start_id1 <= $stop_id2) ||
($stop_id1 >= $start_id2 && $stop_id1 <= $stop_id2) ||
($start_id2 >= $start_id1 && $start_id2 <= $stop_id1) ||
($stop_id2 >= $start_id1 && $stop_id2 <= $stop_id1))
{
if ($start_id1 >= $start_id2 && $stop_id1 <= $stop_id2) { $length = $stop_id1 - $start_id1 + 1;}
elsif ($start_id2 >= $start_id1 && $stop_id2 <= $stop_id1) { $length = $stop_id2 - $start_id2 + 1;}
elsif ($start_id1 >= $start_id2 && $stop_id2 <= $stop_id1) { $length = $stop_id2 - $start_id1 + 1;}
elsif ($start_id2 >= $start_id1 && $stop_id1 <= $stop_id2) { $length = $stop_id1 - $start_id2 + 1;}
$overlap = $overlap + $length;
}
}
}
return($overlap);
}
#------------------------------------------------------------------#
# CHECK IF THERE ARE CASES WHERE DIFFERENT TRANSCRIPTS OF THE SAME GENE
# WERE ADDED TO DIFFERENT FAMILIES, BUT THE DIFFERENT TRANSCRIPTS OVERLAP
# A LOT AT THE DNA LEVEL:
sub read_genes_in_families
{
my $dbh = $_[0];
my $GID = $_[1];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $FLAG;
my %FAMILY = ();
my $geneid;
my %SEEN;
my $families;
my @families;
my $i;
my $j;
my $family1;
my $family2;
my @temp;
my $id1;
my $id2;
$table_w = 'fam_genes';
$st = "SELECT AC, ID, FLAG from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
$ID = $array[1];
$FLAG = $array[2];
# GET THE GENE ID FOR $ID:
if (!($GID->{$ID}))
{
# THERE ARE SOME IDs IN THE fam_genes TABLE THAT DO NOT APPEAR IN THE genes
# TABLE BECAUSE THEY BELONGED TO AN OLD TREEFAM-A SEED FAMILY AND ARE NOT IN
# THE LATEST GENE SET FOR THAT SPECIES LOADED INTO THE genes TABLE.
print STDERR "WARNING: do not know gene id for $ID (AC $AC FLAG $FLAG)\n";
}
else
{
$geneid = $GID->{$ID};
# RECORD THE FAMILIES THAT THIS GENE APPEARS IN:
if (!($SEEN{$AC."_".$ID}))
{
$SEEN{$AC."_".$ID} = 1;
if (!($FAMILY{$geneid}))
{
$FAMILY{$geneid} = $AC."_".$ID;
}
else
{
$FAMILY{$geneid} = $FAMILY{$geneid}.",".$AC."_".$ID;
}
}
}
}
}
# CHECK WHETHER THERE ARE CASES WHERE DIFFERENT TRANSCRIPTS OF THE
# SAME GENE APPEAR IN DIFFERENT FAMILIES, BUT THOSE TRANSCRIPTS
# OVERLAP A LOT AT THE DNA LEVEL:
foreach $geneid (keys %FAMILY)
{
$families = $FAMILY{$geneid};
@families = split(/\,/,$families);
# LOOK AT EACH PAIR OF TRANSCRIPTS, WHICH ARE IN DIFFERENT FAMILIES:
for ($i = 0; $i <= $#families; $i++)
{
$family1 = $families[$i];
@temp = split(/_/,$family1);
$family1 = $temp[0];
$id1 = $temp[1];
for ($j = 0; $j <= $#families; $j++)
{
$family2 = $families[$j];
@temp = split(/_/,$family2);
$family2 = $temp[0];
$id2 = $temp[1];
if ($family1 ne $family2 && $id1 ne $id2)
{
# CHECK WHETHER $id1 AND $id2 OVERLAP A LOT AT THE DNA LEVEL:
#xxxyyyzzz
# xxxyyyzzz
$overlap = &check_if_overlap($dbh,$id1,$id2); #xxx
print "xxx $id1 (family $family1) overlaps by $overlap with $id2 ($family2)\n";
}
}
}
}
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment