Created
March 1, 2013 14:56
-
-
Save avrilcoghlan/5065175 to your computer and use it in GitHub Desktop.
Perl script that finds cases where different alternative splices of the same gene were put into different families, but those alternative splice forms overlap a lot at the DNA level
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_QC11.pl | |
# Written by Avril Coghlan (a.coghlan@ucc.ie) | |
# 5-Feb-09. | |
# | |
# This perl script finds cases where different alternative splices of the | |
# same gene were put into different families, but those alternative splice | |
# forms overlap a lot at the DNA level. | |
# | |
# The command-line format is: | |
# % perl <treefam_QC11.pl> <release> | |
# where <release> is the release of the TreeFam database to use. | |
# | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of treefam_QC11.pl\n\n"; | |
print "perl treefam_QC11.pl <release>\n"; | |
print "where <release> is the release of the TreeFam database to use.\n"; | |
print "For example, >perl -w treefam_QC11.pl 7\n"; | |
exit; | |
} | |
#------------------------------------------------------------------# | |
# DECLARE MYSQL USERNAME AND HOST: | |
use DBI; | |
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE: | |
$release = $ARGV[0]; | |
$VERBOSE = 1; | |
#------------------------------------------------------------------# | |
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308"; | |
$dbh = DBI->connect("$database", 'anonymous', '') || return; | |
# READ IN THE GENE ID FOR EVERY TRANSCRIPT ID: | |
$GID = &read_geneids($dbh); | |
# CHECK IF THERE ARE CASES WHERE DIFFERENT TRANSCRIPTS OF THE SAME | |
# GENE WERE ADDED TO DIFFERENT FAMILIES, WHERE THOSE TRANSCRIPTS | |
# OVERLAP A LOT AT THE DNA LEVEL: | |
&read_genes_in_families($dbh,$GID); | |
# NOW DISCONNECT FROM THE DATABASE: | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# READ THE GENE ID FOR EVERY TRANSCRIPT ID: | |
sub read_geneids | |
{ | |
my $dbh = $_[0]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $ID; | |
my $GID; | |
my %GID = (); | |
$table_w = 'genes'; | |
$st = "SELECT ID, GID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$ID = $array[0]; | |
$GID = $array[1]; | |
if (!($GID{$ID})) | |
{ | |
$GID{$ID} = $GID; | |
} | |
else | |
{ | |
if ($GID{$ID} ne $GID) { print STDERR "ERROR: id $ID GID $GID GID(ID) $GID{$ID}\n"; exit;} | |
} | |
} | |
} | |
print STDERR "Read in gene ids\n"; | |
return(\%GID); | |
} | |
#------------------------------------------------------------------# | |
# CHECK IF TWO TRANSCRIPTS OVERLAP MUCH AT THE DNA LEVEL: | |
sub check_if_overlap | |
{ | |
my $dbh = $_[0]; | |
my $id1 = $_[1]; | |
my $id2 = $_[2]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $i; | |
my $id; | |
my $START; | |
my $STOP; | |
my $start1 = "none"; | |
my $start2 = "none"; | |
my $stop1 = "none"; | |
my $stop2 = "none"; | |
my @start1; | |
my @start2; | |
my @stop1; | |
my @stop2; | |
my $overlap = 0; | |
my $start_id1; | |
my $stop_id1; | |
my $start_id2; | |
my $stop_id2; | |
my $j; | |
my $length; | |
$table_w = 'map'; | |
for ($i = 1; $i <= 2; $i++) | |
{ | |
if ($i == 1) { $id = $id1;} | |
elsif ($i == 2) { $id = $id2;} | |
$st = "SELECT START,STOP from $table_w WHERE ID=\'$id\'"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$START = $array[0]; | |
$STOP = $array[1]; | |
if ($i == 1) { $start1 = $START; $stop1 = $STOP;} | |
elsif ($i == 2) { $start2 = $START; $stop2 = $STOP;} | |
} | |
} | |
} | |
if ($start1 eq 'none' || $stop1 eq 'none') { print STDERR "ERROR: do not know coords for $id1\n"; exit;} | |
if ($start2 eq 'none' || $stop2 eq 'none') { print STDERR "ERROR: do not know coords for $id2\n"; exit;} | |
@start1 = split(/\,/,$start1); | |
@start2 = split(/\,/,$start2); | |
@stop1 = split(/\,/,$stop1); | |
@stop2 = split(/\,/,$stop2); | |
for ($i = 0; $i <= $start1; $i++) | |
{ | |
$start_id1 = $start1[$i]; | |
$stop_id1 = $stop1[$i]; | |
# SEE IF THIS EXON OVERLAPS WITH ANY EXON IN $id2: | |
for ($j = 0; $j <= $start2; $j++) | |
{ | |
$start_id2 = $start2[$j]; | |
$stop_id2 = $stop2[$j]; | |
if (($start_id1 >= $start_id2 && $start_id1 <= $stop_id2) || | |
($stop_id1 >= $start_id2 && $stop_id1 <= $stop_id2) || | |
($start_id2 >= $start_id1 && $start_id2 <= $stop_id1) || | |
($stop_id2 >= $start_id1 && $stop_id2 <= $stop_id1)) | |
{ | |
if ($start_id1 >= $start_id2 && $stop_id1 <= $stop_id2) { $length = $stop_id1 - $start_id1 + 1;} | |
elsif ($start_id2 >= $start_id1 && $stop_id2 <= $stop_id1) { $length = $stop_id2 - $start_id2 + 1;} | |
elsif ($start_id1 >= $start_id2 && $stop_id2 <= $stop_id1) { $length = $stop_id2 - $start_id1 + 1;} | |
elsif ($start_id2 >= $start_id1 && $stop_id1 <= $stop_id2) { $length = $stop_id1 - $start_id2 + 1;} | |
$overlap = $overlap + $length; | |
} | |
} | |
} | |
return($overlap); | |
} | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE CASES WHERE DIFFERENT TRANSCRIPTS OF THE SAME GENE | |
# WERE ADDED TO DIFFERENT FAMILIES, BUT THE DIFFERENT TRANSCRIPTS OVERLAP | |
# A LOT AT THE DNA LEVEL: | |
sub read_genes_in_families | |
{ | |
my $dbh = $_[0]; | |
my $GID = $_[1]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $FLAG; | |
my %FAMILY = (); | |
my $geneid; | |
my %SEEN; | |
my $families; | |
my @families; | |
my $i; | |
my $j; | |
my $family1; | |
my $family2; | |
my @temp; | |
my $id1; | |
my $id2; | |
$table_w = 'fam_genes'; | |
$st = "SELECT AC, ID, FLAG from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
$ID = $array[1]; | |
$FLAG = $array[2]; | |
# GET THE GENE ID FOR $ID: | |
if (!($GID->{$ID})) | |
{ | |
# THERE ARE SOME IDs IN THE fam_genes TABLE THAT DO NOT APPEAR IN THE genes | |
# TABLE BECAUSE THEY BELONGED TO AN OLD TREEFAM-A SEED FAMILY AND ARE NOT IN | |
# THE LATEST GENE SET FOR THAT SPECIES LOADED INTO THE genes TABLE. | |
print STDERR "WARNING: do not know gene id for $ID (AC $AC FLAG $FLAG)\n"; | |
} | |
else | |
{ | |
$geneid = $GID->{$ID}; | |
# RECORD THE FAMILIES THAT THIS GENE APPEARS IN: | |
if (!($SEEN{$AC."_".$ID})) | |
{ | |
$SEEN{$AC."_".$ID} = 1; | |
if (!($FAMILY{$geneid})) | |
{ | |
$FAMILY{$geneid} = $AC."_".$ID; | |
} | |
else | |
{ | |
$FAMILY{$geneid} = $FAMILY{$geneid}.",".$AC."_".$ID; | |
} | |
} | |
} | |
} | |
} | |
# CHECK WHETHER THERE ARE CASES WHERE DIFFERENT TRANSCRIPTS OF THE | |
# SAME GENE APPEAR IN DIFFERENT FAMILIES, BUT THOSE TRANSCRIPTS | |
# OVERLAP A LOT AT THE DNA LEVEL: | |
foreach $geneid (keys %FAMILY) | |
{ | |
$families = $FAMILY{$geneid}; | |
@families = split(/\,/,$families); | |
# LOOK AT EACH PAIR OF TRANSCRIPTS, WHICH ARE IN DIFFERENT FAMILIES: | |
for ($i = 0; $i <= $#families; $i++) | |
{ | |
$family1 = $families[$i]; | |
@temp = split(/_/,$family1); | |
$family1 = $temp[0]; | |
$id1 = $temp[1]; | |
for ($j = 0; $j <= $#families; $j++) | |
{ | |
$family2 = $families[$j]; | |
@temp = split(/_/,$family2); | |
$family2 = $temp[0]; | |
$id2 = $temp[1]; | |
if ($family1 ne $family2 && $id1 ne $id2) | |
{ | |
# CHECK WHETHER $id1 AND $id2 OVERLAP A LOT AT THE DNA LEVEL: | |
#xxxyyyzzz | |
# xxxyyyzzz | |
$overlap = &check_if_overlap($dbh,$id1,$id2); #xxx | |
print "xxx $id1 (family $family1) overlaps by $overlap with $id2 ($family2)\n"; | |
} | |
} | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment