Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065617 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065617 to your computer and use it in GitHub Desktop.
Perl script that, given a gff file for Caenorhabditis elegans, finds the fraction of introns in each Caenorhabditis elegans gene that are shared in position in the ortholog (in TreeFam) in C. briggsae, human or yeast.
#!/usr/local/bin/perl
#
# Perl script find_intron_cons_treefam_ortholog.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 7-Dec-07.
#
# This perls scripts finds the fraction of introns in a C. elegans
# gene that are shared in position in the TreeFam ortholog in C. briggsae,
# human, yeast for each C. elegans gene in the ngasp benchmark gene set.
#
# The command-line format is:
# % perl <find_intron_cons_treefam_ortholog.pl> gff
# where gff is the input benchmark gff file.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of find_intron_cons_treefam_ortholog.pl\n\n";
print "perl find_intron_cons_treefam_ortholog.pl <gff>\n";
print "where <gff> is the input benchmark gff file.\n";
print "For example, >perl find_intron_cons_treefam_ortholog.pl phase1_confirmed_isoforms_A.gff.1\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}
use Avril_modules;
use DBI;
# FIND THE NAME OF THE INPUT GFF FILE:
$gff = $ARGV[0];
$VERBOSE = 0;
#------------------------------------------------------------------#
# READ IN THE INPUT GFF FILE:
%TAKE = ();
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n";
while(<GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$exon = $temp[8];
# FIND THE GENE NAME:
# eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
@temp = split(/\;/,$exon);
$gene = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
@temp = split(/\./,$gene);
$gene = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
@temp = split(/=/,$gene);
$gene = $temp[1]; # eg. C49A1.4b
$gene =~ tr/[a-z]/[A-Z]/;
if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.
$TAKE{$gene} = 1;
}
close(GFF);
#------------------------------------------------------------------#
# FIND THE TREEFAM ID FOR ALL THE GENES FROM C. ELEGANS THAT I AM INTERESTED IN:
# HASH TABLES TO REMEMBER THE SPECIES OF GENES:
$dbh = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE:
$table_w = 'genes';
# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX),
# ANOTHER COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165
$st = "SELECT IDX, TAX_ID, GID, ID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
%IDX = ();
%ID = ();
%YEAST = ();
%CB = ();
%CR = ();
%HUMAN = ();
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$IDX = $array[0]; # eg., 1
$TAXID = $array[1]; # eg 7165
$GID = $array[2];
$ID = $array[3];
# CHECK IF THIS IS A C. ELEGANS GENE THAT WE WANT TO TAKE:
if ($TAKE{$GID})
{
if (!($IDX{$GID})) { $IDX{$GID} = $IDX;}
else {$IDX{$GID} = $IDX{$GID}.",".$IDX;}
if (!($ID{$GID})) { $ID{$GID} = $ID;}
else {$ID{$GID} = $ID{$GID}.",".$ID;}
}
# RECORD IF IT IS A YEAST GENE:
if ($TAXID eq '4932')
{
$YEAST{$IDX} = $GID;
if (!($ID{$GID})) { $ID{$GID} = $ID;}
else {$ID{$GID} = $ID{$GID}.",".$ID;}
}
# RECORD IF IT IS A C. BRIGGSAE GENE:
if ($TAXID eq '6238')
{
$CB{$IDX} = $GID;
if (!($ID{$GID})) { $ID{$GID} = $ID;}
else {$ID{$GID} = $ID{$GID}.",".$ID;}
}
# RECORD IF IT IS A C. REMANEI GENE:
if ($TAXID eq '31234')
{
$CR{$IDX} = $GID;
if (!($ID{$GID})) { $ID{$GID} = $ID;}
else {$ID{$GID} = $ID{$GID}.",".$ID;}
}
# RECORD IF IT IS A HUMAN GENE:
if ($TAXID eq '9606')
{
$HUMAN{$IDX} = $GID;
if (!($ID{$GID})) { $ID{$GID} = $ID;}
else {$ID{$GID} = $ID{$GID}.",".$ID;}
}
}
}
# NOW FIND THE FAMILIES FOR THE C. ELEGANS GENES OF INTEREST:
%FAM = ();
%SEEN = ();
foreach $GID (keys %TAKE)
{
$IDS = $ID{$GID};
@IDS = split(/\,/,$IDS);
for ($i = 0; $i <= $#IDS; $i++)
{
$ID = $IDS[$i];
# GET THE FAMILY FOR THE ID $ID:
$table_w = 'fam_genes';
$st = "SELECT ID,AC,FLAG,FAM_TYPE from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($ID) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[1];
$FLAG = $array[2];
$FAM_TYPE = $array[3];
if ($FLAG eq 'FULL' & ($FAM_TYPE eq 'A' || $FAM_TYPE eq 'B'))
{
if (!($SEEN{$GID."_".$AC}))
{
if (!($FAM{$GID})) { $FAM{$GID} = $AC; }
else { print STDERR "$GID is in $FAM{$GID} and $AC\n"; exit;}
}
}
}
}
}
}
# NOW FIND ALL THE ORTHOLOGS FOR THE C. ELEGANS GENES OF INTEREST:
%YEAST_ORTH = ();
%CB_ORTH = ();
%CR_ORTH = ();
%HUMAN_ORTH = ();
foreach $GID (keys %IDX)
{
$IDXS = $IDX{$GID};
@IDXS = split(/\,/,$IDXS);
for ($i = 0; $i <= $#IDXS; $i++)
{
$IDX = $IDXS[$i];
# GET ALL THE ORTHOLOGS FOR $IDX:
$table_w = 'ortholog';
$st = "SELECT idx1,idx2 from $table_w WHERE idx1=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($IDX) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$idx1 = $array[0];
$idx2 = $array[1];
if ($YEAST{$idx2})
{
if (!($YEAST_ORTH{$GID})) { $YEAST_ORTH{$GID} = $YEAST{$idx2};}
else {$YEAST_ORTH{$GID} = $YEAST_ORTH{$GID}.",".$YEAST{$idx2};}
}
if ($CB{$idx2})
{
if (!($CB_ORTH{$GID})) { $CB_ORTH{$GID} = $CB{$idx2};}
else {$CB_ORTH{$GID} = $CB_ORTH{$GID}.",".$CB{$idx2};}
}
if ($CR{$idx2})
{
if (!($CR_ORTH{$GID})) { $CR_ORTH{$GID} = $CR{$idx2};}
else {$CR_ORTH{$GID} = $CR_ORTH{$GID}.",".$CR{$idx2};}
}
if ($HUMAN{$idx2})
{
if (!($HUMAN_ORTH{$GID})) { $HUMAN_ORTH{$GID} = $HUMAN{$idx2};}
else {$HUMAN_ORTH{$GID} = $HUMAN_ORTH{$GID}.",".$HUMAN{$idx2};}
}
}
}
$st = "SELECT idx1,idx2 from $table_w WHERE idx2=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($IDX) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$idx1 = $array[0];
$idx2 = $array[1];
if ($YEAST{$idx1})
{
if (!($YEAST_ORTH{$GID})) { $YEAST_ORTH{$GID} = $YEAST{$idx1};}
else {$YEAST_ORTH{$GID} = $YEAST_ORTH{$GID}.",".$YEAST{$idx1};}
}
if ($CB{$idx1})
{
if (!($CB_ORTH{$GID})) { $CB_ORTH{$GID} = $CB{$idx1};}
else {$CB_ORTH{$GID} = $CB_ORTH{$GID}.",".$CB{$idx1};}
}
if ($CR{$idx1})
{
if (!($CR_ORTH{$GID})) { $CR_ORTH{$GID} = $CR{$idx1};}
else {$CR_ORTH{$GID} = $CR_ORTH{$GID}.",".$CR{$idx1};}
}
if ($HUMAN{$idx1})
{
if (!($HUMAN_ORTH{$GID})) { $HUMAN_ORTH{$GID} = $HUMAN{$idx1};}
else {$HUMAN_ORTH{$GID} = $HUMAN_ORTH{$GID}.",".$HUMAN{$idx1};}
}
}
}
}
}
#------------------------------------------------------------------#
# READ IN THE INPUT GFF FILE:
print "GENE INTRONS CBORTH CBORTHID CBORTHINTRONS CRORTH CRORTHID CRORTHINTRONS HUMORTH HUMORTHID HUMORTHINTRONS YEASTORTH YEASTORTHID YEASTORTHINTRONS\n";
%SEEN = ();
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n";
while(<GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$exon = $temp[8];
# FIND THE GENE NAME:
# eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
@temp = split(/\;/,$exon);
$gene = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
@temp = split(/\./,$gene);
$gene = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
@temp = split(/=/,$gene);
$gene = $temp[1]; # eg. C49A1.4b
$gene =~ tr/[a-z]/[A-Z]/;
if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.
# SEE IF THIS GENE HAS C. BRIGGSAE, C. REMANEI, HUMAN AND YEAST ORTHOLOGS:
if ($CB_ORTH{$gene}) { $cb_orth = $CB_ORTH{$gene};}
else { $cb_orth = "none"; }
if ($CR_ORTH{$gene}) { $cr_orth = $CR_ORTH{$gene};}
else { $cr_orth = "none"; }
if ($HUMAN_ORTH{$gene}) { $human_orth = $HUMAN_ORTH{$gene};}
else { $human_orth = "none"; }
if ($YEAST_ORTH{$gene}) { $yeast_orth = $YEAST_ORTH{$gene};}
else { $yeast_orth = "none"; }
if ($VERBOSE == 1) { print "gene $gene cb_orth $cb_orth cr_orth $cr_orth human_orth $human_orth yeast_orth $yeast_orth\n";}
if (!($SEEN{$gene}))
{
$SEEN{$gene} = 1;
# GET THE FAMILY THAT THIS GENE IS FOUND IN:
if ($FAM{$gene})
{
$family = $FAM{$gene};
if ($VERBOSE == 1) { print "C. elegans gene $gene is in family $family\n";}
# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. BRIGGSAE ORTHOLOGS IN THIS FAMILY:
# ALSO GET THE FRACTION OF INTRONS CONSERVED.
if ($cb_orth ne 'none')
{
($cb_orth_id,$cb_orth_introns) = &get_pc_id($gene,$cb_orth,$family);
}
else { $cb_orth_id = 0; $cb_orth_introns = 0;}
# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. REMANEI ORTHOLOGS IN THIS FAMILY:
if ($cr_orth ne 'none')
{
($cr_orth_id,$cr_orth_introns) = &get_pc_id($gene,$cr_orth,$family);
}
else { $cr_orth_id = 0; $cr_orth_introns = 0;}
# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND HUMAN ORTHOLOGS IN THIS FAMILY:
if ($human_orth ne 'none')
{
($human_orth_id,$human_orth_introns) = &get_pc_id($gene,$human_orth,$family);
}
else { $human_orth_id = 0; $human_orth_introns = 0;}
# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND YEAST ORTHOLOGS IN THIS FAMILY:
if ($yeast_orth ne 'none')
{
($yeast_orth_id,$yeast_orth_introns) = &get_pc_id($gene,$yeast_orth,$family);
}
else { $yeast_orth_id = 0; $yeast_orth_introns = 0;}
}
else # THE C. ELEGANS GENE IS NOT IN A FAMILY.
{
if ($cb_orth ne 'none' || $cr_orth ne 'none' || $human_orth ne 'none' || $yeast_orth ne 'none')
{
# xxx this is an error in treefam-5 mysql I think.
print STDERR "WARNING: $gene has orthologs but is not in a family\n";
}
$cr_orth_id = 0;
$cr_orth_introns = 0;
$cb_orth_id = 0;
$cb_orth_introns = 0;
$human_orth_id = 0;
$human_orth_introns = 0;
$yeast_orth_id = 0;
$yeast_orth_introns = 0;
}
$introns = &get_num_introns($gene);
if ($gene eq '') { print STDERR "ERROR: gene $gene\n"; exit;}
if ($introns eq '') { print STDERR "ERROR: gene $gene introns $introns\n"; exit;}
if ($cb_orth eq '') { print STDERR "ERROR: gene $gene cb_orth $cb_orth\n"; exit;}
if ($cb_orth_id eq '') { print STDERR "ERROR: gene $gene cb_orth_id $cb_orth_id\n"; exit;}
if ($cb_orth_introns eq '') { print STDERR "ERROR: gene $gene cb_orth_introns $cb_orth_introns cb_orth $cb_orth cb_orth_id $cb_orth_id\n"; exit;}
if ($cr_orth eq '') { print STDERR "ERROR: gene $gene cr_orth $cr_orth\n"; exit;}
if ($cr_orth_id eq '') { print STDERR "ERROR: gene $gene cr_orth_id $cr_orth_id\n"; exit;}
if ($cr_orth_introns eq '') { print STDERR "ERROR: gene $gene cr_orth_introns $cr_orth_introns\n"; exit;}
if ($human_orth eq '') { print STDERR "ERROR: gene $gene human_orth $human_orth\n"; exit;}
if ($human_orth_id eq '') { print STDERR "ERROR: gene $gene human_orth_id $human_orth_id\n"; exit;}
if ($human_orth_introns eq '') { print STDERR "ERROR: gene $gene human_orth_introns $human_orth_introns\n"; exit;}
if ($yeast_orth eq '') { print STDERR "ERROR: gene $gene yeast_orth $yeast_orth\n"; exit;}
if ($yeast_orth_id eq '') { print STDERR "ERROR: gene $gene yeast_orth_id $yeast_orth_id\n"; exit;}
if ($yeast_orth_introns eq '') { print STDERR "ERROR: gene $gene yeast_orth_introns $yeast_orth_introns\n"; exit;}
print "$gene $introns $cb_orth $cb_orth_id $cb_orth_introns $cr_orth $cr_orth_id $cr_orth_introns $human_orth $human_orth_id $human_orth_introns $yeast_orth $yeast_orth_id $yeast_orth_introns\n";
}
}
close(GFF);
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# GET THE NUMBER OF INTRONS IN A C. ELEGANS GENE:
sub get_num_introns
{
my $gene = $_[0];
my $dbh;
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $start = "none";
my $IDS;
my @IDS;
my $i;
my $ID;
my $num_exons;
my @temp;
my $num_introns;
my $max_num_introns = 0;
# FIND THE POSITION OF EACH GENE IN THE TREEFAM MYSQL DATABASE:
$dbh = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') || return;
# GET THE NUMBER OF INTRONS IN EACH TRANSCRIPT OF $gene:
if (!($ID{$gene})) { print STDERR "ERROR: do not know ids for $gene\n"; exit;}
$IDS = $ID{$gene};
@IDS = split(/\,/,$IDS);
for ($i = 0; $i <= $#IDS; $i++)
{
$start = "none";
$ID = $IDS[$i];
# SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS:
$table_w = 'map';
$st = "SELECT START from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($ID) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$start = $array[0]; # eg., 144225,144806,145261,145497,
chop($start); # 144225,144806,145261,145497
}
}
if ($start eq 'none') { print STDERR "ERROR: did not find position of introns in $gene\n"; exit;}
@temp = split(/\,/,$start);
$num_exons = $#temp + 1;
$num_introns = $num_exons - 1;
if ($num_introns > $max_num_introns) { $max_num_introns = $num_introns;}
}
return($max_num_introns);
}
#------------------------------------------------------------------#
# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. BRIGGSAE ORTHOLOGS IN THIS FAMILY:
sub get_pc_id
{
my $gene = $_[0];
my $orths = $_[1];
my $family = $_[2];
my $IDS;
my @IDS;
my $i;
my $ID;
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $CIGAR1 = "none";
my $CIGAR2 = "none";
my @orths;
my $orth;
my $j;
my $the_id1;
my $the_id2;
my $seq1;
my $seq2;
my $pc_id;
my $max_pc_id = 0;
my $intron_cons = 0;
my $max_intron_cons = 0;
my $introns1;
my $introns2;
my %ALN_POS = ();
# GET THE ALIGNMENT FOR GENE $gene IN FAMILY $family:
if (!($ID{$gene})) { print STDERR "ERROR: do not know ids for $gene\n"; exit;}
$IDS = $ID{$gene};
@IDS = split(/\,/,$IDS);
if ($VERBOSE == 1) { print "C. elegans gene $gene has ids $IDS\n";}
for ($i = 0; $i <= $#IDS; $i++)
{
$ID = $IDS[$i];
# GET THE ALIGNMENT FOR THE ID $ID IN FAMILY $family:
$table_w = 'aa_full_align';
$st = "SELECT ID,CIGAR from $table_w WHERE AC=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($family) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
if ($array[0] eq $ID)
{
if ($CIGAR1 ne 'none') { print STDERR "ERROR: already have cigar for $ID $gene $family\n"; exit;}
$CIGAR1 = $array[1];
$the_id1 = $ID;
}
}
}
}
if ($CIGAR1 eq 'none')
{
print STDERR "WARNING: did not find cigar for IDS $IDS gene $gene $family\n";
return($max_pc_id,$max_intron_cons); #xxx probelm with treefam5
#xxx exit;
}
# GET THE SEQUENCE FOR $the_id1:
$table_w = 'aa_seq';
$st = "SELECT SEQ from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($the_id1) or die "Cannot execute the query: $sth->errstr";
$seq1 = "none";
if ($rv >= 1)
{
while ((@array)= $sth->fetchrow_array)
{
$seq1 = $array[0];
}
}
if ($seq1 eq 'none') { print STDERR "ERROR: seq1 $seq1 for $the_id1 $gene\n"; exit;}
# GET THE POSITIONS OF EACH AMINO ACID IN THE ALIGNMENT:
&read_aln(\%ALN_POS,$the_id1,$CIGAR1,$seq1);
# GET THE POSITIONS OF INTRONS IN $the_id1:
$introns1 = &get_intron_positions($the_id1,\%ALN_POS);
# GET THE ALIGNMENT FOR THE ORTHOLOGS $orths IN FAMILY $family:
@orths = split(/\,/,$orths);
for ($i = 0; $i <= $#orths; $i++)
{
$orth = $orths[$i];
if (!($ID{$orth})) { print STDERR "ERROR: do not know ids for $orth\n"; exit;}
$IDS = $ID{$orth};
@IDS = split(/\,/,$IDS);
$CIGAR2 = "none";
if ($VERBOSE == 1) { print "Ortholog $orth has ids $IDS\n";}
for ($j = 0; $j <= $#IDS; $j++)
{
$ID = $IDS[$j];
if ($VERBOSE == 1) { print "Looking for cigar for $ID (orth $orth)\n";}
# GET THE ALIGNMENT FOR THE ID $ID IN FAMILY $family:
$table_w = 'aa_full_align';
$st = "SELECT ID,CIGAR from $table_w WHERE AC=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($family) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array)= $sth->fetchrow_array)
{
if ($array[0] eq $ID)
{
if ($CIGAR2 ne 'none') { print STDERR "ERROR: already have cigar for $ID $gene $family\n"; exit;}
$CIGAR2 = $array[1];
$the_id2 = $ID;
if ($VERBOSE == 1) { print "Got aln for $the_id2 family $family\n";}
}
}
}
}
if ($CIGAR2 eq 'none')
{
print STDERR "WARNING: did not find cigar for IDS $IDS orth $orth $family (gene $gene)\n";
#xxx exit; #xxx problem with treefam5
return($max_pc_id,$max_intron_cons);
}
# GET THE SEQUENCE FOR $the_id2:
$table_w = 'aa_seq';
$st = "SELECT SEQ from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($the_id2) or die "Cannot execute the query: $sth->errstr";
$seq2 = "none";
if ($rv >= 1)
{
while ((@array)= $sth->fetchrow_array)
{
$seq2 = $array[0];
}
}
if ($seq2 eq 'none') { print STDERR "ERROR: seq2 $seq2 for $the_id2 $orth\n"; exit;}
# GET THE POSITIONS OF EACH AMINO ACID IN THE ALIGNMENT:
&read_aln(\%ALN_POS,$the_id2,$CIGAR2,$seq2);
# GET THE POSITIONS OF INTRONS IN $the_id2:
$introns2 = &get_intron_positions($the_id2,\%ALN_POS);
# GET THE PERCENT IDENTITY BETWEEN $gene AND $orth:
$pc_id = &get_pc_id_for_pair($CIGAR1,$CIGAR2,$seq1,$seq2);
if ($pc_id > $max_pc_id) { $max_pc_id = $pc_id;}
# GET THE FRACTION OF INTRONS THAT ARE CONSERVED BETWEEN $gene AND $orth:
$intron_cons = &get_fraction_cons_introns($CIGAR1,$CIGAR2,$seq1,$seq2,$introns1,$introns2);
if ($intron_cons > $max_intron_cons) { $max_intron_cons = $intron_cons;}
}
return($max_pc_id,$max_intron_cons);
}
#------------------------------------------------------------------#
# GET THE FRACTION OF INTRONS THAT ARE CONSERVED BETWEEN $gene AND $orth:
sub get_fraction_cons_introns
{
my $CIGAR1 = $_[0];
my $CIGAR2 = $_[1];
my $seq1 = $_[2];
my $seq2 = $_[3];
my $introns1 = $_[4];
my $introns2 = $_[5];
my $intron_cons = 0;
my $num_introns1;
my @temp;
my @introns1;
my @introns2;
my %INTRON1 = ();
my $i;
my $pos;
my $num_cons;
# FIND OUT HOW MANY INTRONS ARE IN $introns1:
@introns1 = split(/\,/,$introns1);
$num_introns1 = $#introns1 + 1;
@introns2 = split(/\,/,$introns2);
# SEE HOW MANY OF THE INTRONS IN $introns1 ARE IN $introns2:
for ($i = 0; $i <= $#introns1; $i++)
{
@temp = split(/_/,$introns1[$i]); # eg. C37A5.9.1_200_0
$pos = $temp[1];
$INTRON1{$pos} = 1;
}
$num_cons = 0;
for ($i = 0; $i <= $#introns2; $i++)
{
@temp = split(/_/,$introns2[$i]);
$pos = $temp[1];
if ($INTRON1{$pos}) { $num_cons++;}
}
if ($num_introns1 == 0)
{
if ($num_cons > 0) { print STDERR "ERROR: num_introns1 $num_introns1 num_cons $num_cons\n"; exit;}
$intron_cons = 0;
}
else
{
$intron_cons = ($num_cons*100)/($num_introns1);
}
return($intron_cons);
}
#------------------------------------------------------------------#
# GET THE POSITIONS OF EACH AMINO ACID IN THE ALIGNMENT:
sub read_aln
{
my $ALN_POS = $_[0];
my $id = $_[1];
my $CIGAR = $_[2];
my $seq = $_[3];
my $num;
my $aln;
my $offset;
my $i;
my $char;
my $j;
my $char2;
my $prot_pos;
my $aln_pos;
my $length;
# MAKE THE ALIGNMENT FOR $id:
$num = "";
$aln = "";
$offset = 0;
for ($i = 1; $i <= length($CIGAR); $i++)
{
$char = substr($CIGAR,$i-1,1);
if ($char =~ /[0-9]/) { $num = $num.$char;}
else
{
if ($char ne 'M' & $char ne 'D') { print STDERR "ERROR: char $char\n"; exit;}
if ($char eq 'D')
{
for ($j = 1; $j <= $num; $j++) { $aln = $aln."-";}
}
elsif ($char eq 'M')
{
for ($j = 1; $j <= $num; $j++)
{
$offset++;
if ($offset > length($seq))
{
print STDERR "ERROR: offset $offset seq $seq\n";
exit;
}
$char2 = substr($seq,$offset-1,1);
$aln = $aln.$char2;
}
}
$num = "";
}
}
# NOW RECORD THE POSITIONS IN THE ALIGNMENT:
$prot_pos = 0;
$aln_pos = 0;
$length = length($aln);
for ($i = 0; $i < $length; $i++)
{
$char = substr($aln,$i,1);
$aln_pos++;
if ($char ne '-')
{
$prot_pos++;
if ($ALN_POS->{$id."_".$prot_pos}) { print STDERR "ERROR: already have aln pos for $id $prot_pos.\n"; exit;}
$ALN_POS->{$id."_".$prot_pos} = $aln_pos;
}
}
}
#------------------------------------------------------------------#
# GET THE POSITIONS OF INTRONS IN A GENE:
sub get_intron_positions
{
my $id = $_[0];
my $ALN_POS = $_[1];
my $dbh;
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $strand;
my $start;
my $stop;
my $introns;
my $rc;
my $cstart;
my $cstop;
# FIND THE POSITION OF EACH GENE IN THE TREEFAM MYSQL DATABASE:
$dbh = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS:
$table_w = 'map';
$st = "SELECT STRAND,START,STOP,C_START,C_STOP from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($id) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$strand = $array[0]; # eg., +
$start = $array[1]; # eg., 144225,144806,145261,145497,
$stop = $array[2]; # eg., 144318,145216,145444,145804,
$cstart = $array[3];
$cstop = $array[4];
$start = &remove_noncoding($start,$cstart,$cstop,"start");
$stop = &remove_noncoding($stop,$cstart,$cstop,"stop");
# FIND THE INTRONS IN THIS GENE:
$introns = &infer_intron_positions($strand,$start,$stop,$id,$ALN_POS);
}
}
return($introns);
}
#------------------------------------------------------------------#
# INFER THE INTRON POSITIONS IN A GENE:
sub infer_intron_positions
{
my $strand = $_[0];
my $start = $_[1]; # 144225,144806,145261,145497,
my $stop = $_[2]; # 144318,145216,145444,145804,
my $gene = $_[3];
my $ALN_POS = $_[4];
my @start = split(/\,/,$start);
my @stop = split(/\,/,$stop);
my $i;
my $exon_start;
my $exon_stop;
my $coding_length;
my $hangover;
my $intron_phase;
my $prev_aa_pos = 0;
my @temp;
my $aln_pos;
my $hmm_pos;
my $intron;
my $introns = "";
if ($#start != $#stop)
{
print STDERR "ERROR: gene $gene : start $#start ($start) stop $#stop ($stop).\n";
exit;
}
if ($VERBOSE == 1) { print "$gene-------------------------------------------------------\n";}
if ($VERBOSE == 1) { print "gene $gene : start $start stop $stop\n";}
if ($strand eq '+')
{
$coding_length = 0;
for ($i = 0; $i <= $#start; $i++)
{
# FIND THE COORDINATES OF EXON $i:
$exon_start = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB.
$exon_stop = $stop[$i];
# CALCULATE THE CODING LENGTH SO FAR:
$coding_length = $coding_length + ($exon_stop - $exon_start + 1);
# WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID:
$prev_aa_pos = $coding_length/3;
@temp = split(/\./,$prev_aa_pos);
$prev_aa_pos = $temp[0];
# LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS:
$hangover = $coding_length % 3;
if ($hangover == 0) { $intron_phase = 0;}
elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1).
elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2).
if ($i != $#start)
{
# FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT:
if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;}
$aln_pos = $ALN_POS->{$gene."_".$prev_aa_pos};
$intron = $gene."_".$aln_pos."_".$intron_phase;
$introns = $introns.",".$intron;
if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos intron_phase $intron_phase\n";}
}
}
}
elsif ($strand eq '-')
{
$coding_length = 0;
for ($i = $#start; $i >= 0; $i--)
{
# FIND THE COORDINATES OF EXON $i:
$exon_start = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB.
$exon_stop = $stop[$i];
# CALCULATE THE CODING LENGTH SO FAR:
$coding_length = $coding_length + ($exon_stop - $exon_start + 1);
# WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID:
$prev_aa_pos = $coding_length/3;
@temp = split(/\./,$prev_aa_pos);
$prev_aa_pos = $temp[0];
# LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS:
$hangover = $coding_length % 3;
if ($hangover == 0) { $intron_phase = 0;}
elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1).
elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2).
if ($i != 0)
{
# FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT:
if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;}
$aln_pos = $ALN_POS->{$gene."_".$prev_aa_pos};
$intron = $gene."_".$aln_pos."_".$intron_phase;
$introns = $introns.",".$intron;
if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos intron_phase $intron_phase\n";}
}
}
}
else
{
print STDERR "ERROR: strand $strand.\n"; exit;
}
if ($introns ne '') { $introns = substr($introns,1,length($introns)-1);}
return($introns);
}
#------------------------------------------------------------------#
# SUBROUTINE TO REMOVE NON-CODING REGIONS FROM THE EXONS:
sub remove_noncoding
{
my $exons = $_[0];
my $cstart = $_[1];
my $cstop = $_[2];
my $type = $_[3];
my $new_exons = "";
chop($exons);
my @exons = split(/\,/,$exons);
my $i;
my $seen_start = 0;
my $seen_end = 0;
for ($i = 0; $i <= $#exons; $i++)
{
if ($exons[$i] >= $cstart && $exons[$i] <= $cstop)
{
if ($i == 0) { $seen_start = 1;}
elsif ($i == $#exons) { $seen_end = 1;}
$new_exons = $new_exons.",".$exons[$i];
if ($exons[$i] == $cstart && $seen_start == 0) { $seen_start = 1;}
if ($exons[$i] == $cstop && $seen_end == 0) { $seen_end = 1;}
}
elsif ($exons[$i] <= $cstart)
{
# ignore this exon.
}
elsif ($exons[$i] >= $cstop)
{
# ignore this exon.
}
}
if ($new_exons ne '')
{
$new_exons = substr($new_exons,1,length($new_exons)-1);
if ($seen_start == 0 && $type eq 'start') { $new_exons = $cstart.",".$new_exons;}
if ($seen_end == 0 && $type eq 'stop') { $new_exons = $new_exons.",".$cstop; }
}
else
{
# THERE IS JUST ONE EXON IN THE GENE:
if ($seen_start == 0 && $type eq 'start') { $new_exons = $cstart;}
if ($seen_end == 0 && $type eq 'stop') { $new_exons = $cstop; }
}
if (substr($new_exons,0,1) eq ',') { $new_exons = substr($new_exons,1,length($new_exons)-1);}
if (substr($new_exons,length($new_exons)-1,1) eq ',') { chop($new_exons); }
return($new_exons);
}
#------------------------------------------------------------------#
# GET THE PERCENT IDENTITY BETWEEN $gene AND $orth:
sub get_pc_id_for_pair
{
my $CIGAR1 = $_[0];
my $CIGAR2 = $_[1];
my $seq1 = $_[2];
my $seq2 = $_[3];
my $aln1;
my $aln2;
my $i;
my $num;
my $char;
my $j;
my $offset;
my $char2;
my $id;
my $nongap;
# MAKE THE ALIGNMENT FOR $gene:
$num = "";
$aln1 = "";
$offset = 0;
for ($i = 1; $i <= length($CIGAR1); $i++)
{
$char = substr($CIGAR1,$i-1,1);
if ($char =~ /[0-9]/) { $num = $num.$char;}
else
{
if ($char ne 'M' & $char ne 'D') { print STDERR "ERROR: char $char\n"; exit;}
if ($char eq 'D')
{
for ($j = 1; $j <= $num; $j++) { $aln1 = $aln1."-";}
}
elsif ($char eq 'M')
{
for ($j = 1; $j <= $num; $j++)
{
$offset++;
if ($offset > length($seq1))
{
print STDERR "ERROR: offset $offset seq1 $seq1\n";
exit;
}
$char2 = substr($seq1,$offset-1,1);
$aln1 = $aln1.$char2;
}
}
$num = "";
}
}
# MAKE THE ALIGNMENT FOR $orth:
$num = "";
$aln2 = "";
$offset = 0;
for ($i = 1; $i <= length($CIGAR2); $i++)
{
$char = substr($CIGAR2,$i-1,1);
if ($char =~ /[0-9]/) { $num = $num.$char;}
else
{
if ($char ne 'M' & $char ne 'D') { print STDERR "ERROR: char $char\n"; exit;}
if ($char eq 'D')
{
for ($j = 1; $j <= $num; $j++) { $aln2 = $aln2."-";}
}
elsif ($char eq 'M')
{
for ($j = 1; $j <= $num; $j++)
{
$offset++;
if ($offset > length($seq2))
{
print STDERR "ERROR: offset $offset seq2 $seq2\n";
exit;
}
$char2 = substr($seq2,$offset-1,1);
$aln2 = $aln2.$char2;
}
}
$num = "";
}
}
# FIND THE PERCENT IDENTITY BETWEEN THE SEQUENCES:
if (length($aln1) != length($aln2))
{
print STDERR "ERROR: length of aln for gene is not same as length for orth\n";
exit;
}
$nongap = 0;
$id = 0;
for ($i = 1; $i <= length($aln1); $i++)
{
$char1 = substr($aln1,$i-1,1);
$char2 = substr($aln2,$i-1,1);
if ($char1 ne '-' && $char2 ne '-')
{
$nongap++;
if ($char1 eq $char2) { $id++;}
}
}
if ($nongap >= 0)
{
$id = $id*100/($nongap);
}
else
{
$id = 0;
}
return($id);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment