Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 16:07
Show Gist options
  • Save avrilcoghlan/5065640 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065640 to your computer and use it in GitHub Desktop.
Perl script that, given a gff file of Caenorhabditis elegans genes, retrieves their orthologs in C. briggsae, human and yeast from the TreeFam database, and finds the percent identity between each C. elegans gene and each of its orthologs.
#!/usr/local/bin/perl
#
# Perl script find_pc_id_to_treefam_ortholog.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 4-Dec-07.
#
# This perls scripts finds the %id to the TreeFam ortholog in C. briggsae, human, yeast
# each gene in the ngasp benchmark gene set.
#
# The command-line format is:
# % perl <find_pc_id_to_treefam_ortholog.pl> gff
# where gff is the input benchmark gff file.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of find_pc_id_to_treefam_ortholog.pl\n\n";
print "perl find_pc_id_to_treefam_ortholog.pl <gff>\n";
print "where <gff> is the input benchmark gff file.\n";
print "For example, >perl find_pc_id_to_treefam_ortholog.pl phase1_confirmed_isoforms_A.gff.1\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}
use Avril_modules;
use DBI;
# FIND THE NAME OF THE INPUT GFF FILE:
$gff = $ARGV[0];
$VERBOSE = 0;
#------------------------------------------------------------------#
# READ IN THE INPUT GFF FILE:
%TAKE = ();
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n";
while(<GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$exon = $temp[8];
# FIND THE GENE NAME:
# eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
@temp = split(/\;/,$exon);
$gene = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
@temp = split(/\./,$gene);
$gene = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
@temp = split(/=/,$gene);
$gene = $temp[1]; # eg. C49A1.4b
$gene =~ tr/[a-z]/[A-Z]/;
if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.
$TAKE{$gene} = 1;
}
close(GFF);
#------------------------------------------------------------------#
# FIND THE TREEFAM ID FOR ALL THE GENES FROM C. ELEGANS THAT I AM INTERESTED IN:
# HASH TABLES TO REMEMBER THE SPECIES OF GENES:
$dbh = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE:
$table_w = 'genes';
# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX),
# ANOTHER COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165
$st = "SELECT IDX, TAX_ID, GID, ID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
%IDX = ();
%ID = ();
%YEAST = ();
%CB = ();
%CR = ();
%HUMAN = ();
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array) {
$IDX = $array[0]; # eg., 1
$TAXID = $array[1]; # eg 7165
$GID = $array[2];
$ID = $array[3];
# CHECK IF THIS IS A C. ELEGANS GENE THAT WE WANT TO TAKE:
if ($TAKE{$GID})
{
if (!($IDX{$GID})) { $IDX{$GID} = $IDX;}
else {$IDX{$GID} = $IDX{$GID}.",".$IDX;}
if (!($ID{$GID})) { $ID{$GID} = $ID;}
else {$ID{$GID} = $ID{$GID}.",".$ID;}
}
# RECORD IF IT IS A YEAST GENE:
if ($TAXID eq '4932')
{
$YEAST{$IDX} = $GID;
if (!($ID{$GID})) { $ID{$GID} = $ID;}
else {$ID{$GID} = $ID{$GID}.",".$ID;}
}
# RECORD IF IT IS A C. BRIGGSAE GENE:
if ($TAXID eq '6238')
{
$CB{$IDX} = $GID;
if (!($ID{$GID})) { $ID{$GID} = $ID;}
else {$ID{$GID} = $ID{$GID}.",".$ID;}
}
# RECORD IF IT IS A C. REMANEI GENE:
if ($TAXID eq '31234')
{
$CR{$IDX} = $GID;
if (!($ID{$GID})) { $ID{$GID} = $ID;}
else {$ID{$GID} = $ID{$GID}.",".$ID;}
}
# RECORD IF IT IS A HUMAN GENE:
if ($TAXID eq '9606')
{
$HUMAN{$IDX} = $GID;
if (!($ID{$GID})) { $ID{$GID} = $ID;}
else {$ID{$GID} = $ID{$GID}.",".$ID;}
}
}
}
# NOW FIND THE FAMILIES FOR THE C. ELEGANS GENES OF INTEREST:
%FAM = ();
%SEEN = ();
foreach $GID (keys %TAKE)
{
$IDS = $ID{$GID};
@IDS = split(/\,/,$IDS);
for ($i = 0; $i <= $#IDS; $i++)
{
$ID = $IDS[$i];
# GET THE FAMILY FOR THE ID $ID:
$table_w = 'fam_genes';
$st = "SELECT ID,AC,FLAG,FAM_TYPE from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($ID) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[1];
$FLAG = $array[2];
$FAM_TYPE = $array[3];
if ($FLAG eq 'FULL' & ($FAM_TYPE eq 'A' || $FAM_TYPE eq 'B'))
{
if (!($SEEN{$GID."_".$AC}))
{
if (!($FAM{$GID})) { $FAM{$GID} = $AC; }
else { print STDERR "$GID is in $FAM{$GID} and $AC\n"; exit;}
}
}
}
}
}
}
# NOW FIND ALL THE ORTHOLOGS FOR THE C. ELEGANS GENES OF INTEREST:
%YEAST_ORTH = ();
%CB_ORTH = ();
%CR_ORTH = ();
%HUMAN_ORTH = ();
foreach $GID (keys %IDX)
{
$IDXS = $IDX{$GID};
@IDXS = split(/\,/,$IDXS);
for ($i = 0; $i <= $#IDXS; $i++)
{
$IDX = $IDXS[$i];
# GET ALL THE ORTHOLGOS FOR $IDX:
$table_w = 'ortholog';
$st = "SELECT idx1,idx2 from $table_w WHERE idx1=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($IDX) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$idx1 = $array[0];
$idx2 = $array[1];
if ($YEAST{$idx2})
{
if (!($YEAST_ORTH{$GID})) { $YEAST_ORTH{$GID} = $YEAST{$idx2};}
else {$YEAST_ORTH{$GID} = $YEAST_ORTH{$GID}.",".$YEAST{$idx2};}
}
if ($CB{$idx2})
{
if (!($CB_ORTH{$GID})) { $CB_ORTH{$GID} = $CB{$idx2};}
else {$CB_ORTH{$GID} = $CB_ORTH{$GID}.",".$CB{$idx2};}
}
if ($CR{$idx2})
{
if (!($CR_ORTH{$GID})) { $CR_ORTH{$GID} = $CR{$idx2};}
else {$CR_ORTH{$GID} = $CR_ORTH{$GID}.",".$CR{$idx2};}
}
if ($HUMAN{$idx2})
{
if (!($HUMAN_ORTH{$GID})) { $HUMAN_ORTH{$GID} = $HUMAN{$idx2};}
else {$HUMAN_ORTH{$GID} = $HUMAN_ORTH{$GID}.",".$HUMAN{$idx2};}
}
}
}
$st = "SELECT idx1,idx2 from $table_w WHERE idx2=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($IDX) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$idx1 = $array[0];
$idx2 = $array[1];
if ($YEAST{$idx1})
{
if (!($YEAST_ORTH{$GID})) { $YEAST_ORTH{$GID} = $YEAST{$idx1};}
else {$YEAST_ORTH{$GID} = $YEAST_ORTH{$GID}.",".$YEAST{$idx1};}
}
if ($CB{$idx1})
{
if (!($CB_ORTH{$GID})) { $CB_ORTH{$GID} = $CB{$idx1};}
else {$CB_ORTH{$GID} = $CB_ORTH{$GID}.",".$CB{$idx1};}
}
if ($CR{$idx1})
{
if (!($CR_ORTH{$GID})) { $CR_ORTH{$GID} = $CR{$idx1};}
else {$CR_ORTH{$GID} = $CR_ORTH{$GID}.",".$CR{$idx1};}
}
if ($HUMAN{$idx1})
{
if (!($HUMAN_ORTH{$GID})) { $HUMAN_ORTH{$GID} = $HUMAN{$idx1};}
else {$HUMAN_ORTH{$GID} = $HUMAN_ORTH{$GID}.",".$HUMAN{$idx1};}
}
}
}
}
}
#------------------------------------------------------------------#
# READ IN THE INPUT GFF FILE:
print "GENE CBORTH CRORTH HUMORTH YEASTORTH\n";
%SEEN = ();
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n";
while(<GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$exon = $temp[8];
# FIND THE GENE NAME:
# eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
@temp = split(/\;/,$exon);
$gene = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
@temp = split(/\./,$gene);
$gene = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
@temp = split(/=/,$gene);
$gene = $temp[1]; # eg. C49A1.4b
$gene =~ tr/[a-z]/[A-Z]/;
if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.
# SEE IF THIS GENE HAS C. BRIGGSAE, C. REMANEI, HUMAN AND YEAST ORTHOLOGS:
if ($CB_ORTH{$gene}) { $cb_orth = $CB_ORTH{$gene};}
else { $cb_orth = "none"; }
if ($CR_ORTH{$gene}) { $cr_orth = $CR_ORTH{$gene};}
else { $cr_orth = "none"; }
if ($HUMAN_ORTH{$gene}) { $human_orth = $HUMAN_ORTH{$gene};}
else { $human_orth = "none"; }
if ($YEAST_ORTH{$gene}) { $yeast_orth = $YEAST_ORTH{$gene};}
else { $yeast_orth = "none"; }
if ($VERBOSE == 1) { print "gene $gene cb_orth $cb_orth cr_orth $cr_orth human_orth $human_orth yeast_orth $yeast_orth\n";}
if (!($SEEN{$gene}))
{
$SEEN{$gene} = 1;
# GET THE FAMILY THAT THIS GENE IS FOUND IN:
if ($FAM{$gene})
{
$family = $FAM{$gene};
if ($VERBOSE == 1) { print "C. elegans gene $gene is in family $family\n";}
# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. BRIGGSAE ORTHOLOGS IN THIS FAMILY:
if ($cb_orth ne 'none')
{
$cb_orth_id = &get_pc_id($gene,$cb_orth,$family);
}
else { $cb_orth_id = 0;}
# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. REMANEI ORTHOLOGS IN THIS FAMILY:
if ($cr_orth ne 'none')
{
$cr_orth_id = &get_pc_id($gene,$cr_orth,$family);
}
else { $cr_orth_id = 0;}
# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND HUMAN ORTHOLOGS IN THIS FAMILY:
if ($human_orth ne 'none')
{
$human_orth_id = &get_pc_id($gene,$human_orth,$family);
}
else { $human_orth_id = 0;}
# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND YEAST ORTHOLOGS IN THIS FAMILY:
if ($yeast_orth ne 'none')
{
$yeast_orth_id = &get_pc_id($gene,$yeast_orth,$family);
}
else { $yeast_orth_id = 0;}
}
else # THE C. ELEGANS GENE IS NOT IN A FAMILY.
{
if ($cb_orth ne 'none' || $cr_orth ne 'none' || $human_orth ne 'none' || $yeast_orth ne 'none')
{
# xxx this is an error in treefam-5 mysql I think.
print STDERR "WARNING: $gene has orthologs but is not in a family\n";
}
$cr_orth_id = 0;
$cb_orth_id = 0;
$human_orth_id = 0;
$yeast_orth_id = 0;
}
print "$gene $cb_orth $cb_orth_id $cr_orth $cr_orth_id $human_orth $human_orth_id $yeast_orth $yeast_orth_id\n";
}
}
close(GFF);
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# GET THE PERCENT IDENTITY BETWEEN THE C. ELEGANS AND C. BRIGGSAE ORTHOLOGS IN THIS FAMILY:
sub get_pc_id
{
my $gene = $_[0];
my $orths = $_[1];
my $family = $_[2];
my $IDS;
my @IDS;
my $i;
my $ID;
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $CIGAR1 = "none";
my $CIGAR2 = "none";
my @orths;
my $orth;
my $j;
my $the_id1;
my $the_id2;
my $seq1;
my $seq2;
my $pc_id;
my $max_pc_id = 0;
# GET THE ALIGNMENT FOR GENE $gene IN FAMILY $family:
if (!($ID{$gene})) { print STDERR "ERROR: do not know ids for $gene\n"; exit;}
$IDS = $ID{$gene};
@IDS = split(/\,/,$IDS);
if ($VERBOSE == 1) { print "C. elegans gene $gene has ids $IDS\n";}
for ($i = 0; $i <= $#IDS; $i++)
{
$ID = $IDS[$i];
# GET THE ALIGNMENT FOR THE ID $ID IN FAMILY $family:
$table_w = 'aa_full_align';
$st = "SELECT ID,CIGAR from $table_w WHERE AC=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($family) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
if ($array[0] eq $ID)
{
if ($CIGAR1 ne 'none') { print STDERR "ERROR: already have cigar for $ID $gene $family\n"; exit;}
$CIGAR1 = $array[1];
$the_id1 = $ID;
}
}
}
}
if ($CIGAR1 eq 'none')
{
print STDERR "WARNING: did not find cigar for IDS $IDS gene $gene $family\n";
return($max_pc_id); #xxx probelm with treefam5
#xxx exit;
}
# GET THE SEQUENCE FOR $the_id1:
$table_w = 'aa_seq';
$st = "SELECT SEQ from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($the_id1) or die "Cannot execute the query: $sth->errstr";
$seq1 = "none";
if ($rv >= 1)
{
while ((@array)= $sth->fetchrow_array)
{
$seq1 = $array[0];
}
}
if ($seq1 eq 'none') { print STDERR "ERROR: seq1 $seq1 for $the_id1 $gene\n"; exit;}
# GET THE ALIGNMENT FOR THE ORTHOLOGS $orths IN FAMILY $family:
@orths = split(/\,/,$orths);
for ($i = 0; $i <= $#orths; $i++)
{
$orth = $orths[$i];
if (!($ID{$orth})) { print STDERR "ERROR: do not know ids for $orth\n"; exit;}
$IDS = $ID{$orth};
@IDS = split(/\,/,$IDS);
$CIGAR2 = "none";
if ($VERBOSE == 1) { print "Ortholog $orth has ids $IDS\n";}
for ($j = 0; $j <= $#IDS; $j++)
{
$ID = $IDS[$j];
if ($VERBOSE == 1) { print "Looking for cigar for $ID (orth $orth)\n";}
# GET THE ALIGNMENT FOR THE ID $ID IN FAMILY $family:
$table_w = 'aa_full_align';
$st = "SELECT ID,CIGAR from $table_w WHERE AC=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($family) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array)= $sth->fetchrow_array)
{
if ($array[0] eq $ID)
{
if ($CIGAR2 ne 'none') { print STDERR "ERROR: already have cigar for $ID $gene $family\n"; exit;}
$CIGAR2 = $array[1];
$the_id2 = $ID;
if ($VERBOSE == 1) { print "Got aln for $the_id2 family $family\n";}
}
}
}
}
if ($CIGAR2 eq 'none')
{
print STDERR "WARNING: did not find cigar for IDS $IDS orth $orth $family (gene $gene)\n";
#xxx exit; #xxx problem with treefam5
return($max_pc_id);
}
# GET THE SEQUENCE FOR $the_id2:
$table_w = 'aa_seq';
$st = "SELECT SEQ from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($the_id2) or die "Cannot execute the query: $sth->errstr";
$seq2 = "none";
if ($rv >= 1)
{
while ((@array)= $sth->fetchrow_array)
{
$seq2 = $array[0];
}
}
if ($seq2 eq 'none') { print STDERR "ERROR: seq2 $seq2 for $the_id2 $orth\n"; exit;}
# GET THE PERCENT IDENTITY BETWEEN $gene AND $orth:
$pc_id = &get_pc_id_for_pair($CIGAR1,$CIGAR2,$seq1,$seq2);
if ($pc_id > $max_pc_id) { $max_pc_id = $pc_id;}
}
return($max_pc_id);
}
#------------------------------------------------------------------#
# GET THE PERCENT IDENTITY BETWEEN $gene AND $orth:
sub get_pc_id_for_pair
{
my $CIGAR1 = $_[0];
my $CIGAR2 = $_[1];
my $seq1 = $_[2];
my $seq2 = $_[3];
my $aln1;
my $aln2;
my $i;
my $num;
my $char;
my $j;
my $offset;
my $char2;
my $id;
my $nongap;
# MAKE THE ALIGNMENT FOR $gene:
$num = "";
$aln1 = "";
$offset = 0;
for ($i = 1; $i <= length($CIGAR1); $i++)
{
$char = substr($CIGAR1,$i-1,1);
if ($char =~ /[0-9]/) { $num = $num.$char;}
else
{
if ($char ne 'M' & $char ne 'D') { print STDERR "ERROR: char $char\n"; exit;}
if ($char eq 'D')
{
for ($j = 1; $j <= $num; $j++) { $aln1 = $aln1."-";}
}
elsif ($char eq 'M')
{
for ($j = 1; $j <= $num; $j++)
{
$offset++;
if ($offset > length($seq1))
{
print STDERR "ERROR: offset $offset seq1 $seq1\n";
exit;
}
$char2 = substr($seq1,$offset-1,1);
$aln1 = $aln1.$char2;
}
}
$num = "";
}
}
# MAKE THE ALIGNMENT FOR $orth:
$num = "";
$aln2 = "";
$offset = 0;
for ($i = 1; $i <= length($CIGAR2); $i++)
{
$char = substr($CIGAR2,$i-1,1);
if ($char =~ /[0-9]/) { $num = $num.$char;}
else
{
if ($char ne 'M' & $char ne 'D') { print STDERR "ERROR: char $char\n"; exit;}
if ($char eq 'D')
{
for ($j = 1; $j <= $num; $j++) { $aln2 = $aln2."-";}
}
elsif ($char eq 'M')
{
for ($j = 1; $j <= $num; $j++)
{
$offset++;
if ($offset > length($seq2))
{
print STDERR "ERROR: offset $offset seq2 $seq2\n";
exit;
}
$char2 = substr($seq2,$offset-1,1);
$aln2 = $aln2.$char2;
}
}
$num = "";
}
}
# FIND THE PERCENT IDENTITY BETWEEN THE SEQUENCES:
if (length($aln1) != length($aln2))
{
print STDERR "ERROR: length of aln for gene is not same as length for orth\n";
exit;
}
$nongap = 0;
$id = 0;
for ($i = 1; $i <= length($aln1); $i++)
{
$char1 = substr($aln1,$i-1,1);
$char2 = substr($aln2,$i-1,1);
if ($char1 ne '-' && $char2 ne '-')
{
$nongap++;
if ($char1 eq $char2) { $id++;}
}
}
if ($nongap >= 0)
{
$id = $id*100/($nongap);
}
else
{
$id = 0;
}
return($id);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment