Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 15:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065479 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065479 to your computer and use it in GitHub Desktop.
Perl script that, given a gff file of Caenorhabditis elegans genes, finds their C. briggsae, human and yeast orthologs from TreeFam.
#!/usr/local/bin/perl
#
# Perl script check_if_have_treefam_ortholog.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 4-Dec-07.
#
# This perls scripts finds the TreeFam ortholog in C. briggsae, human, yeast
# each gene in the ngasp benchmark gene set.
#
# The command-line format is:
# % perl <check_if_have_treefam_ortholog.pl> gff
# where gff is the input benchmark gff file.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of check_if_have_treefam_ortholog.pl\n\n";
print "perl check_if_have_treefam_ortholog.pl <gff>\n";
print "where <gff> is the input benchmark gff file.\n";
print "For example, >perl check_if_have_treefam_ortholog.pl phase1_confirmed_isoforms_A.gff.1\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}
use Avril_modules;
use DBI;
# FIND THE NAME OF THE INPUT GFF FILE:
$gff = $ARGV[0];
#------------------------------------------------------------------#
# READ IN THE INPUT GFF FILE:
%TAKE = ();
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n";
while(<GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$exon = $temp[8];
# FIND THE GENE NAME:
# eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
@temp = split(/\;/,$exon);
$gene = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
@temp = split(/\./,$gene);
$gene = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
@temp = split(/=/,$gene);
$gene = $temp[1]; # eg. C49A1.4b
$gene =~ tr/[a-z]/[A-Z]/;
if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.
$TAKE{$gene} = 1;
}
close(GFF);
#------------------------------------------------------------------#
# FIND THE TREEFAM ID FOR ALL THE GENES FROM C. ELEGANS THAT I AM INTERESTED IN:
# HASH TABLES TO REMEMBER THE SPECIES OF GENES:
$dbh = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE:
$table_w = 'genes';
# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX),
# ANOTHER COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165
$st = "SELECT IDX, TAX_ID, GID from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
%IDX = ();
%YEAST = ();
%CB = ();
%CR = ();
%HUMAN = ();
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array) {
$IDX = $array[0]; # eg., 1
$TAXID = $array[1]; # eg 7165
$GID = $array[2];
# CHECK IF THIS IS A C. ELEGANS GENE THAT WE WANT TO TAKE:
if ($TAKE{$GID})
{
if (!($IDX{$GID})) { $IDX{$GID} = $IDX;}
else {$IDX{$GID} = $IDX{$GID}.",".$IDX;}
}
# RECORD IF IT IS A YEAST GENE:
if ($TAXID eq '4932')
{
$YEAST{$IDX} = $GID;
}
# RECORD IF IT IS A C. BRIGGSAE GENE:
if ($TAXID eq '6238')
{
$CB{$IDX} = $GID;
}
# RECORD IF IT IS A C. REMANEI GENE:
if ($TAXID eq '31234')
{
$CR{$IDX} = $GID;
}
# RECORD IF IT IS A HUMAN GENE:
if ($TAXID eq '9606')
{
$HUMAN{$IDX} = $GID;
}
}
}
# NOW FIND ALL THE ORTHOLOGS FOR THE C. ELEGANS GENES OF INTEREST:
%YEAST_ORTH = ();
%CB_ORTH = ();
%CR_ORTH = ();
%HUMAN_ORTH = ();
foreach $GID (keys %IDX)
{
$IDXS = $IDX{$GID};
@IDXS = split(/\,/,$IDXS);
for ($i = 0; $i <= $#IDXS; $i++)
{
$IDX = $IDXS[$i];
# GET ALL THE ORTHOLGOS FOR $IDX:
$table_w = 'ortholog';
$st = "SELECT idx1,idx2 from $table_w WHERE idx1=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($IDX) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$idx1 = $array[0];
$idx2 = $array[1];
if ($YEAST{$idx2})
{
if (!($YEAST_ORTH{$GID})) { $YEAST_ORTH{$GID} = $YEAST{$idx2};}
else {$YEAST_ORTH{$GID} = $YEAST_ORTH{$GID}.",".$YEAST{$idx2};}
}
if ($CB{$idx2})
{
if (!($CB_ORTH{$GID})) { $CB_ORTH{$GID} = $CB{$idx2};}
else {$CB_ORTH{$GID} = $CB_ORTH{$GID}.",".$CB{$idx2};}
}
if ($CR{$idx2})
{
if (!($CR_ORTH{$GID})) { $CR_ORTH{$GID} = $CR{$idx2};}
else {$CR_ORTH{$GID} = $CR_ORTH{$GID}.",".$CR{$idx2};}
}
if ($HUMAN{$idx2})
{
if (!($HUMAN_ORTH{$GID})) { $HUMAN_ORTH{$GID} = $HUMAN{$idx2};}
else {$HUMAN_ORTH{$GID} = $HUMAN_ORTH{$GID}.",".$HUMAN{$idx2};}
}
}
}
$st = "SELECT idx1,idx2 from $table_w WHERE idx2=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($IDX) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$idx1 = $array[0];
$idx2 = $array[1];
if ($YEAST{$idx1})
{
if (!($YEAST_ORTH{$GID})) { $YEAST_ORTH{$GID} = $YEAST{$idx1};}
else {$YEAST_ORTH{$GID} = $YEAST_ORTH{$GID}.",".$YEAST{$idx1};}
}
if ($CB{$idx1})
{
if (!($CB_ORTH{$GID})) { $CB_ORTH{$GID} = $CB{$idx1};}
else {$CB_ORTH{$GID} = $CB_ORTH{$GID}.",".$CB{$idx1};}
}
if ($CR{$idx1})
{
if (!($CR_ORTH{$GID})) { $CR_ORTH{$GID} = $CR{$idx1};}
else {$CR_ORTH{$GID} = $CR_ORTH{$GID}.",".$CR{$idx1};}
}
if ($HUMAN{$idx1})
{
if (!($HUMAN_ORTH{$GID})) { $HUMAN_ORTH{$GID} = $HUMAN{$idx1};}
else {$HUMAN_ORTH{$GID} = $HUMAN_ORTH{$GID}.",".$HUMAN{$idx1};}
}
}
}
}
}
#------------------------------------------------------------------#
# READ IN THE INPUT GFF FILE:
print "GENE CBORTH CRORTH HUMORTH YEASTORTH\n";
%SEEN = ();
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n";
while(<GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$exon = $temp[8];
# FIND THE GENE NAME:
# eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
@temp = split(/\;/,$exon);
$gene = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
@temp = split(/\./,$gene);
$gene = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
@temp = split(/=/,$gene);
$gene = $temp[1]; # eg. C49A1.4b
$gene =~ tr/[a-z]/[A-Z]/;
if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.
# SEE IF THIS GENE HAS C. BRIGGSAE, C. REMANEI, HUMAN AND YEAST ORTHOLOGS:
if ($CB_ORTH{$gene}) { $cb_orth = $CB_ORTH{$gene};}
else { $cb_orth = "none"; }
if ($CR_ORTH{$gene}) { $cr_orth = $CR_ORTH{$gene};}
else { $cr_orth = "none"; }
if ($HUMAN_ORTH{$gene}) { $human_orth = $HUMAN_ORTH{$gene};}
else { $human_orth = "none"; }
if ($YEAST_ORTH{$gene}) { $yeast_orth = $YEAST_ORTH{$gene};}
else { $yeast_orth = "none"; }
if (!($SEEN{$gene}))
{
$SEEN{$gene} = 1;
print "$gene $cb_orth $cr_orth $human_orth $yeast_orth\n";
}
}
close(GFF);
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment