Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065465 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065465 to your computer and use it in GitHub Desktop.
Perl script that, given a gff file of Caenorhabditis elegans genes, uses TreeFam to check whether adjacent gens are paralogs.
#!/usr/local/bin/perl
#
# Perl script check_if_adjacent_genes_are_paralogs.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 10-Dec-07.
#
# This perls scripts checks whether adjacent genes are paralogs for
# each gene in the ngasp benchmark gene set.
#
# The command-line format is:
# % perl <check_if_adjacent_genes_are_paralogs.pl> gff
# where gff is the input benchmark gff file.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of check_if_adjacent_genes_are_paralogs.pl\n\n";
print "perl check_if_adjacent_genes_are_paralogs.pl <gff>\n";
print "where <gff> is the input benchmark gff file.\n";
print "For example, >perl check_if_adjacent_genes_are_paralogs.pl phase1_confirmed_isoforms_C.gff.1\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/";
use Avril_modules;
use Treefam::DBConnection;
use DBI;
# FIND THE NAME OF THE INPUT GFF FILE:
$gff = $ARGV[0];
#------------------------------------------------------------------#
# FIND ALL PARALOGS OF C. ELEGANS GENES:
# GET A LIST OF ALL FAMILIES IN TREEFAM, INCLUDING TF1 (TREEFAM-A), TF3
# (TREEFAM-B) AND TF5 (TREEFAM-C) FAMILIES:
print STDERR "Finding TreeFam-A families... \n";
$dbh = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') || return;
# GET ALL THE TREEFAM-A FAMILIES:
$table_w = "familyA";
$st = "SELECT AC from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
@families = (@families,$AC);
if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n"; }
}
}
print STDERR "Finding TreeFam-B and TreeFam-C families... \n";
# GET ALL THE TREEFAM-B AND TREEFAM-C FAMILIES:
$table_w = "familyB";
$st = "SELECT AC from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
if ($AC ne 'TF343806') # SOMETHING IS WRONG WITH THIS FAMILY IN TREEFAM-5. xxx
{
@families = (@families,$AC);
}
if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n"; }
}
}
#------------------------------------------------------------------#
$dbc = Treefam::DBConnection->new();
%PARALOG = ();
# FIND PARALOGS WITHIN EACH FAMILY:
foreach my $treefam_family(@families)
{
print STDERR "Looking at family $treefam_family... \n";
$famh = $dbc->get_FamilyHandle();
if (!(defined($famh->get_by_id($treefam_family))))
{
#xxx print "xxx there is no family $treefam_family...\n";
goto NEXT_FAMILY;
}
$family = $famh->get_by_id($treefam_family);
if (!(defined($family->ID())))
{
#xxx print "xxx do not have ID stored for family $treefam_family...\n";
goto NEXT_FAMILY;
}
$AC = $family->ID(); # GET THE FAMILY ID.
if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;}
$no_families++;
print "\n$no_families: reading family $AC...\n";
# GET THE TREE FOR THE FAMILY:
if (!(defined($family->get_tree('clean'))))
{
#xxx print "xxx AC $AC: there is no tree!\n";
goto NEXT_FAMILY;
}
$tree = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE.
# GET ALL THE TRANSCRIPTS IN THIS FAMILY FROM C. ELEGANS:
@species_nodes = &Avril_modules::empty_array(@species_nodes);
@species_genes = &Avril_modules::empty_array(@species_genes);
@nodes = $tree->get_leaves;
foreach $node(@nodes)
{
# FIND THE SPECIES FOR THIS TRANSCRIPT:
if (defined($node->get_tag_value('S')))
{
$species = $node->get_tag_value('S');
}
else
{
#xxx print "xxx AC $AC: do not know species of node\n";
$species = "none";
}
$species =~ tr/[a-z]/[A-Z]/;
if ($species eq 'CAEEL') # IF IT IS A C. ELEGANS GENE.
{
@species_nodes = (@species_nodes,$node);
# FIND THE GENE FOR THIS TRANSCRIPT:
$gh = $node->gene;
$gene = $gh->ID(); # eg. ENSMUSG00000022770
@species_genes = (@species_genes,$gene);
}
}
# FOR EACH PAIR OF C. ELEGANS GENES IN THE TREE, SEE
# WHEN THE DUPLICATION OCCURRED:
for ($i = 0; $i <= $#species_nodes; $i++)
{
$species_gene_i = $species_genes[$i];
for ($j = $i+1; $j <= $#species_nodes; $j++)
{
$species_gene_j = $species_genes[$j];
$pair1 = $species_gene_i."_".$species_gene_j;
$pair2 = $species_gene_j."_".$species_gene_i;
$PARALOG{$pair1} = 1;
$PARALOG{$pair2} = 1;
}
}
NEXT_FAMILY:
}
#------------------------------------------------------------------#
# READ IN THE INPUT GFF FILE:
print "GENE PREVGENE NEXTGENE\n";
%GENES = ();
%SEEN = ();
%START = ();
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n";
while(<GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$chr = $temp[0];
$start = $temp[3];
$exon = $temp[8];
# FIND THE GENE NAME:
# eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1
@temp = split(/\;/,$exon);
$gene = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME.
@temp = split(/\./,$gene);
$gene = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b
@temp = split(/=/,$gene);
$gene = $temp[1]; # eg. C49A1.4b
$gene =~ tr/[a-z]/[A-Z]/;
if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME.
# RECORD THE GENE START:
if (!($START{$gene})) { $START{$gene} = $start;}
else { if ($start < $START{$gene}) { $START{$gene} = $start;}}
# RECORD THE GENES ON THIS CHROMOSOME:
if (!($SEEN{$gene}))
{
$SEEN{$gene} = 1;
if (!($GENES{$chr})) { $GENES{$chr} = $gene;}
else {$GENES{$chr} = $GENES{$chr}.",".$gene;}
}
}
close(GFF);
# LOOK AT THE GENES ON EACH CHROMOSOME AT A TIME:
foreach $chr (keys %GENES)
{
$genes = $GENES{$chr};
@genes = split(/\,/,$genes);
@genes = sort by_start @genes;
$prev_gene = "none";
# FOR EACH GENE, FIND THE STRAND OF THE ADJACENT GENES:
for ($i = 0; $i <= $#genes; $i++)
{
$gene = $genes[$i];
# CHECK IF THE PREVIOUS GENE IS A PARALOG OF THIS GENE:
$pair1 = $gene."_".$prev_gene;
$pair2 = $prev_gene."_".$gene;
$prev_paralog = 0;
if ($PARALOG{$pair1}) { $prev_paralog = 1;}
if ($PARALOG{$pair2}) { $prev_paralog = 1;}
# CHECK IF THE NEXT GENE IS A PARALOG OF THIS GENE:
$next_paralog = 0;
if ($i != $#genes)
{
$next_gene = $genes[$i+1];
$pair1 = $gene."_".$next_gene;
$pair2 = $next_gene."_".$gene;
if ($PARALOG{$pair1}) { $next_paralog = 1;}
if ($PARALOG{$pair2}) { $next_paralog = 1;}
}
print "$gene $prev_paralog $next_paralog\n";
$prev_gene = $gene;
}
}
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# SUBROUTINE TO SORT EXONS BY START-POINT:
sub by_start
{
if (!($START{$a})) { print STDERR "ERROR: do not know start of $a.\n"; exit;}
if (!($START{$b})) { print STDERR "ERROR: do not know start of $b.\n"; exit;}
$START{$a} <=> $START{$b}
or
$a cmp $b;
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment