Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 13:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5064688 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5064688 to your computer and use it in GitHub Desktop.
Perl script that retrieves orthologs for a particular pair of species from the TreeFam database, and checks whether the ortholog pair in the two species is flanked by left-hand and right-hand neighbours that are also orthologous
#!/usr/local/bin/perl
#
# Perl script treefam_synteny3.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 12-Sept-06.
#
# For the TreeFam project.
#
# This perl script reads in the positions of genes from the TreeFam
# database, and their orthologs; and checks whether orthology agrees
# with conserved gene order. This perl script uses Jean-Karim's TreeFam
# api.
#
# The command-line format is:
# % perl <treefam_synteny3.pl> <species1> <species2>
# where <species1> is the UniProt identifier for species1,
# <species2> is the UniProt identifier for species2.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 2)
{
print "Usage of treefam_synteny3.pl\n\n";
print "perl treefam_synteny3.pl <species1> <species2>\n";
print "where <species1> is the UniProt identifier for species 1,\n";
print " <species2> is the UniProt identifier for species 2.\n";
print "For example, >perl treefam_synteny3.pl CAEEL CAEBR\n";
exit;
}
# DECLARE MYSQL USERNAME AND HOST:
use strict;
use Treefam::DBConnection;
my $VERBOSE = 0; # IF $VERBOSE==1, PRINT OUT EXTRA DETAILS.
my $dbc = Treefam::DBConnection->new();
# FIND THE UNIPROT IDENTIFIER FOR SPECIES 1:
my $species1 = $ARGV[0];
# FIND THE UNIPROT IDENTIFIER FOR SPECIES 2:
my $species2 = $ARGV[1];
# IF $VERBOSE == 1, PRINT OUT EXTRA INFORMATION:
$VERBOSE = 1;
#------------------------------------------------------------------#
# GET A LIST OF THE GENES IN SPECIES 1 AND SPECIES 2:
my $gh = $dbc->get_GeneHandle();
my @species1_genes = $gh->get_all_by_species($species1);
my @species2_genes = $gh->get_all_by_species($species2);
if ($VERBOSE == 1) { print STDERR "Got list of genes for both species...\n";}
# GET THE LONG SPECIES NAMES:
my $species1_gene = $species1_genes[0];
my $species1_name = $species1_gene->species;
my $species2_gene = $species2_genes[0];
my $species2_name = $species2_gene->species;
# GET THE CHROMOSOMES AND CODING START POSITIONS OF ALL THE GENES IN SPECIES 1 AND 2:
my %START = (); # HASH TABLE FOR RECORDING START-POINTS OF GENES.
my %CHROM = (); # HASH TABLE FOR RECORDING CHROMOSOMES OF GENES.
my $no_genes = 0;
foreach my $gene(@species1_genes,@species2_genes)
{
$no_genes++;
my ($geneID) = $gene->ID;
# FIND THE CHROMOSOME AND CODING START POSITION OF EACH GENE:
my $chrom = $gene->chrom;
my $pos = $gene->pos;
$START{$geneID} = $pos;
$CHROM{$geneID} = $chrom;
if ($no_genes % 1000 == 0)
{
print STDERR "$no_genes: $geneID $chrom $pos ...\n";
}
}
if ($VERBOSE == 1) { print STDERR "Got gene starts for both species...\n";}
# SORT THE GENES IN EACH SPECIES BY CODING START-POSITION ALONG THE CHROMOSOME:
@species1_genes = sort { $START{$a->ID} <=> $START{$b->ID} } @species1_genes;
@species2_genes = sort { $START{$a->ID} <=> $START{$b->ID} } @species2_genes;
if ($VERBOSE == 1) { print STDERR "Sorted genes by start-point for both species...\n";}
# NUMBER GENES IN EACH SPECIES, FROM 1....n ALONG FROM START TO FINISH ALONG
# EACH CHROMOSOME (eg. 1...n ON CHROMOSOME 1, n+1....m ON CHROMOSOME 2, etc.)
# $CHROM_CNT1 HAS THE NUMBER OF EACH GENE ALONG EACH CHROMOSOME IN SPECIES 1.
# $chroms1 IS AN ARRAY OF THE CHROMOSOME NAMES IN SPECIES 1.
(my $CHROM_CNT1,my $chroms1) = &number_genes(\@species1_genes,\%CHROM);
(my $CHROM_CNT2,my $chroms2) = &number_genes(\@species2_genes,\%CHROM);
if ($VERBOSE == 1) { print STDERR "Recorded the number of each gene along each chromosome...\n";}
# CHECK WHETHER THE ORTHOLOG INFORMATION IN TREEFAM IS CONSISTENT WITH
# GENE ORDER:
&check_if_gene_order_and_trees_agree($CHROM_CNT1,$chroms1,$CHROM_CNT2,$chroms2,\%CHROM);
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# CHECK WHETHER THE ORTHOLOG INFORMATION IN TREEFAM IS CONSISTENT WITH
# GENE ORDER. MOVE ALONG EACH CHROMOSOME IN SPECIES 1, AND IF GENE B IN SPECIES
# 1 WITH NEIGHBORS A,C WITH ORTHOLOGS A',C' THAT ARE ONE GENE APART IN SPECIES 2,
# THEN SEE IF GENE B HAS ORTHOLOG B' IN SPECIES 2 WHERE B IS BETWEEN A',C':
# 3-AUG-06.
sub check_if_gene_order_and_trees_agree
{
my $CHROM_CNT1 = $_[0]; # HASH TABLE WITH THE NUMBER OF EACH GENE ALONG EACH CHROMOSOME
# IN SPECIES 1.
my $chroms1 = $_[1]; # AN ARRAY OF THE CHROMOSOMES IN SPECIES 1.
my $CHROM_CNT2 = $_[2]; # HASH TABLE WITH THE NUMBER OF EACH GENE ALONG EACH CHROMOSOME
# IN SPECIES 2.
my $chroms2 = $_[3]; # AN ARRAY OF THE CHROMOSOMES IN SPECIES 2.
my $CHROM = $_[4]; # HASH TABLE OF THE CHROMOSOME THAT A GENE IS ON.
my $chrom2; #
my $chrom1; #
my $i; #
my $j; #
my $chrom_end = 0; #
my $gene1; #
my $prev_gene1; # THE GENE BEFORE $gene1 IN SPECIES 1.
my $next_gene1; # THE GENE AFTER $gene1 IN SPECIES 1.
my $prev_gene2; # THE ORTHOLOG OF $prev_gene1.
my $gene2; # THE ORTHOLOG OF $gene1.
my $next_gene2; # THE ORTHOLOG OF $next_gene1.
my $gene1_cnt; # THE NUMBER ALONG THE CHROMOSOME FOR $gene1.
my $prev_gene1_cnt; #
my $next_gene1_cnt; #
my $gene2_cnt; #
my $prev_gene2_cnt; #
my $next_gene2_cnt; #
my $gene3; #
my $gene3_cnt; #
my @orthologs; #
my $ortholog; #
my $k; #
# FOR EACH CHROMOSOME IN SPECIES 1, FIND TRIOS OF GENES A,B,C AND CHECK
# WHETHER THE ORTHOLOGS A',C' OF A,C ARE ONE GENE APART IN SPECIES 2, AND IF SO,
# CHECK IF B IS ORTHOLOGOUS TO GENE B' (THE GENE BETWEEN A',C'):
for ($i = 0; $i < @$chroms1; $i++)
{
$chrom1 = $chroms1->[$i];
$j = 0;
$chrom_end = 0;
while ($chrom_end == 0) # IF WE HAVE NOT YET REACHED THE END OF THE CHROMOSOME.
{
$j++;
if ($CHROM_CNT1->{$chrom1."=".$j}) # IF WE KNOW THE GENE CORRESPONDING TO THIS NUMBER ALONG THE CHROMOSOME.
{
$gene1 = $CHROM_CNT1->{$chrom1."=".$j};
$gene1_cnt = $j;
if ($VERBOSE == 1) { print "Chromosome $chrom1 gene ",$gene1->ID," count $gene1_cnt ...\n";}
# CHECK IF WE KNOW THE GENE BEFORE $gene1 AND AFTER $gene1 IN SPECIES 1. THIS
# SHOULD BE TRUE FOR ALL GENES EXCEPT THE FIRST AND LAST GENES ON A CHROMOSOME.
$prev_gene1_cnt= $j-1;
$next_gene1_cnt= $j+1;
if ($CHROM_CNT1->{$chrom1."=".$prev_gene1_cnt} && $CHROM_CNT1->{$chrom1."=".$next_gene1_cnt})
{
$prev_gene1 = $CHROM_CNT1->{$chrom1."=".$prev_gene1_cnt};
$next_gene1 = $CHROM_CNT1->{$chrom1."=".$next_gene1_cnt};
if ($VERBOSE == 1) { print "___ species 1: ", $prev_gene1->ID," (cnt $prev_gene1_cnt) ",$gene1->ID," (cnt $gene1_cnt) ",$next_gene1->ID," (cnt $next_gene1_cnt)\n";}
# SEE IF WE KNOW THE ORTHOLOGS OF $prev_gene1, $gene1 AND $next_gene1, INFERRED
# FROM PHYLOGENETIC TREES IN TREEFAM. NOTE THAT ONE GENE CAN HAVE SEVERAL ORTHOLOGS
# IN THE OTHER SPECIES, WHICH WILL BE A LIST, eg. 'T06C10.2,Y73C8C.9'
if (defined($prev_gene1->orthologs($species2_name)))
{
@orthologs = $prev_gene1->orthologs($species2_name);
if ($#orthologs >= 0)
{
$prev_gene2 = "";
for ($k = 0; $k <= $#orthologs; $k++) { $ortholog = $orthologs[$k]; $prev_gene2 = $prev_gene2.",".$ortholog->ID;}
$prev_gene2 = substr($prev_gene2,1,length($prev_gene2)-1);
}
else
{
$prev_gene2 = "none";
}
}
else
{
$prev_gene2 = "none";
}
if (defined($gene1->orthologs($species2_name)))
{
@orthologs = $gene1->orthologs($species2_name);
if ($#orthologs >= 0)
{
$gene2 = "";
for ($k = 0; $k <= $#orthologs; $k++) { $ortholog = $orthologs[$k]; $gene2 = $gene2.",".$ortholog->ID;}
$gene2 = substr($gene2,1,length($gene2)-1);
}
else
{
$gene2 = "none";
}
}
else
{
$gene2 = "none";
}
if (defined($next_gene1->orthologs($species2_name)))
{
@orthologs = $next_gene1->orthologs($species2_name);
if ($#orthologs >= 0)
{
$next_gene2 = "";
for ($k = 0; $k <= $#orthologs; $k++) { $ortholog = $orthologs[$k]; $next_gene2 = $next_gene2.",".$ortholog->ID;}
$next_gene2 = substr($next_gene2,1,length($next_gene2)-1);
}
else
{
$next_gene2 = "none";
}
}
else
{
$next_gene2 = "none";
}
# CHECK IF THE GENES $prev_gene2 AND $next_gene2 ARE ALL ON THE SAME
# CHROMOSOME IN SPECIES 2:
if (!($CHROM->{$prev_gene2})) { $CHROM->{$prev_gene2} = 'none'; }
if (!($CHROM->{$gene2})) { $CHROM->{$gene2} = 'none'; }
if (!($CHROM->{$next_gene2})) { $CHROM->{$next_gene2} = 'none'; }
if ($VERBOSE == 1) { print "___ species 2: $prev_gene2 ($CHROM->{$prev_gene2}) $gene2 ($CHROM->{$gene2}) $next_gene2 ($CHROM->{$next_gene2})\n";}
if ($CHROM->{$prev_gene2} eq $CHROM->{$next_gene2} && $CHROM->{$prev_gene2} ne 'none')
{
$chrom2 = $CHROM->{$prev_gene2};
# FIND THE NUMBER ALONG THE CHROMOSOMES FOR THE GENES $prev_gene2, $gene2 AND
# $next_gene2 IN SPECIES 2. IF THERE ARE SEVERAL ORTHOLOGS FOR A SPECIES-1 GENE
# IN SPECIES 2, THEN THE CHROMOSOME NUMBER OF THE SPECIES 2 ORTHOLOG WILL BE UNDEFINED
# (SET TO 'none' HERE):
if ($CHROM_CNT2->{$chrom2."=".$prev_gene2}) { $prev_gene2_cnt = $CHROM_CNT2->{$chrom2."=".$prev_gene2};}
else { $prev_gene2_cnt = 'none'; }
if ($CHROM_CNT2->{$chrom2."=".$gene2}) { $gene2_cnt = $CHROM_CNT2->{$chrom2."=".$gene2}; }
else { $gene2_cnt = 'none'; }
if ($CHROM_CNT2->{$chrom2."=".$next_gene2}) { $next_gene2_cnt = $CHROM_CNT2->{$chrom2."=".$next_gene2};}
else { $next_gene2_cnt = 'none'; }
if ($VERBOSE == 1) { print "___ species 2: $prev_gene2 (cnt $prev_gene2_cnt) $gene2 (cnt $gene2_cnt) $next_gene2 (cnt $next_gene2_cnt)\n";}
# CHECK WHETHER THE ORTHOLOGS INFERRED FROM GENE ORDER AND PHYLOGENETIC TREES AGREE:
if ($prev_gene2_cnt ne 'none' && $next_gene2_cnt ne 'none')
# IF WE HAVE INFERRED ONE ORTHOLOG FOR $prev_gene1 AND ONE ORTHOLOG FOR $next_gene1, IN SPECIES 2:
{
if ($prev_gene2_cnt - $next_gene2_cnt == 2 || $next_gene2_cnt - $prev_gene2_cnt == 2)
{
# GENES $prev_gene1 AND $next_gene1 ARE ONE GENE APART IN SPECIES 1, AND
# THEIR ORTHOLOGS $prev_gene2 AND $next_gene2 ARE ONE GENE APART IN SPECIES 2:
if ($gene2_cnt eq 'none' && $gene2 ne 'none')
{
# A SINGLE ORTHOLOG IS INFERRED FROM GENE ORDER BUT SEVERAL FROM PHYLOGENETIC TREES, OR
# $gene2 IS NOT ON THE SAME CHROMOSOME AS $prev_gene2:
$gene3_cnt = ($prev_gene2_cnt + $next_gene2_cnt)/2;
if (!($CHROM_CNT2->{$chrom2."=".$gene3_cnt})) { print STDERR "ERROR: do not know gene $gene3_cnt.\n"; exit;}
$gene3 = $CHROM_CNT2->{$chrom2."=".$gene3_cnt};
print "DISAGREE: the positional? ortholog for ",$gene1->ID," inferred from gene order is ",$gene3->ID,", but sev orths $gene2 from trees!!!\n";
}
elsif ($gene2_cnt eq 'none' && $gene2 eq 'none')
{
# AN ORTHOLOG IS INFERRED FROM GENE ORDER BUT NONE FROM PHYLOGENETIC TREES:
$gene3_cnt = ($prev_gene2_cnt + $next_gene2_cnt)/2;
if (!($CHROM_CNT2->{$chrom2."=".$gene3_cnt})) { print STDERR "ERROR: do not know gene $gene3_cnt.\n"; exit;}
$gene3 = $CHROM_CNT2->{$chrom2."=".$gene3_cnt};
print "DISAGREE: the distant? ortholog for ",$gene1->ID," inferred from gene order is ",$gene3->ID,", but $gene2 from trees!!!\n";
}
elsif ($gene2_cnt == ($prev_gene2_cnt + $next_gene2_cnt)/2)
{
# THE ORTHOLOGS INFERRED FROM GENE ORDER AGREE WITH THOSE FROM PHYLOGENETIC TREES:
print "AGREE: the reliable ortholog for ",$gene1->ID," inferred from both gene order and trees is $gene2\n";
}
else
{
# THE ORTHOLOGS INFERRED FROM GENE ORDER DISAGREE WITH THOSE FROM PHYLOGENETIC TREES:
$gene3_cnt = ($prev_gene2_cnt + $next_gene2_cnt)/2;
if (!($CHROM_CNT2->{$chrom2."=".$gene3_cnt})) { print STDERR "ERROR: do not know gene $gene3_cnt.\n"; exit;}
$gene3 = $CHROM_CNT2->{$chrom2."=".$gene3_cnt};
print "DISAGREE: the ortholog for ",$gene1->ID," inferred from gene order is ",$gene3->ID,", but from trees is $gene2!!!\n";
}
}
}
}
}
}
else # WE HAVE REACHED THE END OF THE CHROMOSOME.
{
$chrom_end = 1;
goto CHROM_END;
}
}
CHROM_END:
}
}
#------------------------------------------------------------------#
# SUBROUTINE TO NUMBER GENES IN EACH SPECIES, FROM 1....n ALONG FROM START TO FINISH ALONG
# EACH CHROMOSOME (eg. 1...n ON CHROMOSOME 1, n+1....m ON CHROMOSOME 2, etc.)
# 3-AUG-06.
sub number_genes
{
my $genes = $_[0]; # A POINTER TO A LIST OF GENES IN THIS SPECIES,
# SORTED BY CODING START POINT.
my $CHROM = $_[1]; # A POINTER TO A HASH TABLE OF CHROMOSOMES FOR GENES.
my $i; #
my $gene; #
my $chrom; #
my %CHROM_CNT = (); # HASH TABLE TO RECORD THE NUMBER OF EACH GENE ALONG EACH CHROMOSOME.
my @chroms; # ARRAY OF THE CHROMOSOME IN THIS SPECIES.
my $no_genes; # NUMBER OF GENES IN A CERTAIN CHROMOSOME.
my %SEEN = (); #
my $geneID; #
for ($i = 0; $i < @$genes; $i++)
{
$gene = $genes->[$i];
$geneID = $gene->ID;
if (!($CHROM->{$geneID})) { print STDERR "ERROR: do not know chromosome for $geneID.\n"; exit;}
$chrom = $CHROMhttps://gist.github.com/avrilcoghlan/5064651->{$geneID};
if (!($SEEN{$chrom}))
{
@chroms = (@chroms,$chrom);
$SEEN{$chrom} = 1;
}
# FIND THE NUMBER OF GENES SEEN IN CHROMOSOME $chrom:
if (!($CHROM_CNT{$chrom})) { $CHROM_CNT{$chrom} = 1; $no_genes = $CHROM_CNT{$chrom};}
else { $CHROM_CNT{$chrom}++; $no_genes = $CHROM_CNT{$chrom};}
# RECORD THE NUMBER OF THIS GENE ALONG CHROMOSOME $chrom:
$CHROM_CNT{$chrom."=".$no_genes}= $gene;
$CHROM_CNT{$chrom."=".$geneID}= $no_genes;
}
return(\%CHROM_CNT,\@chroms);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment