Created
March 1, 2013 13:33
-
-
Save avrilcoghlan/5064688 to your computer and use it in GitHub Desktop.
Perl script that retrieves orthologs for a particular pair of species from the TreeFam database, and checks whether the ortholog pair in the two species is flanked by left-hand and right-hand neighbours that are also orthologous
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_synteny3.pl | |
# Written by Avril Coghlan (alc@sanger.ac.uk) | |
# 12-Sept-06. | |
# | |
# For the TreeFam project. | |
# | |
# This perl script reads in the positions of genes from the TreeFam | |
# database, and their orthologs; and checks whether orthology agrees | |
# with conserved gene order. This perl script uses Jean-Karim's TreeFam | |
# api. | |
# | |
# The command-line format is: | |
# % perl <treefam_synteny3.pl> <species1> <species2> | |
# where <species1> is the UniProt identifier for species1, | |
# <species2> is the UniProt identifier for species2. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 2) | |
{ | |
print "Usage of treefam_synteny3.pl\n\n"; | |
print "perl treefam_synteny3.pl <species1> <species2>\n"; | |
print "where <species1> is the UniProt identifier for species 1,\n"; | |
print " <species2> is the UniProt identifier for species 2.\n"; | |
print "For example, >perl treefam_synteny3.pl CAEEL CAEBR\n"; | |
exit; | |
} | |
# DECLARE MYSQL USERNAME AND HOST: | |
use strict; | |
use Treefam::DBConnection; | |
my $VERBOSE = 0; # IF $VERBOSE==1, PRINT OUT EXTRA DETAILS. | |
my $dbc = Treefam::DBConnection->new(); | |
# FIND THE UNIPROT IDENTIFIER FOR SPECIES 1: | |
my $species1 = $ARGV[0]; | |
# FIND THE UNIPROT IDENTIFIER FOR SPECIES 2: | |
my $species2 = $ARGV[1]; | |
# IF $VERBOSE == 1, PRINT OUT EXTRA INFORMATION: | |
$VERBOSE = 1; | |
#------------------------------------------------------------------# | |
# GET A LIST OF THE GENES IN SPECIES 1 AND SPECIES 2: | |
my $gh = $dbc->get_GeneHandle(); | |
my @species1_genes = $gh->get_all_by_species($species1); | |
my @species2_genes = $gh->get_all_by_species($species2); | |
if ($VERBOSE == 1) { print STDERR "Got list of genes for both species...\n";} | |
# GET THE LONG SPECIES NAMES: | |
my $species1_gene = $species1_genes[0]; | |
my $species1_name = $species1_gene->species; | |
my $species2_gene = $species2_genes[0]; | |
my $species2_name = $species2_gene->species; | |
# GET THE CHROMOSOMES AND CODING START POSITIONS OF ALL THE GENES IN SPECIES 1 AND 2: | |
my %START = (); # HASH TABLE FOR RECORDING START-POINTS OF GENES. | |
my %CHROM = (); # HASH TABLE FOR RECORDING CHROMOSOMES OF GENES. | |
my $no_genes = 0; | |
foreach my $gene(@species1_genes,@species2_genes) | |
{ | |
$no_genes++; | |
my ($geneID) = $gene->ID; | |
# FIND THE CHROMOSOME AND CODING START POSITION OF EACH GENE: | |
my $chrom = $gene->chrom; | |
my $pos = $gene->pos; | |
$START{$geneID} = $pos; | |
$CHROM{$geneID} = $chrom; | |
if ($no_genes % 1000 == 0) | |
{ | |
print STDERR "$no_genes: $geneID $chrom $pos ...\n"; | |
} | |
} | |
if ($VERBOSE == 1) { print STDERR "Got gene starts for both species...\n";} | |
# SORT THE GENES IN EACH SPECIES BY CODING START-POSITION ALONG THE CHROMOSOME: | |
@species1_genes = sort { $START{$a->ID} <=> $START{$b->ID} } @species1_genes; | |
@species2_genes = sort { $START{$a->ID} <=> $START{$b->ID} } @species2_genes; | |
if ($VERBOSE == 1) { print STDERR "Sorted genes by start-point for both species...\n";} | |
# NUMBER GENES IN EACH SPECIES, FROM 1....n ALONG FROM START TO FINISH ALONG | |
# EACH CHROMOSOME (eg. 1...n ON CHROMOSOME 1, n+1....m ON CHROMOSOME 2, etc.) | |
# $CHROM_CNT1 HAS THE NUMBER OF EACH GENE ALONG EACH CHROMOSOME IN SPECIES 1. | |
# $chroms1 IS AN ARRAY OF THE CHROMOSOME NAMES IN SPECIES 1. | |
(my $CHROM_CNT1,my $chroms1) = &number_genes(\@species1_genes,\%CHROM); | |
(my $CHROM_CNT2,my $chroms2) = &number_genes(\@species2_genes,\%CHROM); | |
if ($VERBOSE == 1) { print STDERR "Recorded the number of each gene along each chromosome...\n";} | |
# CHECK WHETHER THE ORTHOLOG INFORMATION IN TREEFAM IS CONSISTENT WITH | |
# GENE ORDER: | |
&check_if_gene_order_and_trees_agree($CHROM_CNT1,$chroms1,$CHROM_CNT2,$chroms2,\%CHROM); | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# CHECK WHETHER THE ORTHOLOG INFORMATION IN TREEFAM IS CONSISTENT WITH | |
# GENE ORDER. MOVE ALONG EACH CHROMOSOME IN SPECIES 1, AND IF GENE B IN SPECIES | |
# 1 WITH NEIGHBORS A,C WITH ORTHOLOGS A',C' THAT ARE ONE GENE APART IN SPECIES 2, | |
# THEN SEE IF GENE B HAS ORTHOLOG B' IN SPECIES 2 WHERE B IS BETWEEN A',C': | |
# 3-AUG-06. | |
sub check_if_gene_order_and_trees_agree | |
{ | |
my $CHROM_CNT1 = $_[0]; # HASH TABLE WITH THE NUMBER OF EACH GENE ALONG EACH CHROMOSOME | |
# IN SPECIES 1. | |
my $chroms1 = $_[1]; # AN ARRAY OF THE CHROMOSOMES IN SPECIES 1. | |
my $CHROM_CNT2 = $_[2]; # HASH TABLE WITH THE NUMBER OF EACH GENE ALONG EACH CHROMOSOME | |
# IN SPECIES 2. | |
my $chroms2 = $_[3]; # AN ARRAY OF THE CHROMOSOMES IN SPECIES 2. | |
my $CHROM = $_[4]; # HASH TABLE OF THE CHROMOSOME THAT A GENE IS ON. | |
my $chrom2; # | |
my $chrom1; # | |
my $i; # | |
my $j; # | |
my $chrom_end = 0; # | |
my $gene1; # | |
my $prev_gene1; # THE GENE BEFORE $gene1 IN SPECIES 1. | |
my $next_gene1; # THE GENE AFTER $gene1 IN SPECIES 1. | |
my $prev_gene2; # THE ORTHOLOG OF $prev_gene1. | |
my $gene2; # THE ORTHOLOG OF $gene1. | |
my $next_gene2; # THE ORTHOLOG OF $next_gene1. | |
my $gene1_cnt; # THE NUMBER ALONG THE CHROMOSOME FOR $gene1. | |
my $prev_gene1_cnt; # | |
my $next_gene1_cnt; # | |
my $gene2_cnt; # | |
my $prev_gene2_cnt; # | |
my $next_gene2_cnt; # | |
my $gene3; # | |
my $gene3_cnt; # | |
my @orthologs; # | |
my $ortholog; # | |
my $k; # | |
# FOR EACH CHROMOSOME IN SPECIES 1, FIND TRIOS OF GENES A,B,C AND CHECK | |
# WHETHER THE ORTHOLOGS A',C' OF A,C ARE ONE GENE APART IN SPECIES 2, AND IF SO, | |
# CHECK IF B IS ORTHOLOGOUS TO GENE B' (THE GENE BETWEEN A',C'): | |
for ($i = 0; $i < @$chroms1; $i++) | |
{ | |
$chrom1 = $chroms1->[$i]; | |
$j = 0; | |
$chrom_end = 0; | |
while ($chrom_end == 0) # IF WE HAVE NOT YET REACHED THE END OF THE CHROMOSOME. | |
{ | |
$j++; | |
if ($CHROM_CNT1->{$chrom1."=".$j}) # IF WE KNOW THE GENE CORRESPONDING TO THIS NUMBER ALONG THE CHROMOSOME. | |
{ | |
$gene1 = $CHROM_CNT1->{$chrom1."=".$j}; | |
$gene1_cnt = $j; | |
if ($VERBOSE == 1) { print "Chromosome $chrom1 gene ",$gene1->ID," count $gene1_cnt ...\n";} | |
# CHECK IF WE KNOW THE GENE BEFORE $gene1 AND AFTER $gene1 IN SPECIES 1. THIS | |
# SHOULD BE TRUE FOR ALL GENES EXCEPT THE FIRST AND LAST GENES ON A CHROMOSOME. | |
$prev_gene1_cnt= $j-1; | |
$next_gene1_cnt= $j+1; | |
if ($CHROM_CNT1->{$chrom1."=".$prev_gene1_cnt} && $CHROM_CNT1->{$chrom1."=".$next_gene1_cnt}) | |
{ | |
$prev_gene1 = $CHROM_CNT1->{$chrom1."=".$prev_gene1_cnt}; | |
$next_gene1 = $CHROM_CNT1->{$chrom1."=".$next_gene1_cnt}; | |
if ($VERBOSE == 1) { print "___ species 1: ", $prev_gene1->ID," (cnt $prev_gene1_cnt) ",$gene1->ID," (cnt $gene1_cnt) ",$next_gene1->ID," (cnt $next_gene1_cnt)\n";} | |
# SEE IF WE KNOW THE ORTHOLOGS OF $prev_gene1, $gene1 AND $next_gene1, INFERRED | |
# FROM PHYLOGENETIC TREES IN TREEFAM. NOTE THAT ONE GENE CAN HAVE SEVERAL ORTHOLOGS | |
# IN THE OTHER SPECIES, WHICH WILL BE A LIST, eg. 'T06C10.2,Y73C8C.9' | |
if (defined($prev_gene1->orthologs($species2_name))) | |
{ | |
@orthologs = $prev_gene1->orthologs($species2_name); | |
if ($#orthologs >= 0) | |
{ | |
$prev_gene2 = ""; | |
for ($k = 0; $k <= $#orthologs; $k++) { $ortholog = $orthologs[$k]; $prev_gene2 = $prev_gene2.",".$ortholog->ID;} | |
$prev_gene2 = substr($prev_gene2,1,length($prev_gene2)-1); | |
} | |
else | |
{ | |
$prev_gene2 = "none"; | |
} | |
} | |
else | |
{ | |
$prev_gene2 = "none"; | |
} | |
if (defined($gene1->orthologs($species2_name))) | |
{ | |
@orthologs = $gene1->orthologs($species2_name); | |
if ($#orthologs >= 0) | |
{ | |
$gene2 = ""; | |
for ($k = 0; $k <= $#orthologs; $k++) { $ortholog = $orthologs[$k]; $gene2 = $gene2.",".$ortholog->ID;} | |
$gene2 = substr($gene2,1,length($gene2)-1); | |
} | |
else | |
{ | |
$gene2 = "none"; | |
} | |
} | |
else | |
{ | |
$gene2 = "none"; | |
} | |
if (defined($next_gene1->orthologs($species2_name))) | |
{ | |
@orthologs = $next_gene1->orthologs($species2_name); | |
if ($#orthologs >= 0) | |
{ | |
$next_gene2 = ""; | |
for ($k = 0; $k <= $#orthologs; $k++) { $ortholog = $orthologs[$k]; $next_gene2 = $next_gene2.",".$ortholog->ID;} | |
$next_gene2 = substr($next_gene2,1,length($next_gene2)-1); | |
} | |
else | |
{ | |
$next_gene2 = "none"; | |
} | |
} | |
else | |
{ | |
$next_gene2 = "none"; | |
} | |
# CHECK IF THE GENES $prev_gene2 AND $next_gene2 ARE ALL ON THE SAME | |
# CHROMOSOME IN SPECIES 2: | |
if (!($CHROM->{$prev_gene2})) { $CHROM->{$prev_gene2} = 'none'; } | |
if (!($CHROM->{$gene2})) { $CHROM->{$gene2} = 'none'; } | |
if (!($CHROM->{$next_gene2})) { $CHROM->{$next_gene2} = 'none'; } | |
if ($VERBOSE == 1) { print "___ species 2: $prev_gene2 ($CHROM->{$prev_gene2}) $gene2 ($CHROM->{$gene2}) $next_gene2 ($CHROM->{$next_gene2})\n";} | |
if ($CHROM->{$prev_gene2} eq $CHROM->{$next_gene2} && $CHROM->{$prev_gene2} ne 'none') | |
{ | |
$chrom2 = $CHROM->{$prev_gene2}; | |
# FIND THE NUMBER ALONG THE CHROMOSOMES FOR THE GENES $prev_gene2, $gene2 AND | |
# $next_gene2 IN SPECIES 2. IF THERE ARE SEVERAL ORTHOLOGS FOR A SPECIES-1 GENE | |
# IN SPECIES 2, THEN THE CHROMOSOME NUMBER OF THE SPECIES 2 ORTHOLOG WILL BE UNDEFINED | |
# (SET TO 'none' HERE): | |
if ($CHROM_CNT2->{$chrom2."=".$prev_gene2}) { $prev_gene2_cnt = $CHROM_CNT2->{$chrom2."=".$prev_gene2};} | |
else { $prev_gene2_cnt = 'none'; } | |
if ($CHROM_CNT2->{$chrom2."=".$gene2}) { $gene2_cnt = $CHROM_CNT2->{$chrom2."=".$gene2}; } | |
else { $gene2_cnt = 'none'; } | |
if ($CHROM_CNT2->{$chrom2."=".$next_gene2}) { $next_gene2_cnt = $CHROM_CNT2->{$chrom2."=".$next_gene2};} | |
else { $next_gene2_cnt = 'none'; } | |
if ($VERBOSE == 1) { print "___ species 2: $prev_gene2 (cnt $prev_gene2_cnt) $gene2 (cnt $gene2_cnt) $next_gene2 (cnt $next_gene2_cnt)\n";} | |
# CHECK WHETHER THE ORTHOLOGS INFERRED FROM GENE ORDER AND PHYLOGENETIC TREES AGREE: | |
if ($prev_gene2_cnt ne 'none' && $next_gene2_cnt ne 'none') | |
# IF WE HAVE INFERRED ONE ORTHOLOG FOR $prev_gene1 AND ONE ORTHOLOG FOR $next_gene1, IN SPECIES 2: | |
{ | |
if ($prev_gene2_cnt - $next_gene2_cnt == 2 || $next_gene2_cnt - $prev_gene2_cnt == 2) | |
{ | |
# GENES $prev_gene1 AND $next_gene1 ARE ONE GENE APART IN SPECIES 1, AND | |
# THEIR ORTHOLOGS $prev_gene2 AND $next_gene2 ARE ONE GENE APART IN SPECIES 2: | |
if ($gene2_cnt eq 'none' && $gene2 ne 'none') | |
{ | |
# A SINGLE ORTHOLOG IS INFERRED FROM GENE ORDER BUT SEVERAL FROM PHYLOGENETIC TREES, OR | |
# $gene2 IS NOT ON THE SAME CHROMOSOME AS $prev_gene2: | |
$gene3_cnt = ($prev_gene2_cnt + $next_gene2_cnt)/2; | |
if (!($CHROM_CNT2->{$chrom2."=".$gene3_cnt})) { print STDERR "ERROR: do not know gene $gene3_cnt.\n"; exit;} | |
$gene3 = $CHROM_CNT2->{$chrom2."=".$gene3_cnt}; | |
print "DISAGREE: the positional? ortholog for ",$gene1->ID," inferred from gene order is ",$gene3->ID,", but sev orths $gene2 from trees!!!\n"; | |
} | |
elsif ($gene2_cnt eq 'none' && $gene2 eq 'none') | |
{ | |
# AN ORTHOLOG IS INFERRED FROM GENE ORDER BUT NONE FROM PHYLOGENETIC TREES: | |
$gene3_cnt = ($prev_gene2_cnt + $next_gene2_cnt)/2; | |
if (!($CHROM_CNT2->{$chrom2."=".$gene3_cnt})) { print STDERR "ERROR: do not know gene $gene3_cnt.\n"; exit;} | |
$gene3 = $CHROM_CNT2->{$chrom2."=".$gene3_cnt}; | |
print "DISAGREE: the distant? ortholog for ",$gene1->ID," inferred from gene order is ",$gene3->ID,", but $gene2 from trees!!!\n"; | |
} | |
elsif ($gene2_cnt == ($prev_gene2_cnt + $next_gene2_cnt)/2) | |
{ | |
# THE ORTHOLOGS INFERRED FROM GENE ORDER AGREE WITH THOSE FROM PHYLOGENETIC TREES: | |
print "AGREE: the reliable ortholog for ",$gene1->ID," inferred from both gene order and trees is $gene2\n"; | |
} | |
else | |
{ | |
# THE ORTHOLOGS INFERRED FROM GENE ORDER DISAGREE WITH THOSE FROM PHYLOGENETIC TREES: | |
$gene3_cnt = ($prev_gene2_cnt + $next_gene2_cnt)/2; | |
if (!($CHROM_CNT2->{$chrom2."=".$gene3_cnt})) { print STDERR "ERROR: do not know gene $gene3_cnt.\n"; exit;} | |
$gene3 = $CHROM_CNT2->{$chrom2."=".$gene3_cnt}; | |
print "DISAGREE: the ortholog for ",$gene1->ID," inferred from gene order is ",$gene3->ID,", but from trees is $gene2!!!\n"; | |
} | |
} | |
} | |
} | |
} | |
} | |
else # WE HAVE REACHED THE END OF THE CHROMOSOME. | |
{ | |
$chrom_end = 1; | |
goto CHROM_END; | |
} | |
} | |
CHROM_END: | |
} | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO NUMBER GENES IN EACH SPECIES, FROM 1....n ALONG FROM START TO FINISH ALONG | |
# EACH CHROMOSOME (eg. 1...n ON CHROMOSOME 1, n+1....m ON CHROMOSOME 2, etc.) | |
# 3-AUG-06. | |
sub number_genes | |
{ | |
my $genes = $_[0]; # A POINTER TO A LIST OF GENES IN THIS SPECIES, | |
# SORTED BY CODING START POINT. | |
my $CHROM = $_[1]; # A POINTER TO A HASH TABLE OF CHROMOSOMES FOR GENES. | |
my $i; # | |
my $gene; # | |
my $chrom; # | |
my %CHROM_CNT = (); # HASH TABLE TO RECORD THE NUMBER OF EACH GENE ALONG EACH CHROMOSOME. | |
my @chroms; # ARRAY OF THE CHROMOSOME IN THIS SPECIES. | |
my $no_genes; # NUMBER OF GENES IN A CERTAIN CHROMOSOME. | |
my %SEEN = (); # | |
my $geneID; # | |
for ($i = 0; $i < @$genes; $i++) | |
{ | |
$gene = $genes->[$i]; | |
$geneID = $gene->ID; | |
if (!($CHROM->{$geneID})) { print STDERR "ERROR: do not know chromosome for $geneID.\n"; exit;} | |
$chrom = $CHROMhttps://gist.github.com/avrilcoghlan/5064651->{$geneID}; | |
if (!($SEEN{$chrom})) | |
{ | |
@chroms = (@chroms,$chrom); | |
$SEEN{$chrom} = 1; | |
} | |
# FIND THE NUMBER OF GENES SEEN IN CHROMOSOME $chrom: | |
if (!($CHROM_CNT{$chrom})) { $CHROM_CNT{$chrom} = 1; $no_genes = $CHROM_CNT{$chrom};} | |
else { $CHROM_CNT{$chrom}++; $no_genes = $CHROM_CNT{$chrom};} | |
# RECORD THE NUMBER OF THIS GENE ALONG CHROMOSOME $chrom: | |
$CHROM_CNT{$chrom."=".$no_genes}= $gene; | |
$CHROM_CNT{$chrom."=".$geneID}= $no_genes; | |
} | |
return(\%CHROM_CNT,\@chroms); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment