Last active
September 11, 2023 07:34
-
-
Save mdondrup/d4e3d869bb83be14687bfb88172f26ae to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
#### This is now becoming a flexible and chaotic Taxonomy tool ;) | |
use strict; | |
use warnings; | |
use Bio::DB::Taxonomy; | |
use Bio::Taxonomy::Taxon; | |
use Carp qw |confess croak|; | |
use Cwd qw |cwd|; | |
my @nodes = (); | |
use Getopt::Std; | |
our($opt_d, $opt_t, $opt_q, $opt_f, $opt_g, $opt_G, $opt_R); # t for taxids only (for ITOL) | |
getopts('d:tqf:g:GR'); | |
my $idx_dir = $opt_d || cwd(); | |
my ($nodefile,$namesfile) = ('nodes.dmp','names.dmp'); | |
my $db; | |
if (-f $idx_dir.$nodefile and -f $idx_dir.$namesfile) { | |
$db = new Bio::DB::Taxonomy(-source => 'flatfile', | |
-nodesfile =>$idx_dir. $nodefile, | |
-namesfile => $idx_dir.$namesfile); | |
} else { | |
warn "nodesfile or namesfile not found, using entrez online data\n"; | |
$db = new Bio::DB::Taxonomy((-source => 'entrez')); | |
} | |
#my $arthropod = $db->get_Taxonomy_Node(-name=>'Arthropoda'); | |
#my $lep = $db->get_Taxonomy_Node(-name=>'Lepeophtheirus'); | |
my %gi_taxa = (); | |
my @qnodes = (); | |
my @taxa = (); | |
### Allow reading taxon list from text file. One Taxon per line. | |
if ($opt_f) { | |
open TAX, $opt_f or die "couldn't open taxon file $!"; | |
while (<TAX>) { | |
chomp; | |
push @taxa, $_; | |
} | |
} else { | |
@taxa = @ARGV; | |
} | |
foreach (@taxa) { | |
my $node1; | |
if (/^\d+$/){ | |
$node1 = $db->get_Taxonomy_Node(-taxonid => $_); | |
} else { | |
s/[^\w\d\s]/ /g; $node1 = $db->get_Taxonomy_Node(-name => $_); | |
}# look for the name | |
die "$_ not found\n" unless ref $node1; | |
if ($opt_t) { | |
print ($node1->id. "\n"); | |
next; | |
} elsif ($opt_q) { | |
push @qnodes, "txid".$node1->id."[Organism]"; | |
if (scalar @qnodes == scalar @ARGV) { | |
print join (" OR ", @qnodes ), "\n"; | |
} | |
next; | |
} elsif ($opt_g) { | |
## make a gi list instead by storing the taxids in the hash | |
$gi_taxa{$node1->id} = (); | |
if ($opt_R) { | |
## add all descendents also: | |
map {$gi_taxa{$_->id} = ()} $node1->get_Descendents(); | |
} | |
next; | |
} else { | |
print "\n######################### $_ ##############################\n"; | |
unless (ref $node1) { (print " '$_' not found\n"); next; }; | |
my $tree1 = Bio::Tree::Tree->new(-node => $node1); | |
my @lineage = $tree1->get_lineage_nodes($node1); | |
@lineage = map {$_->scientific_name} @lineage; | |
my $line = join ':', @lineage; | |
my $phylum = $tree1->find_node(-rank => 'phylum'); | |
print "ID: ". $node1->id, " Sci.name: ", $node1->scientific_name, "\n [", join " ", ($node1->common_name), "]\n Phylum: ".(($phylum) ? $phylum->scientific_name : "NA"). "\n $line". "\n"; | |
push @nodes, $node1; | |
} | |
} | |
if ($opt_g) { | |
### open gi to tax mapping | |
open GI, $opt_g or die "couldn't open GI file $!"; | |
GI: while (<GI>) { | |
chomp; | |
## adapted to the 4-column format, but uses only the accessions | |
my ($acc, $accv, $tax, $gi) = split; | |
next GI unless exists $gi_taxa{$tax}; ## store only relevant taxa | |
push @{$gi_taxa{$tax}}, $acc; | |
} | |
print STDERR "read gis for ". scalar keys (%gi_taxa). " taxa.\n"; | |
foreach my $taxi (keys %gi_taxa) { | |
if ($opt_G) { | |
print "Found ". scalar @{$gi_taxa{$taxi}}. " for $taxi\n"; | |
} else { | |
print (join "\n", @{$gi_taxa{$taxi}},"\n") ; | |
} | |
} | |
} | |
exit unless @nodes >= 2 ; | |
my $tree = Bio::Tree::Tree->new(); | |
my $lca = $tree->get_lca(@nodes); | |
print "\n############################ LCA #################################\n"; | |
print "no LCA!" unless ref $lca; | |
print "LCA: ".$lca-> ncbi_taxid." ". $lca->scientific_name()."\n"; | |
# print "LCA-is subtree of Arthropoda: ".(allSubTax($arthropod, @nodes) ? "yes":"no"), "\n"; | |
#my $tree2 = Bio::Tree::Tree->new(-node=>$lca); | |
my @lineage = $tree->get_lineage_nodes($lca); | |
confess "empty lineage \n" unless @lineage; | |
my @line = map {$_->scientific_name} (@lineage,$lca); | |
print "Lineage of LCA: ", ( join ":", @line), "\n"; | |
sub allSubTax { | |
#my $taxname = shift; | |
my $tree = Bio::Tree::Tree->new(); | |
my $cPhylum = shift; | |
my $isCPhylum = $tree->get_lca(@_, $cPhylum); | |
return ($isCPhylum->id == $cPhylum->id); | |
} | |
__END__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment