Last active
August 29, 2015 14:00
-
-
Save Buttonwood/b61530fdea3952a93e70 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#============================================================================= | |
# FileName: add_taxonomy_info.pl | |
# Desc: Add taxonomy information for a blast nt/nr table file. | |
# Author: tanhao | |
# Email: tanhao2013@gmail.com | |
# HomePage: http://buttonwood.github.io | |
# Version: 0.0.1 | |
# LastChange: 2013-05-05 11:35:12 | |
# History: | |
#============================================================================= | |
#! /usr/bin/perl -w | |
use strict; | |
die("Usage:\n\tperl $0 blast.table gi_colum\n")unless(@ARGV >= 1); | |
my $dir = `dirname $0`; | |
chop($dir); | |
my %gi_hash; | |
my %gi_tax_hash; | |
my %tax_name_hash; | |
my %node_hash; | |
my $blast_result = "$ARGV[0]"; | |
my $gi_tax = "$dir/gi_taxid_nucl.dmp"; | |
my $tax_name = "$dir/names.dmp"; | |
my $node = "$dir/nodes.dmp"; | |
my $gi_colum = ($ARGV[1] - 1) || 1; | |
my $outfile = $blast_result + ".info"; | |
open (BL,"$blast_result") or die ("can not open $blast_result\n"); | |
while(<BL>){ | |
chomp; | |
my @line= split /\t/; | |
$gi_hash{$1} = $2 if( $line[$gi_colum] =~ /gi\|(\d+)\|.+\|*(.+)\|/); | |
} | |
close BL; | |
if ($gi_tax=~/\.gz$/){ | |
open (GT,"gzip -dc $gi_tax |") or die ("can not open $gi_tax\n"); | |
}else{ | |
open (GT,"$gi_tax ") or die ("can not open $gi_tax\n"); | |
} | |
while(<GT>){ | |
chomp; | |
my @line= split /\t/; | |
$gi_tax_hash{$line[0]} = $line[1] if(exists $gi_hash{$line[0]}); | |
} | |
close GT; | |
open (TN, "$tax_name"); | |
while(<TN>){ | |
chomp; | |
my @line= split /\t\|\t/; | |
$line[3] =~ s/\t\|//g; | |
$tax_name_hash{$line[0]} = $line[1] if($line[3] eq "scientific name"); | |
} | |
close TN; | |
open (NOD,"$node"); | |
while(<NOD>){ | |
chomp; | |
my @line = split /\t\|\t/; | |
$node_hash{$line[0]}= $line[1]; | |
} | |
close NOD; | |
open (BL,"$blast_result"); | |
open (NAM,">>$outfile"); | |
while(<BL>){ | |
chomp; | |
my @line = split /\t/; | |
$line[$gi_colum] =~ /gi\|(\d+)\|.+\|*(.+)\|/; | |
my $gi=$1; my $id= $2; | |
my $tax= $gi_tax_hash{$gi}; | |
my $pretax= $node_hash{$tax}; | |
if(!$tax){$tax=0; $pretax=1;} | |
my @alltax; | |
push @alltax, $tax; | |
while($pretax!=1){ | |
push @alltax, $pretax; | |
$pretax= $node_hash{$pretax}; | |
} | |
foreach my $ele (@line){ print NAM "$ele\t";} | |
foreach my $tele (@alltax){print NAM "$tax_name_hash{$tele}|";} | |
print NAM "\n"; | |
} | |
close BL; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
wget -r ftp://ftp.ncbi.nih.gov/pub/taxonomy/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@shinout You‘ve done a nice work!