ktym/biogitax.rb

## biogitax.rb
#!/usr/bin/env ruby-1.9

ENV["BIO_GITAX_DB"] = "#{ENV['HOME']}/ncbi/taxonomy/gitax"

require "rubygems"
require "open-uri"
require "fileutils"

begin
  require "kyotocabinet"
rescue LoadError
  puts <<INSTALL_KYOTO_CABINET

Please install Kyoto Cabinet available at http://fallabs.com/kyotocabinet/
before using this program.

1. Install the C library from http://fallabs.com/kyotocabinet/pkg/
   by running "./configure; make; make install"

2. Then, install the Ruby library from http://fallabs.com/kyotocabinet/rubypkg/
   by running "ruby extconf.rb; make; make install"

INSTALL_KYOTO_CABINET

  exit 1
end

def help
  puts <<HELP

Synopsis:

  Classify lines in the given file by the given taxids and summarize
  the taxonomic distribution. Useful for clastering the BLAST results
  into the givein clades etc.

    % biogitax file taxid1 [ taxid2 taxid3 ... ] > file.filter 2> file.summary

Prerequirement:

  You need to install Kyoto Cabinet (http://fallabs.com/kyotocabinet/) before
  using this program.

Database preparation:

  Download taxonomy data from NCBI.

    % biogitax download

  Create NCBI GI <-> Taxonomy conversion tables.

    % biogitax create

  You need to run these steps only once (or when you want to update the DBs).

Environmental variable:

  By default, the files will be stored under the "./gitax" directory. You can
  specify the data directory by the environmental variable "BIO_GITAX_DB".
  This is useful to avoid having multiple copies of the data files and to
  setup one system wide installation of the database.

    # for B shell
    % export BIO_GITAX_DB="/usr/local/bio/gitax"

    # for C shell
    % setenv BIO_GITAX_DB "/usr/local/bio/gitax"

Filtering procedure:

  You can filter any files containing one NCBI GI number in each line.
  The numbers matching /^\d+/ or /gi\|(\d+)/i or /gi:(\d+)/i in each line
  will be considered as a GI number and converted into the corresponding
  NCBI Taxonomy ID (taxid). Then, the taxid is examined whether it belongs
  to the given list of clades (also indicated by the taxids).

  The taxids will be checked in the given order and the last match will be
  used. So, if you want to classify Arthropoda (6656) into Insecta (50557),
  Diplura (29997) and other Arthropoda, you can sort the taxids from broader
  clade to narrower clade such as:

    % biogitax file 6656 50557 29997 > file.filter 2> file.summary

  STDOUT:
    Each line will be prefixed with the given clade or
    "Other" (not in the given list of taxids of clades) or
    "Unknown" (no corresponding taxid was found in the database).

  STDERR:
    Print summary of numbers in each given clade.

Examples:

  Prepare the taxonomy databases

    % biogitax download
    % biogitax create

  Run filtering

    % biogitax file taxid1 taxid2 taxid3  > file.filter 2> file.summary

HELP
end

class GiTaxDB

  def initialize(gitax_dir)
    @dir = gitax_dir
    @tree = {}
    @path = []
  end

  def getfile(prefix, filename)
    FileUtils.mkdir_p(@dir)
    $stderr.puts "Downloading #{prefix}/#{filename} ..."
    open("#{prefix}/#{filename}") do |file|
      File.open("#{@dir}/#{filename}", "w") do |data|
        data.print file.read
      end
#      time = file.last_modified
#      File.utime(time, time, "#{@dir}/#{filename}")
    end
  end

  def untargz(filename)
    Dir.chdir(@dir) do
      system("tar zxf #{filename}")
    end
  end

  def ungzip(filename)
    Dir.chdir(@dir) do
      system("gunzip #{filename}")
    end
  end

  def download_ncbi_taxonomy_files
    prefix = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy"
    filename = "taxdump.tar.gz"
    getfile(prefix, filename)
    untargz(filename)
    filename = "gi_taxid_nucl.dmp.gz"
    getfile(prefix, filename)
    ungzip(filename)
    filename = "gi_taxid_prot.dmp.gz"
    getfile(prefix, filename)
    ungzip(filename)
    $stderr.puts "The files are stored under the #{@dir}"
  end

  def tree(db, node)
    @path << node
    db[node] = @path.join('/')
    if @tree[node]
      @tree[node].each do |child|
        tree(db, child)
      end
    end
    @path.pop
  end

  def create_kyoto_cabinet_indices
    # db_gi_taxid: key => gi, val => taxid (leaf)
    # db_tax_name: key => taxid (leaf), val => name
    # db_tax_path: key => taxid (leaf), val => [ taxid (path) ].join("/")

    db_gi_taxid = KyotoCabinet::DB.new
    db_tax_name = KyotoCabinet::DB.new
    db_tax_path = KyotoCabinet::DB.new

    param1 = "#bnum=100000000#msiz=512m"
    param2 = "#bnum=10000000#msiz=512m"
    omode = KyotoCabinet::DB::OWRITER|KyotoCabinet::DB::OCREATE

    FileUtils.rm_f("#{@dir}/db_gi_taxid.kch")
    db_gi_taxid.open("#{@dir}/db_gi_taxid.kch#{param1}", omode)
    FileUtils.rm_f("#{@dir}/db_tax_name.kch")
    db_tax_name.open("#{@dir}/db_tax_name.kch#{param2}", omode)
    FileUtils.rm_f("#{@dir}/db_tax_path.kch")
    db_tax_path.open("#{@dir}/db_tax_path.kch#{param2}", omode)

=begin
    # Takes so long time. Probably needs parameter tuning.
    $stderr.puts "Processing #{@dir}/gi_taxid_nucl.dmp file ..."
    File.open("#{@dir}/gi_taxid_nucl.dmp").each do |line|
      gi, taxid = line.strip.split("\t")
      db_gi_taxid[gi] = taxid
    end
=end

    # 40 min on MBA...
    $stderr.puts "Processing #{@dir}/gi_taxid_prot.dmp file ..."
    File.open("#{@dir}/gi_taxid_prot.dmp").each do |line|
      gi, taxid = line.strip.split("\t")
      db_gi_taxid[gi] = taxid
    end

    # < 1 min on MBA
    $stderr.puts "Processing #{@dir}/names.dmp file ..."
    File.open("#{@dir}/names.dmp").each do |line|
      taxid, _, name, = line.strip.split("\t")
      unless db_tax_name[taxid]
        db_tax_name[taxid] = name
      end
    end

    # < 1 min on MBA
    $stderr.puts "Processing #{@dir}/nodes.dmp file ..."
    File.open("#{@dir}/nodes.dmp").each do |line|
      taxid, _, ptaxid, = line.split("\t")
      @tree[ptaxid] ||= Array.new
      @tree[ptaxid] << taxid unless taxid == ptaxid
    end

    tree(db_tax_path, "1")  # traverse from the root (all) node

    db_gi_taxid.close
    db_tax_name.close
    db_tax_path.close

    $stderr.puts "Index files are created under the #{@dir}"
  end

  def open_kyoto_cabinet_indices
    @db_gi_taxid = KyotoCabinet::DB.new
    @db_tax_name = KyotoCabinet::DB.new
    @db_tax_path = KyotoCabinet::DB.new

    omode = KyotoCabinet::DB::OREADER

    @db_gi_taxid.open("#{@dir}/db_gi_taxid.kch", omode)
    @db_tax_name.open("#{@dir}/db_tax_name.kch", omode)
    @db_tax_path.open("#{@dir}/db_tax_path.kch", omode)
  end

  def filter_file_with_taxid_list(file, list)
    open_kyoto_cabinet_indices

    clade_count = Hash.new(0)
    clade_name = {}
    clade_regexp = {}
    list.each do |clade|
      clade_name[clade] = @db_tax_name[clade]
      clade_regexp[clade] = %r[\/#{clade}(\/|$)]
    end
    File.open(file).each do |line|
      case line
      when /^\d+/
        gi = line[/^\d+/]
      when /gi\|\d+/i
        gi = line[/gi\|(\d+)/, 1]
      when /gi:\d+/i
        gi = line[/gi:(\d+)/, 1]
      else
        puts line
        next
      end
      if taxid = @db_gi_taxid[gi]
        name = @db_tax_name[taxid]
        path = @db_tax_path[taxid]
        match = false
        list.each do |clade|
          if path and path.match(clade_regexp[clade])
            # this overrides previous match, so list should sorted broader to narrower
            match = clade
          end
        end
        if match
          clade_count[match] += 1
          print "#{clade_name[match]} - #{name} (#{match})\t"
        else
          clade_count[:Other] += 1
          print "Other - #{name} (#{taxid})\t"
        end
      else
        clade_count[:Unknown] += 1
        print "Unknown\t"
      end
      puts line
    end
    $stderr.print "# "
    list.each do |clade|
      $stderr.print "#{clade_name[clade]} (#{clade})\t"
    end
    $stderr.puts "Other\tUnknown"
    list.each do |clade|
      $stderr.print "#{clade_count[clade]}\t"
    end
    $stderr.puts "#{clade_count[:Other]}\t#{clade_count[:Unknown]}"
  end
end


gitax_dir = ENV["BIO_GITAX_DB"] || "./gitax"
command = ARGV.shift
gitaxdb = GiTaxDB.new(gitax_dir)

case command
when "download"
  gitaxdb.download_ncbi_taxonomy_files
when "create"
  gitaxdb.create_kyoto_cabinet_indices
when "help"
  help
else
  if file = command
    list = ARGV.dup
    gitaxdb.filter_file_with_taxid_list(file, list)
  else
    help
  end
end


=begin
% head -100000 gi_taxid_prot.dmp > test100000.txt
% time ruby-1.9 bin/biogitax test100000.txt 6656 50557 29997 > test100000.txt.filter 2> test100000.txt.summary
% ruby-1.9 bin/biogitax test100000.txt 6656 50557 29997 > test100000.txt.filter  3.00s user 2.31s system 31% cpu 17.026 total

Archea (2157)
Bacteria (2)
Eukaryota (2759)
Metazoa (33208)
Deuterostomia (33511)
Protostomia (33317)
Ecdysozoa (1206794)
Arthropoda (6656)
Nematoda (6231)
Onychophora (27563)
Tardigrada (42241)

% biogitax file 2157 2 2759 33208 33511 33317 1206794 6656 6231 27563 42241
=end
	#!/usr/bin/env ruby-1.9

	ENV["BIO_GITAX_DB"] = "#{ENV['HOME']}/ncbi/taxonomy/gitax"

	require "rubygems"
	require "open-uri"
	require "fileutils"

	begin
	require "kyotocabinet"
	rescue LoadError
	puts <<INSTALL_KYOTO_CABINET

	Please install Kyoto Cabinet available at http://fallabs.com/kyotocabinet/
	before using this program.

	1. Install the C library from http://fallabs.com/kyotocabinet/pkg/
	by running "./configure; make; make install"

	2. Then, install the Ruby library from http://fallabs.com/kyotocabinet/rubypkg/
	by running "ruby extconf.rb; make; make install"

	INSTALL_KYOTO_CABINET

	exit 1
	end

	def help
	puts <<HELP

	Synopsis:

	Classify lines in the given file by the given taxids and summarize
	the taxonomic distribution. Useful for clastering the BLAST results
	into the givein clades etc.

	% biogitax file taxid1 [ taxid2 taxid3 ... ] > file.filter 2> file.summary

	Prerequirement:

	You need to install Kyoto Cabinet (http://fallabs.com/kyotocabinet/) before
	using this program.

	Database preparation:

	Download taxonomy data from NCBI.

	% biogitax download

	Create NCBI GI <-> Taxonomy conversion tables.

	% biogitax create

	You need to run these steps only once (or when you want to update the DBs).

	Environmental variable:

	By default, the files will be stored under the "./gitax" directory. You can
	specify the data directory by the environmental variable "BIO_GITAX_DB".
	This is useful to avoid having multiple copies of the data files and to
	setup one system wide installation of the database.

	# for B shell
	% export BIO_GITAX_DB="/usr/local/bio/gitax"

	# for C shell
	% setenv BIO_GITAX_DB "/usr/local/bio/gitax"

	Filtering procedure:

	You can filter any files containing one NCBI GI number in each line.
	The numbers matching /^\d+/ or /gi\\|(\d+)/i or /gi:(\d+)/i in each line
	will be considered as a GI number and converted into the corresponding
	NCBI Taxonomy ID (taxid). Then, the taxid is examined whether it belongs
	to the given list of clades (also indicated by the taxids).

	The taxids will be checked in the given order and the last match will be
	used. So, if you want to classify Arthropoda (6656) into Insecta (50557),
	Diplura (29997) and other Arthropoda, you can sort the taxids from broader
	clade to narrower clade such as:

	% biogitax file 6656 50557 29997 > file.filter 2> file.summary

	STDOUT:
	Each line will be prefixed with the given clade or
	"Other" (not in the given list of taxids of clades) or
	"Unknown" (no corresponding taxid was found in the database).

	STDERR:
	Print summary of numbers in each given clade.

	Examples:

	Prepare the taxonomy databases

	% biogitax download
	% biogitax create

	Run filtering

	% biogitax file taxid1 taxid2 taxid3 > file.filter 2> file.summary

	HELP
	end

	class GiTaxDB

	def initialize(gitax_dir)
	@dir = gitax_dir
	@tree = {}
	@path = []
	end

	def getfile(prefix, filename)
	FileUtils.mkdir_p(@dir)
	$stderr.puts "Downloading #{prefix}/#{filename} ..."
	open("#{prefix}/#{filename}") do \|file\|
	File.open("#{@dir}/#{filename}", "w") do \|data\|
	data.print file.read
	end
	# time = file.last_modified
	# File.utime(time, time, "#{@dir}/#{filename}")
	end
	end

	def untargz(filename)
	Dir.chdir(@dir) do
	system("tar zxf #{filename}")
	end
	end

	def ungzip(filename)
	Dir.chdir(@dir) do
	system("gunzip #{filename}")
	end
	end

	def download_ncbi_taxonomy_files
	prefix = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy"
	filename = "taxdump.tar.gz"
	getfile(prefix, filename)
	untargz(filename)
	filename = "gi_taxid_nucl.dmp.gz"
	getfile(prefix, filename)
	ungzip(filename)
	filename = "gi_taxid_prot.dmp.gz"
	getfile(prefix, filename)
	ungzip(filename)
	$stderr.puts "The files are stored under the #{@dir}"
	end

	def tree(db, node)
	@path << node
	db[node] = @path.join('/')
	if @tree[node]
	@tree[node].each do \|child\|
	tree(db, child)
	end
	end
	@path.pop
	end

	def create_kyoto_cabinet_indices
	# db_gi_taxid: key => gi, val => taxid (leaf)
	# db_tax_name: key => taxid (leaf), val => name
	# db_tax_path: key => taxid (leaf), val => [ taxid (path) ].join("/")

	db_gi_taxid = KyotoCabinet::DB.new
	db_tax_name = KyotoCabinet::DB.new
	db_tax_path = KyotoCabinet::DB.new

	param1 = "#bnum=100000000#msiz=512m"
	param2 = "#bnum=10000000#msiz=512m"
	omode = KyotoCabinet::DB::OWRITER\|KyotoCabinet::DB::OCREATE

	FileUtils.rm_f("#{@dir}/db_gi_taxid.kch")
	db_gi_taxid.open("#{@dir}/db_gi_taxid.kch#{param1}", omode)
	FileUtils.rm_f("#{@dir}/db_tax_name.kch")
	db_tax_name.open("#{@dir}/db_tax_name.kch#{param2}", omode)
	FileUtils.rm_f("#{@dir}/db_tax_path.kch")
	db_tax_path.open("#{@dir}/db_tax_path.kch#{param2}", omode)

	=begin
	# Takes so long time. Probably needs parameter tuning.
	$stderr.puts "Processing #{@dir}/gi_taxid_nucl.dmp file ..."
	File.open("#{@dir}/gi_taxid_nucl.dmp").each do \|line\|
	gi, taxid = line.strip.split("\t")
	db_gi_taxid[gi] = taxid
	end
	=end

	# 40 min on MBA...
	$stderr.puts "Processing #{@dir}/gi_taxid_prot.dmp file ..."
	File.open("#{@dir}/gi_taxid_prot.dmp").each do \|line\|
	gi, taxid = line.strip.split("\t")
	db_gi_taxid[gi] = taxid
	end

	# < 1 min on MBA
	$stderr.puts "Processing #{@dir}/names.dmp file ..."
	File.open("#{@dir}/names.dmp").each do \|line\|
	taxid, _, name, = line.strip.split("\t")
	unless db_tax_name[taxid]
	db_tax_name[taxid] = name
	end
	end

	# < 1 min on MBA
	$stderr.puts "Processing #{@dir}/nodes.dmp file ..."
	File.open("#{@dir}/nodes.dmp").each do \|line\|
	taxid, _, ptaxid, = line.split("\t")
	@tree[ptaxid] \|\|= Array.new
	@tree[ptaxid] << taxid unless taxid == ptaxid
	end

	tree(db_tax_path, "1") # traverse from the root (all) node

	db_gi_taxid.close
	db_tax_name.close
	db_tax_path.close

	$stderr.puts "Index files are created under the #{@dir}"
	end

	def open_kyoto_cabinet_indices
	@db_gi_taxid = KyotoCabinet::DB.new
	@db_tax_name = KyotoCabinet::DB.new
	@db_tax_path = KyotoCabinet::DB.new

	omode = KyotoCabinet::DB::OREADER

	@db_gi_taxid.open("#{@dir}/db_gi_taxid.kch", omode)
	@db_tax_name.open("#{@dir}/db_tax_name.kch", omode)
	@db_tax_path.open("#{@dir}/db_tax_path.kch", omode)
	end

	def filter_file_with_taxid_list(file, list)
	open_kyoto_cabinet_indices

	clade_count = Hash.new(0)
	clade_name = {}
	clade_regexp = {}
	list.each do \|clade\|
	clade_name[clade] = @db_tax_name[clade]
	clade_regexp[clade] = %r[\/#{clade}(\/\|$)]
	end
	File.open(file).each do \|line\|
	case line
	when /^\d+/
	gi = line[/^\d+/]
	when /gi\\|\d+/i
	gi = line[/gi\\|(\d+)/, 1]
	when /gi:\d+/i
	gi = line[/gi:(\d+)/, 1]
	else
	puts line
	next
	end
	if taxid = @db_gi_taxid[gi]
	name = @db_tax_name[taxid]
	path = @db_tax_path[taxid]
	match = false
	list.each do \|clade\|
	if path and path.match(clade_regexp[clade])
	# this overrides previous match, so list should sorted broader to narrower
	match = clade
	end
	end
	if match
	clade_count[match] += 1
	print "#{clade_name[match]} - #{name} (#{match})\t"
	else
	clade_count[:Other] += 1
	print "Other - #{name} (#{taxid})\t"
	end
	else
	clade_count[:Unknown] += 1
	print "Unknown\t"
	end
	puts line
	end
	$stderr.print "# "
	list.each do \|clade\|
	$stderr.print "#{clade_name[clade]} (#{clade})\t"
	end
	$stderr.puts "Other\tUnknown"
	list.each do \|clade\|
	$stderr.print "#{clade_count[clade]}\t"
	end
	$stderr.puts "#{clade_count[:Other]}\t#{clade_count[:Unknown]}"
	end
	end


	gitax_dir = ENV["BIO_GITAX_DB"] \|\| "./gitax"
	command = ARGV.shift
	gitaxdb = GiTaxDB.new(gitax_dir)

	case command
	when "download"
	gitaxdb.download_ncbi_taxonomy_files
	when "create"
	gitaxdb.create_kyoto_cabinet_indices
	when "help"
	help
	else
	if file = command
	list = ARGV.dup
	gitaxdb.filter_file_with_taxid_list(file, list)
	else
	help
	end
	end


	=begin
	% head -100000 gi_taxid_prot.dmp > test100000.txt
	% time ruby-1.9 bin/biogitax test100000.txt 6656 50557 29997 > test100000.txt.filter 2> test100000.txt.summary
	% ruby-1.9 bin/biogitax test100000.txt 6656 50557 29997 > test100000.txt.filter 3.00s user 2.31s system 31% cpu 17.026 total

	Archea (2157)
	Bacteria (2)
	Eukaryota (2759)
	Metazoa (33208)
	Deuterostomia (33511)
	Protostomia (33317)
	Ecdysozoa (1206794)
	Arthropoda (6656)
	Nematoda (6231)
	Onychophora (27563)
	Tardigrada (42241)

	% biogitax file 2157 2 2759 33208 33511 33317 1206794 6656 6231 27563 42241
	=end