delagoya/two_bit_format_parsing_example.rb

## two_bit_format_parsing_example.rb
require 'rubygems'
require "bindata"

# A .2bit file stores multiple DNA sequences (up to 4 Gb total) in a compact
# randomly-accessible format. The file contains masking information as well as
# the DNA itself.
#
# The file begins with a 16-byte header containing the following fields:
#
# signature - the number 0x1A412743 in the architecture of the machine that
# created the file version - zero for now. Readers should abort if they see a
# version number higher than 0. sequenceCount - the number of sequences in the
# file. reserved - always zero for now All fields are 32 bits unless noted. If
# the signature value is not as given, the reader program should byte-swap the
# signature and check if the swapped version matches. If so, all multiple-byte
# entities in the file will have to be byte-swapped. This enables these binary
files to be used unchanged on different architectures.
class TwoBitHeader < BinData::Record
	endian :little
	uint32 :sig
	uint32 :ver
	uint32 :num
	uint32 :reserved_flag
end

# The header is followed by a file index, which contains one entry for each
# sequence. Each index entry contains three fields:
#
# nameSize - a byte containing the length of the name field name - the sequence
# name itself, of variable length depending on nameSize offset - the 32-bit
# offset of the sequence data relative to the start of the file
class TwoBitIndexEntry < BinData::Record
  endian :little
  int8 :len
  string :name, :read_length => :len
  uint32 :seq_offset
end

# The index is followed by the sequence records, which contain nine fields:
#
#   dnaSize - number of bases of DNA in the sequence
#   nBlockCount - the number of blocks of Ns in the file
#                 (representing unknown sequence)
#   nBlockStarts - the starting position for each block of Ns
#   nBlockSizes - the size of each block of Ns
#   maskBlockCount - the number of masked (lower-case) blocks
#   maskBlockStarts - the starting position for each masked block
#   maskBlockSizes - the size of each masked block
#   reserved - always zero for now
#   packedDna - the DNA packed to two bits per base, represented as so: T - 00,
#               C- 01, A - 10, G - 11. The first base is in the
#               most significant 2-bit byte; the last base is in the least
#               significant 2 bits. For example, the sequence TCAG is
#               represented as 00011011. The packedDna field is padded with 0
#               bits as necessary to take an even multiple of 32 bits
#               in the file, which improves I/O performance on some machines.

class TwoBitArray < BinData::Wrapper
  endian :little
  mandatory_parameter :len
  array :type => :uint32, :initial_length => :len
end

class TwoBitSequence < BinData::Wrapper
    endian :little
    mandatory_parameter :len
    array :type => :bit2, :initial_length => :len
end

class TwoBitSequenceEntry < BinData::Record
  endian :little
  uint32 :dna_size

  # N blocks
  uint32 :n_block_count
  two_bit_array :n_block_starts, :len => :n_block_count
  two_bit_array :n_block_sizes, :len => :n_block_count


  # masked blocks
  uint32 :m_block_count
  two_bit_array :m_block_starts, :len => :m_block_count
  two_bit_array :m_block_sizes, :len => :m_block_count

  uint32 :reserved
  # two_bit_sequence :sequence, :len => :dna_size
end

io = File.open(ARGV[0])

h = TwoBitHeader.read(io)
puts "HEADER : #{h.sig} | #{h.ver} | #{h.num} | #{h.reserved_flag }"

puts "So we have #{h.num} sequences here. Let's get the second sequence..."

first_entry = TwoBitIndexEntry.read(io)
puts "First sequence entry is #{first_entry.name} and is at file offset #{first_entry.seq_offset} ."
second_entry = TwoBitIndexEntry.read(io)
puts "Second sequence entry is #{second_entry.name} and is at file offset #{second_entry.seq_offset} ."


# Set the read position to the second entry's start position
io.pos = second_entry.seq_offset
# This is big, probably not a good way to do this...
seq_entry = TwoBitSequenceEntry.read(io)

puts "Second sequence is #{seq_entry.dna_size} bases."
puts "There are #{seq_entry.n_block_count} \"N\" blocks."
puts "The largest \"N\" block is #{seq_entry.n_block_sizes.max}"
puts "There are #{seq_entry.m_block_count} masked blocks."
puts "The largest masked block is #{seq_entry.m_block_sizes.max}"

puts "Reading first 10 bases. "
seq = TwoBitSequence.new(:len => 10)
seq.read(io)

puts "Sequence in 2bit format is: #{seq.map {|e| "%0.2d" % e.to_i.to_s(2)}.join " "}"

def convert_to_ascii(arr)
  bases = {0 => "T", 1 => "C", 2 => "A", 3 => "G" }
  arr.collect {|b| bases[b]}.join("")
end

puts "Converting that to ASCII is: " + convert_to_ascii(seq)
	require 'rubygems'
	require "bindata"

	# A .2bit file stores multiple DNA sequences (up to 4 Gb total) in a compact
	# randomly-accessible format. The file contains masking information as well as
	# the DNA itself.
	#
	# The file begins with a 16-byte header containing the following fields:
	#
	# signature - the number 0x1A412743 in the architecture of the machine that
	# created the file version - zero for now. Readers should abort if they see a
	# version number higher than 0. sequenceCount - the number of sequences in the
	# file. reserved - always zero for now All fields are 32 bits unless noted. If
	# the signature value is not as given, the reader program should byte-swap the
	# signature and check if the swapped version matches. If so, all multiple-byte
	# entities in the file will have to be byte-swapped. This enables these binary
	files to be used unchanged on different architectures.
	class TwoBitHeader < BinData::Record
	endian :little
	uint32 :sig
	uint32 :ver
	uint32 :num
	uint32 :reserved_flag
	end

	# The header is followed by a file index, which contains one entry for each
	# sequence. Each index entry contains three fields:
	#
	# nameSize - a byte containing the length of the name field name - the sequence
	# name itself, of variable length depending on nameSize offset - the 32-bit
	# offset of the sequence data relative to the start of the file
	class TwoBitIndexEntry < BinData::Record
	endian :little
	int8 :len
	string :name, :read_length => :len
	uint32 :seq_offset
	end

	# The index is followed by the sequence records, which contain nine fields:
	#
	# dnaSize - number of bases of DNA in the sequence
	# nBlockCount - the number of blocks of Ns in the file
	# (representing unknown sequence)
	# nBlockStarts - the starting position for each block of Ns
	# nBlockSizes - the size of each block of Ns
	# maskBlockCount - the number of masked (lower-case) blocks
	# maskBlockStarts - the starting position for each masked block
	# maskBlockSizes - the size of each masked block
	# reserved - always zero for now
	# packedDna - the DNA packed to two bits per base, represented as so: T - 00,
	# C- 01, A - 10, G - 11. The first base is in the
	# most significant 2-bit byte; the last base is in the least
	# significant 2 bits. For example, the sequence TCAG is
	# represented as 00011011. The packedDna field is padded with 0
	# bits as necessary to take an even multiple of 32 bits
	# in the file, which improves I/O performance on some machines.

	class TwoBitArray < BinData::Wrapper
	endian :little
	mandatory_parameter :len
	array :type => :uint32, :initial_length => :len
	end

	class TwoBitSequence < BinData::Wrapper
	endian :little
	mandatory_parameter :len
	array :type => :bit2, :initial_length => :len
	end

	class TwoBitSequenceEntry < BinData::Record
	endian :little
	uint32 :dna_size

	# N blocks
	uint32 :n_block_count
	two_bit_array :n_block_starts, :len => :n_block_count
	two_bit_array :n_block_sizes, :len => :n_block_count


	# masked blocks
	uint32 :m_block_count
	two_bit_array :m_block_starts, :len => :m_block_count
	two_bit_array :m_block_sizes, :len => :m_block_count

	uint32 :reserved
	# two_bit_sequence :sequence, :len => :dna_size
	end

	io = File.open(ARGV[0])

	h = TwoBitHeader.read(io)
	puts "HEADER : #{h.sig} \| #{h.ver} \| #{h.num} \| #{h.reserved_flag }"

	puts "So we have #{h.num} sequences here. Let's get the second sequence..."

	first_entry = TwoBitIndexEntry.read(io)
	puts "First sequence entry is #{first_entry.name} and is at file offset #{first_entry.seq_offset} ."
	second_entry = TwoBitIndexEntry.read(io)
	puts "Second sequence entry is #{second_entry.name} and is at file offset #{second_entry.seq_offset} ."


	# Set the read position to the second entry's start position
	io.pos = second_entry.seq_offset
	# This is big, probably not a good way to do this...
	seq_entry = TwoBitSequenceEntry.read(io)

	puts "Second sequence is #{seq_entry.dna_size} bases."
	puts "There are #{seq_entry.n_block_count} \"N\" blocks."
	puts "The largest \"N\" block is #{seq_entry.n_block_sizes.max}"
	puts "There are #{seq_entry.m_block_count} masked blocks."
	puts "The largest masked block is #{seq_entry.m_block_sizes.max}"

	puts "Reading first 10 bases. "
	seq = TwoBitSequence.new(:len => 10)
	seq.read(io)

	puts "Sequence in 2bit format is: #{seq.map {\|e\| "%0.2d" % e.to_i.to_s(2)}.join " "}"

	def convert_to_ascii(arr)
	bases = {0 => "T", 1 => "C", 2 => "A", 3 => "G" }
	arr.collect {\|b\| bases[b]}.join("")
	end

	puts "Converting that to ASCII is: " + convert_to_ascii(seq)