MrJoy/bwt.rb

## bwt.rb
#!/usr/bin/env ruby
require 'rubygems'
require 'bundler/setup'
Bundler.setup(:default, :development)

###############################################################################
# Burrows-Wheeler Transform
###############################################################################
class BurrowsWheelerTransform
  BLOCK_SIZE = 256
  OVERHEAD = 1 # Don't forget to make this be the number of bytes in an
               # unsigned numeric type adequate to express a number in the
               # range specified above (I.E. 0..(BLOCK_SIZE-1)).

  #############################################################################
  # Forward Transform
  #############################################################################
  def transform(corpus)
    # Step 1: Permutations.
    rotations = []
    l = corpus.length
    l.times do |idx|
      rotations << [idx, corpus]
      corpus = corpus.rotate(-1)
    end

    # Step 2: Lexicographical sorting.
    rotations = rotations.sort { |a,b| a[1] <=> b[1] }

    # Step 3: Make a string from the last character from each string, and
    #         note the index that has permutation zero.
    l = rotations.map { |rot| rot[1][-1] }
    i = rotations.map(&:first).find_index(0)

    return i, l
  end

  #############################################################################
  # Reverse Transform
  #############################################################################
  def untransform(i, l)
    # Step 1: Build a table counting # of preceding symbols matching symbol in
    #         current position.
    table1 = []
    l.each_with_index do |symbol, idx|
      table1 << [
        symbol,
        (l[0..idx].select { |symbol2| symbol2 == symbol }.count - 1)
      ]
    end

    # Step 2: For each unique symbol in L, the number of symbols that are
    #         lexicographically less than that symbol.
    table2_raw = l.uniq.sort.map do |symbol|
      [symbol, l.select { |symbol2| symbol2 < symbol }.count]
    end
    table2 = Hash[table2_raw]

    # Step 3: Now, begin reconstructing things backwards...
    corpus = []
    corpus << l[i] # Last character of the string comes from rotation 0
                   # (S[0]), I.E. where we hadn't rotated anything yet.
                   # I.E. we just found C[n-1]...

    # Step 4: Find C[n-2], which is the contribution from S[n-1], but where in
    #         L is that?!  It just so happens that S[n-1] is the string that
    #         starts with C[n-1] and we just found C[n-1]. So we know how
    #         S[n-1] starts, which can help us find out where it is in the sort
    #         order.
    #
    #         The contributions from strings starting with symbols less than
    #         C[n-1] will occur earlier in L than the contribution from S[n-1].
    #         L may also contain contributions from strings starting with
    #         symbols equal to C[n-1]. Some of these strings occur before
    #         S[n-1] and others after.
    #
    #         From table1, we know the number of symbols identical to C[n-1]
    #         that occur in L before C[n-1]. So we also know the number of
    #         strings starting with identical symbols that made contributions
    #         before S[n-1].
    #
    #         From table2, we know how many symbols there are that are less
    #         than C[n-1]. So we also know the number of strings starting with
    #         lesser symbols that made contributions before S[n-1].
    #
    #         No other strings may contribute to L before S[n-1], therefore the
    #         contribution from S[n-1] occurs in the position obtained by
    #         summing the values from table1 and table2. As stated earlier the
    #         contribution from S[n-1] is C[n-2].
    #
    #         Use the same technique to find C[n-3], C[n-4], ..., C[0].
    aa = i
    countdown = l.length - 1
    while(countdown > 0)
      a1 = table1[aa][-1]
      a2 = table2[corpus[-1]]
      aa = a1 + a2
      corpus << table1[aa][0]
      countdown -= 1
    end

    return corpus.reverse
  end
end

## bwt_decode.rb
#!/usr/bin/env ruby
require './bwt'

block_size = BurrowsWheelerTransform::BLOCK_SIZE
overhead_amount = BurrowsWheelerTransform::OVERHEAD
overhead_token = case overhead_amount
  when 1
    "C"
  when 2
    "s>"
  end
bwt = BurrowsWheelerTransform.new
File.open(ARGV[0], "rb") do |in_fh|
  File.open(ARGV[0].sub(/\.bwt$/, ''), "wb") do |out_fh|
    while(chunk = in_fh.read(block_size + overhead_amount))
      l = chunk.unpack("#{overhead_token}C*")
      i = l.shift
      out_fh.write(bwt.untransform(i, l).map(&:chr).join)
    end
  end
end

## bwt_encode.rb
#!/usr/bin/env ruby
require './bwt'

block_size = BurrowsWheelerTransform::BLOCK_SIZE
overhead = case BurrowsWheelerTransform::OVERHEAD
  when 1
    "C"
  when 2
    "s>"
  end
bwt = BurrowsWheelerTransform.new
File.open(ARGV[0], "rb") do |in_fh|
  File.open("#{ARGV[0]}.bwt", "wb") do |out_fh|
    while(chunk = in_fh.read(block_size))
      (i, l) = bwt.transform(chunk.bytes.to_a)
      out_fh.write([i, *l].pack("#{overhead}C*"))
    end
  end
end
	#!/usr/bin/env ruby
	require 'rubygems'
	require 'bundler/setup'
	Bundler.setup(:default, :development)

	###############################################################################
	# Burrows-Wheeler Transform
	###############################################################################
	class BurrowsWheelerTransform
	BLOCK_SIZE = 256
	OVERHEAD = 1 # Don't forget to make this be the number of bytes in an
	# unsigned numeric type adequate to express a number in the
	# range specified above (I.E. 0..(BLOCK_SIZE-1)).

	#############################################################################
	# Forward Transform
	#############################################################################
	def transform(corpus)
	# Step 1: Permutations.
	rotations = []
	l = corpus.length
	l.times do \|idx\|
	rotations << [idx, corpus]
	corpus = corpus.rotate(-1)
	end

	# Step 2: Lexicographical sorting.
	rotations = rotations.sort { \|a,b\| a[1] <=> b[1] }

	# Step 3: Make a string from the last character from each string, and
	# note the index that has permutation zero.
	l = rotations.map { \|rot\| rot[1][-1] }
	i = rotations.map(&:first).find_index(0)

	return i, l
	end

	#############################################################################
	# Reverse Transform
	#############################################################################
	def untransform(i, l)
	# Step 1: Build a table counting # of preceding symbols matching symbol in
	# current position.
	table1 = []
	l.each_with_index do \|symbol, idx\|
	table1 << [
	symbol,
	(l[0..idx].select { \|symbol2\| symbol2 == symbol }.count - 1)
	]
	end

	# Step 2: For each unique symbol in L, the number of symbols that are
	# lexicographically less than that symbol.
	table2_raw = l.uniq.sort.map do \|symbol\|
	[symbol, l.select { \|symbol2\| symbol2 < symbol }.count]
	end
	table2 = Hash[table2_raw]

	# Step 3: Now, begin reconstructing things backwards...
	corpus = []
	corpus << l[i] # Last character of the string comes from rotation 0
	# (S[0]), I.E. where we hadn't rotated anything yet.
	# I.E. we just found C[n-1]...

	# Step 4: Find C[n-2], which is the contribution from S[n-1], but where in
	# L is that?! It just so happens that S[n-1] is the string that
	# starts with C[n-1] and we just found C[n-1]. So we know how
	# S[n-1] starts, which can help us find out where it is in the sort
	# order.
	#
	# The contributions from strings starting with symbols less than
	# C[n-1] will occur earlier in L than the contribution from S[n-1].
	# L may also contain contributions from strings starting with
	# symbols equal to C[n-1]. Some of these strings occur before
	# S[n-1] and others after.
	#
	# From table1, we know the number of symbols identical to C[n-1]
	# that occur in L before C[n-1]. So we also know the number of
	# strings starting with identical symbols that made contributions
	# before S[n-1].
	#
	# From table2, we know how many symbols there are that are less
	# than C[n-1]. So we also know the number of strings starting with
	# lesser symbols that made contributions before S[n-1].
	#
	# No other strings may contribute to L before S[n-1], therefore the
	# contribution from S[n-1] occurs in the position obtained by
	# summing the values from table1 and table2. As stated earlier the
	# contribution from S[n-1] is C[n-2].
	#
	# Use the same technique to find C[n-3], C[n-4], ..., C[0].
	aa = i
	countdown = l.length - 1
	while(countdown > 0)
	a1 = table1[aa][-1]
	a2 = table2[corpus[-1]]
	aa = a1 + a2
	corpus << table1[aa][0]
	countdown -= 1
	end

	return corpus.reverse
	end
	end
	#!/usr/bin/env ruby
	require './bwt'

	block_size = BurrowsWheelerTransform::BLOCK_SIZE
	overhead_amount = BurrowsWheelerTransform::OVERHEAD
	overhead_token = case overhead_amount
	when 1
	"C"
	when 2
	"s>"
	end
	bwt = BurrowsWheelerTransform.new
	File.open(ARGV[0], "rb") do \|in_fh\|
	File.open(ARGV[0].sub(/\.bwt$/, ''), "wb") do \|out_fh\|
	while(chunk = in_fh.read(block_size + overhead_amount))
	l = chunk.unpack("#{overhead_token}C*")
	i = l.shift
	out_fh.write(bwt.untransform(i, l).map(&:chr).join)
	end
	end
	end