jots/pure_ruby_cdb.rb

## pure_ruby_cdb.rb
#!/usr/bin/env ruby

require "pp"

# Compute the 32-bit cdb hash of a string.
#
# Starts from 5381 and, for every byte of the input, replaces the running
# value r with ((r << 5) + r) ^ byte, masked to 32 bits.
#
# s - the String to hash. It is hashed byte by byte, so the result is the
#     same whatever encoding the string is tagged with.
#
# Returns the hash as an Integer in 0..0xffffffff.
def cdb_hash(s)
  r = 5381
  # & 0xffffffff forces to 32 bit. (lots of hair for that)
  s.each_byte { |b| r = (((r << 5) + r) ^ b) & 0xffffffff }
  r
end

CDBFILENAME = "pure_ruby_cdb.cdb"

# Build a constant database (cdb) file from a text file of key/value lines.
#
# Layout written here:
#   * 256 header slots of 8 bytes each: [hash table offset, number of cells]
#   * the records: [key length, value length] followed by key and value bytes
#   * one open-addressed hash table per non-empty bucket; every cell is
#     [hash, record offset], and a record offset of 0 marks an unused cell
# All integers are written as 32-bit little-endian values ("V"), so the
# output does not depend on the byte order of the machine running this.
#
# ofn  - path of the cdb file to create.
# infn - path of the input file, one "key<sep>value" record per line.
# sep  - separator between key and value. Only the first occurrence splits
#        the line, so the value may itself contain the separator.
#
# Lines that yield no key at all (e.g. blank lines) are skipped; a line with
# a key but no separator is stored with an empty value.
#
# Returns nil.
def writecdb(ofn=CDBFILENAME,infn="rubycdb.txt",sep="\t")
  File.open(ofn, "wb") do |f|
    # leave room for header pointers
    pos = 256 * 8
    f.seek(pos)
    buckets = Array.new(256) { [] }

    # open "rb" is very important for UTF8!!
    File.open(infn, "rb") do |input|
      input.each_line do |line|
        line.chomp!
        # limit 2: split on the first separator only, keep the rest as value
        k, v = line.split(sep, 2)
        next if k.nil?
        v ||= ""
        klen = k.bytesize
        vlen = v.bytesize
        f.write [klen, vlen].pack("VV")
        f.write k
        f.write v
        h = cdb_hash(k)
        buckets[h % 256] << [h, pos]
        pos += 8 + klen + vlen
      end
    end

    # create the hash data, which starts right after the last record
    buckets.each do |buck|
      next if buck.empty?
      # twice as many cells as entries, so the probe loop always finds a gap
      ncells = buck.length * 2
      cells = Array.new(ncells) { [0, 0] }
      buck.each do |h, recpos|
        i = (h >> 8) % ncells
        # in use... collision! step to the next cell, wrapping around
        i = (i + 1) % ncells while cells[i][1] != 0
        cells[i] = [h, recpos]
      end
      # now write them out.
      f.write cells.flatten.pack("V*")
    end

    # write the header: tables were written back to back in bucket order
    f.seek(0)
    hashpos = pos
    buckets.each do |buck|
      ncells = buck.length * 2
      f.write [hashpos, ncells].pack("VV")
      hashpos += ncells * 8
    end
  end
  nil
end


# Read-only key lookup in a cdb file.
#
# The 256 header slots are read once when the reader is created; every
# lookup then seeks inside the file, which stays open until #close is called.
class CDBReader
  # Presumably the positions of the two fields inside a header slot
  # ([table offset, number of cells]); nothing in this class refers to them.
  BPOS = 0
  NCELLS = 1

  # Open the cdb file and load its header.
  #
  # fn - path of the cdb file to read.
  def initialize(fn=CDBFILENAME)
    @f = File.open(fn,"rb")
    # keep pointers handy; "V" reads 32-bit little-endian integers
    @r = Array.new(256) { @f.read(8).unpack("VV") }
  end

  # Look up a key.
  #
  # k - the key String. It is compared and hashed as raw bytes, so a UTF-8
  #     key matches the same bytes stored in the file.
  #
  # Returns the stored value as a binary String, or "" when the key is not
  # present (which is indistinguishable from a stored empty value).
  def get(k)
    # Data read from the file is binary; a non-ASCII key tagged UTF-8 would
    # never compare equal to it, so work on a binary copy of the key.
    key = k.b
    h = cdb_hash(key)
    buck_pos, ncells = @r[h % 256]
    return "" if ncells == 0 # XXX no hit

    start = (h >> 8) % ncells
    ncells.times do |i|
      @f.seek(buck_pos + ((start + i) % ncells) * 8)
      h1, p1 = @f.read(8).unpack("VV")
      return "" if p1 == 0 # XXX no hit: an unused cell ends the probe chain
      next unless h1 == h

      # same hash: confirm against the key bytes stored in the record
      @f.seek(p1)
      klen, vlen = @f.read(8).unpack("VV")
      return @f.read(vlen) if @f.read(klen) == key
    end
    "" # XXX no hit
  end

  # Close the underlying file. Safe to call more than once.
  #
  # Returns nil.
  def close
    @f.close unless @f.closed?
    nil
  end

end

# Self-test, run only when this file is executed directly: build a cdb from
# the system word list with writecdb, build another with the external `cdb`
# command, and let `cmp` report any differing bytes.
if __FILE__ == $0
  # create test file: one "word<TAB>length" line per dictionary entry
  WORDS = "/usr/share/dict/words"
  # block forms close both files, even if something raises part-way
  File.open("words.txt", "wb") do |out|
    File.open(WORDS, "rb") do |dict|
      dict.each_line do |line|
        line.chomp!
        out.puts [line, line.length].join("\t")
      end
    end
  end

  # create cdb file.
  writecdb(CDBFILENAME, "words.txt")
  # create cdb file using native cdb..
  # (argument-list form of system: no shell is involved)
  system("cdb", "-m", "-c", "native_cdb.cdb", "words.txt")
  # compare output
  system("cmp", "-l", CDBFILENAME, "native_cdb.cdb")
end
	#!/usr/bin/env ruby

	require "pp"

	def cdb_hash(s)
	r = 5381
	# & 0xffffffff forces to 32 bit. (lots of hair for that)
	s.split("").each { |c| r = (((r << 5) + r) ^ c.ord) & 0xffffffff }
	return r
	end

	CDBFILENAME = "pure_ruby_cdb.cdb"

	def writecdb(ofn=CDBFILENAME,infn="rubycdb.txt",sep="\t")
	f = File.open(ofn,"wb")
	# leave room for header pointers
	p = 256*8
	f.seek(p)
	buckets = []; 256.times {|x| buckets[x] = [] }
	# open "rb" is very important for UTF8!!
	File.open(infn,"rb").each do |line|
	line.chomp!
	k,v = line.split(sep)
	f.write [klen=k.length,vlen=v.length].pack("LL")
	f.write k + v
	h = cdb_hash(k)
	buckets[h % 256] << [h,p]
	p += 8+klen+vlen
	end
	hashpos = p

	# create the hash data
	buckets.each do |buck|
	next if (blen = buck.length) == 0
	ncells = blen*2
	cells = []
	ncells.times {|x| cells[x] = [0,0] }
	buck.each do |h,p|
	i = (h >> 8) % ncells
	while cells[i][1] != 0 # in use... collision!
	i = (i+1) % ncells
	end
	cells[i] = [h,p]
	end
	# now write them out.
	cells.each do |c|
	f.write c.pack("LL")
	end
	end

	# write the header
	f.seek(0)
	buckets.each_with_index do |buck,x|
	f.write [hashpos,buck.length*2].pack("LL")
	hashpos += (buck.length*2)*8
	end
	f.close
	end


	class CDBReader
	BPOS = 0
	NCELLS = 1

	def initialize(fn=CDBFILENAME)
	@f = File.open(fn,"rb")
	@r = []
	# keep pointers handy
	256.times do |x|
	@r[x] = @f.read(8).unpack("LL")
	end
	end

	def get(k)
	h = cdb_hash(k)
	buck_pos,ncells = @r[h % 256]
	return "" if ncells == 0 # XXX no hit

	start = (h >> 8) % ncells
	ncells.times do |i|
	@f.seek(buck_pos + ((start+i) % ncells)*8)
	h1,p1 = @f.read(8).unpack("LL")
	return "" if p1 == 0 # XXX no hit
	if h1 == h
	@f.seek(p1)
	klen,vlen = @f.read(8).unpack("LL")
	k1 = @f.read(klen)
	return @f.read(vlen) if k1 == k
	end
	end
	return "" # XXX no hit
	end

	end

	if __FILE__ == $0
	# create test file
	WORDS = "/usr/share/dict/words"
	f = File.open("words.txt","wb")
	File.open(WORDS,"rb").each do |line|
	line.chomp!
	f.puts [line,line.length].join("\t")
	end
	f.close

	# create cdb file.
	writecdb(CDBFILENAME,"words.txt")
	# create cdb file using native cdb..
	system("cdb -m -c native_cdb.cdb words.txt")
	# compare output
	system("cmp -l #{CDBFILENAME} native_cdb.cdb")
	end