Last active
October 5, 2015 07:58
-
-
Save rklemme/2775600 to your computer and use it in GitHub Desktop.
Basis for a bit set which uses String as storage. Storage is split up into multiple String instances to avoid too large allocations.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby | |
# Implementation of a bit set which stores membershio in binary | |
# Strings managed in chunks. | |
class BitSet | |
ZERO = "\000".encode(Encoding::BINARY).freeze | |
DEFAULT_SEGMENT_SIZE = 1024 * 1024 | |
# Create the BitSet instance. If max_num is given, we preallocate | |
# as much memory as needed. | |
# | |
# segment_size (optional): size of individual chunks, defaults to 1MB | |
# max_num (optional): highest number we want to store in the set | |
def initialize(segment_size = DEFAULT_SEGMENT_SIZE, max_num = nil) | |
@seg_size = segment_size | |
if max_num | |
# caller asks for preallocation | |
seg, off, idx = ix(max_num) | |
segment = nil | |
@dat = Array.new(seg + 1) do |si| | |
if si < seg | |
# leading segments | |
if segment | |
# all after first segment | |
segment.dup | |
else | |
# first segment: create and remember | |
segment = ZERO * @seg_size | |
end | |
else | |
# last segment, where max_num would be stored | |
ZERO * (off + 1) | |
end | |
end | |
else | |
# default case: all empty | |
@dat = [] | |
end | |
end | |
def [](index) | |
seg, off, idx = ix(index) | |
d = @dat[seg] and | |
off < d.bytesize && (d.getbyte(off) & (1 << idx)) != 0 | |
end | |
def []=(index, flag) | |
seg, off, idx = ix(index) | |
if flag | |
# set | |
d = dat(seg, off) | |
mask = 1 << idx | |
d.setbyte(off, d.getbyte(off) | mask) | |
else | |
# reset | |
d = @dat[seg] | |
if d && d.bytesize > off | |
mask = 1 << idx | |
d.setbyte(off, (d.getbyte(off) | mask) ^ mask) | |
end | |
end | |
end | |
def clear | |
# TODO: loosing preallocation here | |
@dat.clear | |
self | |
end | |
def empty? | |
@dat.all? {|seg| seg.nil? || /[^\000]/ !~ seg} | |
end | |
include Enumerable | |
def each | |
return enum_for(:each) unless block_given? | |
@dat.each_with_index do |dat, seg| | |
dat and dat.bytesize.times do |off| | |
s = dat.getbyte(off) | |
if s != 0 | |
base = (seg * @seg_size + off) * 8 | |
8.times do |idx| | |
yield base + idx if (s & (1 << idx)) != 0 | |
end | |
end | |
end | |
end | |
self | |
end | |
private | |
def dat(segment, offset) | |
d = @dat[segment] | |
offset += 1 | |
if d | |
add = offset - d.bytesize | |
d << (ZERO * add) if add > 0 | |
d | |
else | |
@dat[segment] = ZERO * offset | |
end | |
end | |
def ix(idx) | |
raise "Negative index: #{idx.inspect}" if idx < 0 | |
len, midx = idx.divmod 8 | |
len.divmod(@seg_size) << midx | |
end | |
end | |
if $0 == __FILE__ | |
require 'test/unit' | |
class BitSetTest < Test::Unit::TestCase | |
CHUNK = 1024 | |
def setup | |
@bs = BitSet.new CHUNK | |
end | |
def test_init_empty | |
assert @bs.empty?, 'created is not empty' | |
refute @bs[0], 'created not empty' | |
assert_pattern 0, 0, 'created non empty' | |
seen = false | |
@bs.each {|*a| seen = true} | |
assert !seen, 'iteration on empty shows values' | |
end | |
def test_set | |
assert_pattern 0, 0 | |
@bs[0] = 1 | |
assert_pattern 0, 1, 'one bit set' | |
@bs[7] = 1 | |
assert_pattern 0, 1 | 1 << 7, 'two bits set' | |
@bs[8] = 1 | |
assert_pattern 1, 1, 'one bit set' | |
@bs[9] = true | |
assert_pattern 1, 3, 'two bits set' | |
end | |
def test_unset | |
assert_pattern 0, 0 | |
refute @bs[0] | |
@bs[0] = true | |
assert @bs[0] | |
@bs[0] = false | |
refute @bs[0] | |
@bs[0] = true | |
assert @bs[0] | |
@bs[0] = nil | |
refute @bs[0] | |
@bs[88] = true | |
assert @bs[88] | |
@bs[88] = false | |
refute @bs[88] | |
end | |
def test_iter | |
r = Random.new 0 | |
idx = Array.new(20) { r.rand(CHUNK / 2).tap {|i| @bs[i] = true} } | |
idx.each {|i| assert @bs[i], "iter failed at #{i}" } | |
idx2 = idx.dup | |
@bs.each {|i| assert idx2.delete(i), "visited wrong item #{i}"} | |
assert_empty idx2, 'not all visited' | |
end | |
def test_chunk | |
idx = CHUNK * 8 | |
refute @bs[idx] | |
@bs[idx] = true | |
assert @bs[idx] | |
idx += 83 | |
refute @bs[idx] | |
@bs[idx] = true | |
assert @bs[idx] | |
end | |
def test_large | |
r = Random.new 0 | |
idx = Array.new(20) { r.rand(CHUNK * 32).tap {|i| @bs[i] = true} } | |
idx.each {|i| assert @bs[i], "iter failed at #{i}" } | |
end | |
def test_prealloc | |
@bs = BitSet.new CHUNK, CHUNK * 3 + 47 | |
assert @bs.empty?, 'created is not empty' | |
test_init_empty | |
end | |
def assert_pattern offset, value, msg = nil | |
8.times do |i| | |
send(value[i] == 1 ? :assert : :refute, @bs[(offset << 3) + i], msg) | |
end | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment