Skip to content

Instantly share code, notes, and snippets.

@zcourts
Created April 15, 2013 01:18
Show Gist options
  • Save zcourts/d86d91b1148371910b96 to your computer and use it in GitHub Desktop.
Save zcourts/d86d91b1148371910b96 to your computer and use it in GitHub Desktop.
package info.crlog.bloomfilter;
/**
* <h1>Introduction</h1>
* A BitSet is a vector used by the {@link BloomFilter} to mark used locations.
* Since Java doesn't have a "bit" data type this class provides methods for manipulating
* the bits within a Java {@link Byte}. Java's bytes are signed which imposes a -127 + 128 limit.
* We work around this by controlling the MSB and interpreting it as just another bit in an 8-bit
* structure.
* <p/>
* Basic required knowledge:
* Base 2 increments...
* 1,2,4,8,16,32,64,128
* Base 2 conversion table for a byte
* <p/>
* <p/>
* <h1>How it works</h1>
* <p/>
* A byte array is initialized with all values set to 0.
* A Java Array is limited to {@link Integer#MAX_VALUE}, which works out to roughly 2GB. ordinarily this imposes
* a limit of around that size, to get around this however, this implementation splits into "buckets".
* <p/>
* A bucket is by default {@link Integer#MAX_VALUE}/6 ~= 256MB (refer to as {@link #BUCKET_SIZE}) and the number of buckets
* {@link #TOTAL_BUCKETS} itself is also limited to this max value and by default also set to {@link Integer#MAX_VALUE}/6.
* Effectively forming the basis of a circular buffer (except there's no wrapping).
* <p/>
* When a hash function generates an integer, this integer is the index at which a bit should be set to
* 1. To workout where this falls we need to calculate which bucket the index is in. We divide the index
* by {@link #TOTAL_BUCKETS} * 8 (total bits in a byte), the result is the bucket the index belongs to.
* <p/>
* We then need the location of the byte of interest in the bucket. For further documentation see {@link #set(long, byte)}
* <pre>
* 128,64,32,16,8,4,2,1
* 1 1 1 1 1 1 1 1
* --------------------
* Sum() = 255 unsigned
* Sum() = -128 | 127 signed
*
* Sample code which shows the gist of what this class does...for my own sanity and yours :-)
* {@code
* public class Example {
* public static void main(String... args) {
* byte val = 0;
* System.out.println(val);
* //set to 10
* val |= 1 << 3;
* val |= 1 << 1;
* System.out.println(val);
* if ((val & (1 << 3)) != 0)
* System.out.println("Bit 3 set");
* if ((val & (1 << 1)) != 0)
* System.out.println("Bit 1 set");
* //unset back to 0
* val &= ~(1 << 3);
* val &= ~(1 << 1);
* System.out.println(val);
* if ((val & (1 << 3)) == 0)
* System.out.println("Bit 3 NOT set");
* if ((val & (1 << 1)) == 0)
* System.out.println("Bit 1 NOT set");
* }
* }
* }
* </pre>
*
* @author Courtney Robinson <courtney@crlog.info>
*/
public class BitSet {
public static final int TOTAL_BITS_IN_A_BYTE = 8;
public final int TOTAL_BUCKETS;
public final int BUCKET_SIZE;
public final int TOTAL_BITS_IN_A_BUCKET;
public final int SIZE;
protected final byte[][] data;
public BitSet() {
this(Integer.MAX_VALUE / 6, Integer.MAX_VALUE / 6);
}
/**
* given the total buckets and the size of each bucket, compose a bit set capable of inserting up to
* total buckets * bucket size
*
* @param totalBuckets number of buckets (rows)
* @param bucketSize bucket size (columns)
*/
public BitSet(final int totalBuckets, final int bucketSize) {
this.TOTAL_BUCKETS = totalBuckets;
this.BUCKET_SIZE = bucketSize;
this.TOTAL_BITS_IN_A_BUCKET = bucketSize * TOTAL_BITS_IN_A_BYTE;
SIZE = totalBuckets * TOTAL_BITS_IN_A_BUCKET;
data = new byte[totalBuckets][bucketSize];
}
/**
* compose a bit set with bucket size = 10
*
* @param totalBits max number of bits this holds
*/
public BitSet(long totalBits) {
this(totalBits, 10);
}
/**
* Compose a bit set of (totalbits/buckets,buckets)
*
* @param totalBits max number of bits this holds
* @param buckets number of buckets to split those bits accross
*/
public BitSet(long totalBits, int buckets) {
this((int) totalBits / buckets, buckets);
}
/**
* Sets the bit at the given index to 1
*
* @param index the index to set to 1
* @return true if the index is within the range of this instance. If the index is
* greater than or equal to {@link #TOTAL_BUCKETS} * {@link #BUCKET_SIZE} it returns false
*/
public boolean set(long index) {
return set(index, (byte) 1);
}
/**
* Sets the bit at the given index to 0
*
* @param index the index to set to 0
* @return true if the index is within the range of this instance. If the index is
* greater than or equal to {@link #TOTAL_BUCKETS} * {@link #BUCKET_SIZE} it returns false
*/
public boolean unset(long index) {
return set(index, (byte) 0);
}
/**
* Given an index, x, find the bucket (bt) it belongs to.
* <p/>
* Find the byte (b) it belongs to in the said bucket
* <p/>
* Find the bit (bi) within the byte that the index matches. A bit being the smallest unit of concern
* <p/>
* There are n buckets of sizes m with a byte having constant a size of 8 (o).
* <p/>
* To identify the bucket calculate how many bits are in each bucket then divide the index by the results.
* \(bt= x/ m * o)
* <p/>
* To identify which byte in the bucket we need take the total bits per bucket and divide by the index.
* <p/>
* This time we need the remainder which tells us the the byte IN BITS. To get the byte, convert this result from
* bits to bytes by dividing by 8 (o).
* <p/>
* Similarly the bit of interest within the byte just found is the remainder of
* the previous calculation times 8 (previous result was converted to bytes but this time we need it in bits), minus 1.
* (0 based index).
* <p/>
* Which can also be obtained by taking the remainder of the index divided by 8 (o) total bits in a byte, minus 1
* (0 based index).
* <p/>
* Sets the bit at the given index to 1 or 0
*
* @param value the value to set the bit to must be 1 or 0 anything else and this returns false
* @param index the index to set to value
* @return true if the index is within the range of this instance. If the index is
* greater than or equal to {@link #TOTAL_BUCKETS} * {@link #BUCKET_SIZE} it returns false
*/
public boolean set(long index, byte value) {
if (isInRange(index) && (value == 1 || value == 0)) {
//find out which bucket this index belongs to
//guaranteed to be less than max val of int so safe to cast
//find the bucket that contains the byte of interest
int bucket = (int) index / TOTAL_BITS_IN_A_BUCKET;
int indexOverBits = (int) (index % TOTAL_BITS_IN_A_BUCKET);
//find the byte (or the index thereof) within the bucket
int bucketIndex = indexOverBits / TOTAL_BITS_IN_A_BYTE;
//Now find the bit within the byte, 0 based indexes in a byte so take 1 off
byte bitIndex = (byte) ((indexOverBits % TOTAL_BITS_IN_A_BYTE) - 1);// or ((index % TOTAL_BITS_IN_A_BYTE) - 1);
if (value == 1) {
//set the bit at the given bitIndex to 1
data[bucket][bucketIndex] |= 1 << bitIndex;
} else {
//set the bit at the given bitIndex to 0
data[bucket][bucketIndex] &= ~(1 << bitIndex);
}
return true;
}
return false;
}
/**
* See the documentation for {@link #set(long, byte)} for the logic.
*
* @param index the index to look up
* @return Gets the bit at the given index. If it is 1 return true, otherwise false
*/
public boolean isSet(long index) {
if (isInRange(index)) {
int bucket = (int) index / TOTAL_BITS_IN_A_BUCKET;
int indexOverBits = (int) (index % TOTAL_BITS_IN_A_BUCKET);
int bucketIndex = indexOverBits / TOTAL_BITS_IN_A_BYTE;
byte bitIndex = (byte) ((indexOverBits % TOTAL_BITS_IN_A_BYTE) - 1);
return (data[bucket][bucketIndex] & (1 << bitIndex)) != 0;
}
return false;
}
/**
* Checks whether a given index/hash is less than the size this instance allows.
*
* @param index the index to check
* @return true if index is less than size
*/
public boolean isInRange(final long index) {
return index < size();
}
/**
* @return The maximum hash/index this {@link BitSet} can hold
*/
public int size() {
return SIZE;
}
/**
* WARNING: This is provided purely for testing. Modifying the returned data at your own risk.
*
* @return the 2 dimensional byte array whose bits represents the indexes/hashes used in this {@link BitSet}
*/
public byte[][] data() {
return data;
}
@Override
public String toString() {
return "BitSet{" +
"SIZE=" + SIZE +
", TOTAL_BITS_IN_A_BUCKET=" + TOTAL_BITS_IN_A_BUCKET +
", BUCKET_SIZE=" + BUCKET_SIZE +
", TOTAL_BUCKETS=" + TOTAL_BUCKETS +
'}';
}
@Override
public boolean equals(final Object o) {
if (this == o) return true;
if (!(o instanceof BitSet)) return false;
final BitSet set = (BitSet) o;
if (BUCKET_SIZE != set.BUCKET_SIZE) return false;
if (SIZE != set.SIZE) return false;
if (TOTAL_BITS_IN_A_BUCKET != set.TOTAL_BITS_IN_A_BUCKET) return false;
if (TOTAL_BUCKETS != set.TOTAL_BUCKETS) return false;
return true;
}
@Override
public int hashCode() {
int result = TOTAL_BUCKETS;
result = 31 * result + BUCKET_SIZE;
result = 31 * result + TOTAL_BITS_IN_A_BUCKET;
result = 31 * result + SIZE;
return result;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment