Created
April 15, 2013 01:18
-
-
Save zcourts/d86d91b1148371910b96 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package info.crlog.bloomfilter; | |
/** | |
* <h1>Introduction</h1> | |
* A BitSet is a vector used by the {@link BloomFilter} to mark used locations. | |
* Since Java doesn't have a "bit" data type this class provides methods for manipulating | |
* the bits within a Java {@link Byte}. Java's bytes are signed which imposes a -127 + 128 limit. | |
* We work around this by controlling the MSB and interpreting it as just another bit in an 8-bit | |
* structure. | |
* <p/> | |
* Basic required knowledge: | |
* Base 2 increments... | |
* 1,2,4,8,16,32,64,128 | |
* Base 2 conversion table for a byte | |
* <p/> | |
* <p/> | |
* <h1>How it works</h1> | |
* <p/> | |
* A byte array is initialized with all values set to 0. | |
* A Java Array is limited to {@link Integer#MAX_VALUE}, which works out to roughly 2GB. ordinarily this imposes | |
* a limit of around that size, to get around this however, this implementation splits into "buckets". | |
* <p/> | |
* A bucket is by default {@link Integer#MAX_VALUE}/6 ~= 256MB (refer to as {@link #BUCKET_SIZE}) and the number of buckets | |
* {@link #TOTAL_BUCKETS} itself is also limited to this max value and by default also set to {@link Integer#MAX_VALUE}/6. | |
* Effectively forming the basis of a circular buffer (except there's no wrapping). | |
* <p/> | |
* When a hash function generates an integer, this integer is the index at which a bit should be set to | |
* 1. To workout where this falls we need to calculate which bucket the index is in. We divide the index | |
* by {@link #TOTAL_BUCKETS} * 8 (total bits in a byte), the result is the bucket the index belongs to. | |
* <p/> | |
* We then need the location of the byte of interest in the bucket. For further documentation see {@link #set(long, byte)} | |
* <pre> | |
* 128,64,32,16,8,4,2,1 | |
* 1 1 1 1 1 1 1 1 | |
* -------------------- | |
* Sum() = 255 unsigned | |
* Sum() = -128 | 127 signed | |
* | |
* Sample code which shows the gist of what this class does...for my own sanity and yours :-) | |
* {@code | |
* public class Example { | |
* public static void main(String... args) { | |
* byte val = 0; | |
* System.out.println(val); | |
* //set to 10 | |
* val |= 1 << 3; | |
* val |= 1 << 1; | |
* System.out.println(val); | |
* if ((val & (1 << 3)) != 0) | |
* System.out.println("Bit 3 set"); | |
* if ((val & (1 << 1)) != 0) | |
* System.out.println("Bit 1 set"); | |
* //unset back to 0 | |
* val &= ~(1 << 3); | |
* val &= ~(1 << 1); | |
* System.out.println(val); | |
* if ((val & (1 << 3)) == 0) | |
* System.out.println("Bit 3 NOT set"); | |
* if ((val & (1 << 1)) == 0) | |
* System.out.println("Bit 1 NOT set"); | |
* } | |
* } | |
* } | |
* </pre> | |
* | |
* @author Courtney Robinson <courtney@crlog.info> | |
*/ | |
public class BitSet { | |
public static final int TOTAL_BITS_IN_A_BYTE = 8; | |
public final int TOTAL_BUCKETS; | |
public final int BUCKET_SIZE; | |
public final int TOTAL_BITS_IN_A_BUCKET; | |
public final int SIZE; | |
protected final byte[][] data; | |
public BitSet() { | |
this(Integer.MAX_VALUE / 6, Integer.MAX_VALUE / 6); | |
} | |
/** | |
* given the total buckets and the size of each bucket, compose a bit set capable of inserting up to | |
* total buckets * bucket size | |
* | |
* @param totalBuckets number of buckets (rows) | |
* @param bucketSize bucket size (columns) | |
*/ | |
public BitSet(final int totalBuckets, final int bucketSize) { | |
this.TOTAL_BUCKETS = totalBuckets; | |
this.BUCKET_SIZE = bucketSize; | |
this.TOTAL_BITS_IN_A_BUCKET = bucketSize * TOTAL_BITS_IN_A_BYTE; | |
SIZE = totalBuckets * TOTAL_BITS_IN_A_BUCKET; | |
data = new byte[totalBuckets][bucketSize]; | |
} | |
/** | |
* compose a bit set with bucket size = 10 | |
* | |
* @param totalBits max number of bits this holds | |
*/ | |
public BitSet(long totalBits) { | |
this(totalBits, 10); | |
} | |
/** | |
* Compose a bit set of (totalbits/buckets,buckets) | |
* | |
* @param totalBits max number of bits this holds | |
* @param buckets number of buckets to split those bits accross | |
*/ | |
public BitSet(long totalBits, int buckets) { | |
this((int) totalBits / buckets, buckets); | |
} | |
/** | |
* Sets the bit at the given index to 1 | |
* | |
* @param index the index to set to 1 | |
* @return true if the index is within the range of this instance. If the index is | |
* greater than or equal to {@link #TOTAL_BUCKETS} * {@link #BUCKET_SIZE} it returns false | |
*/ | |
public boolean set(long index) { | |
return set(index, (byte) 1); | |
} | |
/** | |
* Sets the bit at the given index to 0 | |
* | |
* @param index the index to set to 0 | |
* @return true if the index is within the range of this instance. If the index is | |
* greater than or equal to {@link #TOTAL_BUCKETS} * {@link #BUCKET_SIZE} it returns false | |
*/ | |
public boolean unset(long index) { | |
return set(index, (byte) 0); | |
} | |
/** | |
* Given an index, x, find the bucket (bt) it belongs to. | |
* <p/> | |
* Find the byte (b) it belongs to in the said bucket | |
* <p/> | |
* Find the bit (bi) within the byte that the index matches. A bit being the smallest unit of concern | |
* <p/> | |
* There are n buckets of sizes m with a byte having constant a size of 8 (o). | |
* <p/> | |
* To identify the bucket calculate how many bits are in each bucket then divide the index by the results. | |
* \(bt= x/ m * o) | |
* <p/> | |
* To identify which byte in the bucket we need take the total bits per bucket and divide by the index. | |
* <p/> | |
* This time we need the remainder which tells us the the byte IN BITS. To get the byte, convert this result from | |
* bits to bytes by dividing by 8 (o). | |
* <p/> | |
* Similarly the bit of interest within the byte just found is the remainder of | |
* the previous calculation times 8 (previous result was converted to bytes but this time we need it in bits), minus 1. | |
* (0 based index). | |
* <p/> | |
* Which can also be obtained by taking the remainder of the index divided by 8 (o) total bits in a byte, minus 1 | |
* (0 based index). | |
* <p/> | |
* Sets the bit at the given index to 1 or 0 | |
* | |
* @param value the value to set the bit to must be 1 or 0 anything else and this returns false | |
* @param index the index to set to value | |
* @return true if the index is within the range of this instance. If the index is | |
* greater than or equal to {@link #TOTAL_BUCKETS} * {@link #BUCKET_SIZE} it returns false | |
*/ | |
public boolean set(long index, byte value) { | |
if (isInRange(index) && (value == 1 || value == 0)) { | |
//find out which bucket this index belongs to | |
//guaranteed to be less than max val of int so safe to cast | |
//find the bucket that contains the byte of interest | |
int bucket = (int) index / TOTAL_BITS_IN_A_BUCKET; | |
int indexOverBits = (int) (index % TOTAL_BITS_IN_A_BUCKET); | |
//find the byte (or the index thereof) within the bucket | |
int bucketIndex = indexOverBits / TOTAL_BITS_IN_A_BYTE; | |
//Now find the bit within the byte, 0 based indexes in a byte so take 1 off | |
byte bitIndex = (byte) ((indexOverBits % TOTAL_BITS_IN_A_BYTE) - 1);// or ((index % TOTAL_BITS_IN_A_BYTE) - 1); | |
if (value == 1) { | |
//set the bit at the given bitIndex to 1 | |
data[bucket][bucketIndex] |= 1 << bitIndex; | |
} else { | |
//set the bit at the given bitIndex to 0 | |
data[bucket][bucketIndex] &= ~(1 << bitIndex); | |
} | |
return true; | |
} | |
return false; | |
} | |
/** | |
* See the documentation for {@link #set(long, byte)} for the logic. | |
* | |
* @param index the index to look up | |
* @return Gets the bit at the given index. If it is 1 return true, otherwise false | |
*/ | |
public boolean isSet(long index) { | |
if (isInRange(index)) { | |
int bucket = (int) index / TOTAL_BITS_IN_A_BUCKET; | |
int indexOverBits = (int) (index % TOTAL_BITS_IN_A_BUCKET); | |
int bucketIndex = indexOverBits / TOTAL_BITS_IN_A_BYTE; | |
byte bitIndex = (byte) ((indexOverBits % TOTAL_BITS_IN_A_BYTE) - 1); | |
return (data[bucket][bucketIndex] & (1 << bitIndex)) != 0; | |
} | |
return false; | |
} | |
/** | |
* Checks whether a given index/hash is less than the size this instance allows. | |
* | |
* @param index the index to check | |
* @return true if index is less than size | |
*/ | |
public boolean isInRange(final long index) { | |
return index < size(); | |
} | |
/** | |
* @return The maximum hash/index this {@link BitSet} can hold | |
*/ | |
public int size() { | |
return SIZE; | |
} | |
/** | |
* WARNING: This is provided purely for testing. Modifying the returned data at your own risk. | |
* | |
* @return the 2 dimensional byte array whose bits represents the indexes/hashes used in this {@link BitSet} | |
*/ | |
public byte[][] data() { | |
return data; | |
} | |
@Override | |
public String toString() { | |
return "BitSet{" + | |
"SIZE=" + SIZE + | |
", TOTAL_BITS_IN_A_BUCKET=" + TOTAL_BITS_IN_A_BUCKET + | |
", BUCKET_SIZE=" + BUCKET_SIZE + | |
", TOTAL_BUCKETS=" + TOTAL_BUCKETS + | |
'}'; | |
} | |
@Override | |
public boolean equals(final Object o) { | |
if (this == o) return true; | |
if (!(o instanceof BitSet)) return false; | |
final BitSet set = (BitSet) o; | |
if (BUCKET_SIZE != set.BUCKET_SIZE) return false; | |
if (SIZE != set.SIZE) return false; | |
if (TOTAL_BITS_IN_A_BUCKET != set.TOTAL_BITS_IN_A_BUCKET) return false; | |
if (TOTAL_BUCKETS != set.TOTAL_BUCKETS) return false; | |
return true; | |
} | |
@Override | |
public int hashCode() { | |
int result = TOTAL_BUCKETS; | |
result = 31 * result + BUCKET_SIZE; | |
result = 31 * result + TOTAL_BITS_IN_A_BUCKET; | |
result = 31 * result + SIZE; | |
return result; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment