Skip to content

Instantly share code, notes, and snippets.

@develar
Created November 30, 2021 16:53
Show Gist options
  • Save develar/974a587f8180e183ce25cf308ffae39a to your computer and use it in GitHub Desktop.
Save develar/974a587f8180e183ce25cf308ffae39a to your computer and use it in GitHub Desktop.
// Copyright 2000-2021 Jim Apple. Use of this source code is governed by the Apache 2.0 license that can be found in https://github.com/jbapple/libfilter/blob/master/LICENSE.txt.
package com.intellij.util.lang;
import java.nio.ByteBuffer;
/**
* BlockFilter is a block Bloom filter (from Putze et al.'s "Cache-, Hash- and
* Space-Efficient Bloom Filters") with some twists:
* <ol>
* <li> Each block is a split Bloom filter - see Section 2.1 of Broder and Mitzenmacher's
* "Network Applications of Bloom Filters: A Survey".
* <li> The number of bits set during each key addition is constant in order to try and
* take advantage of SIMD instructions.
* </ol>
*
* https://github.com/jbapple/libfilter
*/
@SuppressWarnings("CommentedOutCode")
public final class BlockFilter {
private final int[] payload;
private final int bucketCount;
public BlockFilter(int[] payload) {
this.payload = payload;
this.bucketCount = payload.length / 8;
}
public BlockFilter(int numberOfDistinctValues, double falsePositiveProbability) {
payload = new int[Math.max(8, (getBytesNeeded(numberOfDistinctValues, falsePositiveProbability) / 4) / 8 * 8)];
bucketCount = payload.length / 8;
}
public BlockFilter(ByteBuffer buffer) {
int size = buffer.getInt();
payload = new int[size];
buffer.asIntBuffer().get(payload);
buffer.position(buffer.position() + (payload.length * Integer.BYTES));
bucketCount = payload.length / 8;
}
public int sizeInBytes() {
return Integer.BYTES + (payload.length * Integer.BYTES);
}
public void write(ByteBuffer buffer) {
buffer.putInt(payload.length);
buffer.asIntBuffer().put(payload);
buffer.position(buffer.position() + (payload.length * Integer.BYTES));
}
/**
* Calculates the expected false positive probability from the size (in bytes) and fill
* factor (in number of distinct values, <code>ndv</code>).
*
* @param ndv the number of distinct values
* @param bytes the size of the filter
* @return the false positive probability
*/
private static double computeFalsePositiveProbability(double ndv, double bytes) {
double word_bits = 32;
double bucket_words = 8;
double hash_bits = 32;
// From equation 3 of Putze et al.
//
// m = bytes * CHAR_BIT
// n = ndv
// c = m / n
// b = m / (bucket_words * word_bits)
// B = n * c / b = n * (m / n) / (m / (bucket_words * word_bits)) = m / (m /
// (bucket_words * word_bits)) = bucket_words * word_bits k = number of hashes, in our
// case: bucket_words
if (ndv == 0) return 0.0;
if (bytes <= 0) return 1.0;
//noinspection PointlessArithmeticExpression
if (1.0 * ndv / (1.0 * bytes * 8) > 3) {
return 1.0;
}
double result = 0;
double lam = bucket_words * word_bits / ((bytes * 8) / ndv);
double logLam = Math.log(lam);
double log1collide = -hash_bits * Math.log(2.0);
int MAX_J = 10000;
for (int j = 0; j < MAX_J; ++j) {
int i = MAX_J - 1 - j;
double logP = i * logLam - lam - logGamma(i + 1);
double logFinner = bucket_words * Math.log(1.0 - Math.pow(1.0 - 1.0 / word_bits, i));
double logCollide = Math.log(i) + log1collide;
result += Math.exp(logP + logFinner) + Math.exp(logP + logCollide);
}
//noinspection ManualMinMaxCalculation
return (result > 1.0) ? 1.0 : result;
}
/**
* Calculates the number of bytes needed to create a filter with the given number of
* distinct values and false positive probability.
*
* @param ndv the number of distinct values
* @param fpp the desired false positive probability
* @return the number of bytes needed
*/
public static int getBytesNeeded(int ndv, double fpp) {
double word_bits = 32;
double bucket_words = 8;
int bucket_bytes = (int)((word_bits * bucket_words) / 8);
double result = 1;
while (computeFalsePositiveProbability(ndv, result) > fpp) {
result *= 2;
}
if (result <= bucket_bytes) return bucket_bytes;
double lo = 0;
while (lo + 1 < result) {
double mid = lo + (result - lo) / 2;
double test = computeFalsePositiveProbability(ndv, mid);
if (test < fpp) {
result = mid;
}
else if (test == fpp) {
result = ((mid + bucket_bytes - 1) / bucket_bytes) * bucket_bytes;
if (result > Integer.MAX_VALUE) return Integer.MAX_VALUE;
return (int)result;
}
else {
lo = mid;
}
}
result = ((result + bucket_bytes - 1) / bucket_bytes) * bucket_bytes;
if (result > Integer.MAX_VALUE) return Integer.MAX_VALUE;
return (int)result;
}
/**
* Calculates the number of distinct elements that can be stored in a filter of size
* <code>bytes</code> without exceeding the given false positive probability.
*
* @param bytes the size of the filter
* @param fpp the desired false positive probability
* @return the maximum number of distinct values that can be added
*/
public static double getTotalHashCapacity(double bytes, double fpp) {
double result = 1;
while (computeFalsePositiveProbability(result, bytes) < fpp) {
result *= 2;
}
if (result == 1) return 0;
double lo = 0;
while (lo + 1 < result) {
double mid = lo + (result - lo) / 2;
double test = computeFalsePositiveProbability(mid, bytes);
if (test < fpp) {
lo = mid;
}
else if (test == fpp) {
return mid;
}
else {
result = mid;
}
}
return lo;
}
private static final int[] INTERNAL_HASH_SEEDS = {0x47b6137b, 0x44974d91, 0x8824ad5b,
0xa2b7289d, 0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31};
private static int index(long hash, int num_buckets) {
return (int)(((hash >>> 32) * ((long)num_buckets)) >>> 32);
}
private static void makeMask(long hash, int[] result) {
// the loops are done individually for easier auto-vectorization by the JIT
for (int i = 0; i < 8; ++i) {
result[0] = (int)hash * INTERNAL_HASH_SEEDS[i];
}
for (int i = 0; i < 8; ++i) {
result[i] >>>= (32 - 5);
}
for (int i = 0; i < 8; ++i) {
result[i] = 1 << result[i];
}
}
/**
* Add a hash value to the filter.
* <p>
* Do not mix with <code>FindHash32</code> - a hash value that is present with
* <code>FindHash64(x)</code> will not be present when calling
* <code>FindHash32((int)x)</code>.
* <p>
* Do not pass values to this function, only their hashes.
* <p>
* <em>Hashes must be 64-bits</em>. 32-bit hashes will result in a useless filter where
* <code>FindHash64</code> always returns <code>true</code> for any input.
*
* @param hash the 64-bit hash value of the element you wish to insert
*/
public void addHash64(long hash) {
int bucketIndex = index(hash, bucketCount);
int[] mask = {0xca11, 8, 6, 7, 5, 3, 0, 9};
makeMask(hash, mask);
for (int i = 0; i < 8; ++i) {
payload[bucketIndex * 8 + i] = mask[i] | payload[bucketIndex * 8 + i];
}
}
/**
* Find a hash value in the filter.
* <p>
* Do not mix with <code>AddHash32</code> - a hash value that is added with
* <code>AddHash32</code> will not be present when calling <code>FindHash64</code>.
* <p>
* Do not pass values to this function, only their hashes.
* <p>
* <em>Hashes must be 64-bits</em>. 32-bit hashes will return incorrect results.
*
* @param hash the 64-bit hash value of the element you are checking the presence of
*/
public boolean mightContain(long hash) {
int bucketIndex = index(hash, bucketCount);
int[] mask = {0xca11, 8, 6, 7, 5, 3, 0, 9};
makeMask(hash, mask);
for (int i = 0; i < 8; ++i) {
if (0 == (payload[bucketIndex * 8 + i] & mask[i])) {
return false;
}
}
return true;
}
//private static final long REHASH_32 = (((long)0xd1012a3a) << 32) | (long)0x7a1f4a8a;
///**
// * Add a hash value to the filter.
// * <p>
// * Do not mix with <code>FindHash64</code> - a hash value that is present with
// * <code>FindHash64(x)</code> will not be present when calling
// * <code>FindHash32((int)x)</code>.
// *
// * @param hash the 32-bit hash value of the element you wish to insert
// */
//public boolean AddHash32(int hash) {
// long hash64 = (((REHASH_32 * (long)hash) >>> 32) << 32) | hash;
// int bucket_idx = index(hash64, payload.length / 8);
// int[] mask = {0xca11, 8, 6, 7, 5, 3, 0, 9};
// makeMask(hash, mask);
// for (int i = 0; i < 8; ++i) {
// payload[bucket_idx * 8 + i] = mask[i] | payload[bucket_idx * 8 + i];
// }
// return true;
//}
///**
// * Find a hash value in the filter.
// * <p>
// * Do not mix with <code>AddHash64</code> - a hash value that is added with
// * <code>AddHash64</code> will not be present when calling <code>FindHash32</code>.
// *
// * @param hash the 32-bit hash value of the element you are checking the presence of
// */
//public boolean FindHash32(int hash) {
// long hash64 = (((REHASH_32 * (long)hash) >>> 32) << 32) | hash;
// int bucket_idx = index(hash64, payload.length / 8);
// int[] mask = {0xca11, 8, 6, 7, 5, 3, 0, 9};
// makeMask(hash, mask);
// for (int i = 0; i < 8; ++i) {
// if (0 == (payload[bucket_idx * 8 + i] & mask[i])) {
// return false;
// }
// }
// return true;
//}
private static double logGamma(double x) {
double ret;
if (Double.isNaN(x) || (x <= 0.0)) {
ret = Double.NaN;
}
else if (x < 0.5) {
return logGamma1p(x) - Math.log(x);
}
else if (x <= 2.5) {
return logGamma1p((x - 0.5) - 0.5);
}
else if (x <= 8.0) {
final int n = (int)Math.floor(x - 1.5);
double prod = 1.0;
for (int i = 1; i <= n; i++) {
prod *= x - i;
}
return logGamma1p(x - (n + 1)) + Math.log(prod);
}
else {
double sum = lanczos(x);
double tmp = x + LANCZOS_G + .5;
ret = ((x + .5) * Math.log(tmp)) - tmp +
HALF_LOG_2_PI + Math.log(sum / x);
}
return ret;
}
@SuppressWarnings("DuplicatedCode")
private static double invGamma1pm1(final double x) {
if (x < -0.5) {
throw new RuntimeException("NumberIsTooSmallException");
}
if (x > 1.5) {
throw new RuntimeException("NumberIsTooLargeException");
}
final double ret;
final double t = x <= 0.5 ? x : (x - 0.5) - 0.5;
if (t < 0.0) {
final double a = INV_GAMMA1P_M1_A0 + t * INV_GAMMA1P_M1_A1;
double b = INV_GAMMA1P_M1_B8;
b = INV_GAMMA1P_M1_B7 + t * b;
b = INV_GAMMA1P_M1_B6 + t * b;
b = INV_GAMMA1P_M1_B5 + t * b;
b = INV_GAMMA1P_M1_B4 + t * b;
b = INV_GAMMA1P_M1_B3 + t * b;
b = INV_GAMMA1P_M1_B2 + t * b;
b = INV_GAMMA1P_M1_B1 + t * b;
b = 1.0 + t * b;
double c = INV_GAMMA1P_M1_C13 + t * (a / b);
c = INV_GAMMA1P_M1_C12 + t * c;
c = INV_GAMMA1P_M1_C11 + t * c;
c = INV_GAMMA1P_M1_C10 + t * c;
c = INV_GAMMA1P_M1_C9 + t * c;
c = INV_GAMMA1P_M1_C8 + t * c;
c = INV_GAMMA1P_M1_C7 + t * c;
c = INV_GAMMA1P_M1_C6 + t * c;
c = INV_GAMMA1P_M1_C5 + t * c;
c = INV_GAMMA1P_M1_C4 + t * c;
c = INV_GAMMA1P_M1_C3 + t * c;
c = INV_GAMMA1P_M1_C2 + t * c;
c = INV_GAMMA1P_M1_C1 + t * c;
c = INV_GAMMA1P_M1_C + t * c;
if (x > 0.5) {
ret = t * c / x;
}
else {
ret = x * ((c + 0.5) + 0.5);
}
}
else {
double p = INV_GAMMA1P_M1_P6;
p = INV_GAMMA1P_M1_P5 + t * p;
p = INV_GAMMA1P_M1_P4 + t * p;
p = INV_GAMMA1P_M1_P3 + t * p;
p = INV_GAMMA1P_M1_P2 + t * p;
p = INV_GAMMA1P_M1_P1 + t * p;
p = INV_GAMMA1P_M1_P0 + t * p;
double q = INV_GAMMA1P_M1_Q4;
q = INV_GAMMA1P_M1_Q3 + t * q;
q = INV_GAMMA1P_M1_Q2 + t * q;
q = INV_GAMMA1P_M1_Q1 + t * q;
q = 1.0 + t * q;
double c = INV_GAMMA1P_M1_C13 + (p / q) * t;
c = INV_GAMMA1P_M1_C12 + t * c;
c = INV_GAMMA1P_M1_C11 + t * c;
c = INV_GAMMA1P_M1_C10 + t * c;
c = INV_GAMMA1P_M1_C9 + t * c;
c = INV_GAMMA1P_M1_C8 + t * c;
c = INV_GAMMA1P_M1_C7 + t * c;
c = INV_GAMMA1P_M1_C6 + t * c;
c = INV_GAMMA1P_M1_C5 + t * c;
c = INV_GAMMA1P_M1_C4 + t * c;
c = INV_GAMMA1P_M1_C3 + t * c;
c = INV_GAMMA1P_M1_C2 + t * c;
c = INV_GAMMA1P_M1_C1 + t * c;
c = INV_GAMMA1P_M1_C0 + t * c;
if (x > 0.5) {
ret = (t / x) * ((c - 0.5) - 0.5);
}
else {
ret = x * c;
}
}
return ret;
}
private static double lanczos(final double x) {
double sum = 0.0;
for (int i = LANCZOS.length - 1; i > 0; --i) {
sum += (LANCZOS[i] / (x + i));
}
return sum + LANCZOS[0];
}
private static double logGamma1p(final double x) {
if (x < -0.5) {
throw new RuntimeException("NumberIsTooSmallException");
}
if (x > 1.5) {
throw new RuntimeException("NumberIsTooLargeException");
}
return -Math.log1p(invGamma1pm1(x));
}
private static final double HALF_LOG_2_PI = 0.5 * Math.log(2.0 * Math.PI);
private static final double LANCZOS_G = 607.0 / 128.0;
private static final double[] LANCZOS = {
0.99999999999999709182,
57.156235665862923517,
-59.597960355475491248,
14.136097974741747174,
-0.49191381609762019978,
.33994649984811888699e-4,
.46523628927048575665e-4,
-.98374475304879564677e-4,
.15808870322491248884e-3,
-.21026444172410488319e-3,
.21743961811521264320e-3,
-.16431810653676389022e-3,
.84418223983852743293e-4,
-.26190838401581408670e-4,
.36899182659531622704e-5,
};
private static final double INV_GAMMA1P_M1_A0 = .611609510448141581788E-08;
private static final double INV_GAMMA1P_M1_A1 = .624730830116465516210E-08;
private static final double INV_GAMMA1P_M1_B1 = .203610414066806987300E+00;
private static final double INV_GAMMA1P_M1_B2 = .266205348428949217746E-01;
private static final double INV_GAMMA1P_M1_B3 = .493944979382446875238E-03;
private static final double INV_GAMMA1P_M1_B4 = -.851419432440314906588E-05;
private static final double INV_GAMMA1P_M1_B5 = -.643045481779353022248E-05;
private static final double INV_GAMMA1P_M1_B6 = .992641840672773722196E-06;
private static final double INV_GAMMA1P_M1_B7 = -.607761895722825260739E-07;
private static final double INV_GAMMA1P_M1_B8 = .195755836614639731882E-09;
private static final double INV_GAMMA1P_M1_P0 = .6116095104481415817861E-08;
private static final double INV_GAMMA1P_M1_P1 = .6871674113067198736152E-08;
private static final double INV_GAMMA1P_M1_P2 = .6820161668496170657918E-09;
private static final double INV_GAMMA1P_M1_P3 = .4686843322948848031080E-10;
private static final double INV_GAMMA1P_M1_P4 = .1572833027710446286995E-11;
private static final double INV_GAMMA1P_M1_P5 = -.1249441572276366213222E-12;
private static final double INV_GAMMA1P_M1_P6 = .4343529937408594255178E-14;
private static final double INV_GAMMA1P_M1_Q1 = .3056961078365221025009E+00;
private static final double INV_GAMMA1P_M1_Q2 = .5464213086042296536016E-01;
private static final double INV_GAMMA1P_M1_Q3 = .4956830093825887312020E-02;
private static final double INV_GAMMA1P_M1_Q4 = .2692369466186361192876E-03;
private static final double INV_GAMMA1P_M1_C = -.422784335098467139393487909917598E+00;
private static final double INV_GAMMA1P_M1_C0 = .577215664901532860606512090082402E+00;
private static final double INV_GAMMA1P_M1_C1 = -.655878071520253881077019515145390E+00;
private static final double INV_GAMMA1P_M1_C2 = -.420026350340952355290039348754298E-01;
private static final double INV_GAMMA1P_M1_C3 = .166538611382291489501700795102105E+00;
private static final double INV_GAMMA1P_M1_C4 = -.421977345555443367482083012891874E-01;
private static final double INV_GAMMA1P_M1_C5 = -.962197152787697356211492167234820E-02;
private static final double INV_GAMMA1P_M1_C6 = .721894324666309954239501034044657E-02;
private static final double INV_GAMMA1P_M1_C7 = -.116516759185906511211397108401839E-02;
private static final double INV_GAMMA1P_M1_C8 = -.215241674114950972815729963053648E-03;
private static final double INV_GAMMA1P_M1_C9 = .128050282388116186153198626328164E-03;
private static final double INV_GAMMA1P_M1_C10 = -.201348547807882386556893914210218E-04;
private static final double INV_GAMMA1P_M1_C11 = -.125049348214267065734535947383309E-05;
private static final double INV_GAMMA1P_M1_C12 = .113302723198169588237412962033074E-05;
private static final double INV_GAMMA1P_M1_C13 = -.205633841697760710345015413002057E-06;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment