Skip to content

Instantly share code, notes, and snippets.

@ahmetaa
Created September 8, 2012 10:19
Show Gist options
  • Save ahmetaa/3673231 to your computer and use it in GitHub Desktop.
Save ahmetaa/3673231 to your computer and use it in GitHub Desktop.
mphf
/** modified - shortened Jenkins 32 */
int _hash(List<int> key, int seed) {
int h1 = seed;
for (int i in key) {
h1 += i;
h1 += (h1 << 10);
h1 ^= (h1 >> 6);
}
return h1 & 0x7fffffff;
}
int fingerPrint(List<int> key) => _hash(key, 0x9747b28c);
/**
* A Minimal Perfect Hash function which accepts keys that can be represented as List<int>.
* uses modified implementation of CHD algorithm. It is optimized for fast query of hash values.
* it uses about 3.5 bits memory per key for large amount of keys.
*/
class Mphf {
List<_HashIndexes> hashLevelData;
Mphf.generate(KeyProvider keyProvider) {
_BucketCalculator bc = new _BucketCalculator(keyProvider, -1);
this.hashLevelData = bc.calculate();
}
Mphf(this.hashLevelData);
int get Size => hashLevelData[0].keyAmount;
int get LevelCount => hashLevelData.length;
/** returns the minimal perfect hash value for the given input [key].
* Returning number is between [0-keycount] keycount excluded. */
int get(List<int> key) => getWithFingerPrint(key, fingerPrint(key));
/**
* returns the minimal perfect hash value for the given input [key].
* hash values is between [0-keycount] keycount excluded.
* sometimes initial hash value for MPHF calculation is
* already calculated. So [fingerprint] value is used instead of re-calculation.
* This provides a small performance enhancement.
*/
int getWithFingerPrint(List<int> key, int fingerPrint) {
for (int i = 0; i < hashLevelData.length; i++) {
int seed = (i == 0)
? hashLevelData[i].getSeedWithMask(fingerPrint)
: hashLevelData[i].getSeed(fingerPrint);
if (seed != 0) {
if (i == 0) {
return _hash(key, seed) % hashLevelData[0].keyAmount;
} else {
return hashLevelData[i - 1].failedIndexes[_hash(key, seed) % hashLevelData[i].keyAmount];
}
}
}
throw new ExpectException("Cannot be here.");
}
num totalBytesUsed() {
num result = 0;
for (_HashIndexes data in hashLevelData) {
result += data.bucketHashSeedValues.length;
result += data.failedIndexes.length * 4;
}
return result;
}
double averageBitsPerKey() => (totalBytesUsed() * 8).toDouble() / hashLevelData[0].keyAmount;
}
class _HashIndexes {
int keyAmount;
int bucketAmount;
// this is used for a small optimization. Only used for first level hash seeds.
int bucketMask;
List<int> bucketHashSeedValues;
List<int> failedIndexes;
_HashIndexes(this.keyAmount, this.bucketAmount, this.bucketHashSeedValues, this.failedIndexes) {
bucketMask = this.bucketAmount - 1;
}
int getSeedWithMask(int fingerPrint) => (bucketHashSeedValues[fingerPrint & bucketMask]) & 0xff;
int getSeed(int fingerPrint) => (bucketHashSeedValues[fingerPrint % bucketAmount]) & 0xff;
}
class _BucketCalculator {
KeyProvider keyProvider;
int keyAmount;
double averageKeysPerBucket;
static final int HASH_SEED_LIMIT = 255;
int bucketBits;
_BucketCalculator(this.keyProvider, this.bucketBits){
averageKeysPerBucket = bucketBits != -1
? keyProvider.keyAmount() / (1 << bucketBits)
: averageKeysPerBucket = 3.0;
}
List<_HashIndexes> calculate() {
keyAmount = keyProvider.keyAmount();
// for using bitwise AND instead of Modulo in calculations.
int bucketAmount;
if (bucketBits == -1)
bucketAmount = getInitialBucketAmount((keyAmount / averageKeysPerBucket).toInt(), keyAmount);
else
bucketAmount = 1 << bucketBits;
List<_Bucket> buckets = generateInitialBuckets(bucketAmount);
// sort buckets larger to smaller.
buckets.sort((_Bucket a, _Bucket b) => a.compareTo(b));
List<_HashIndexes> result = new List();
calculateIndexes(buckets, keyAmount, result);
return result;
}
// we select the bucket keyAmount a power of two. This way we can make modulo operations with &(bucketAmount-1) insltead of %bucketAmount.
//avoiding division. we select nearest 2's power after k and before limit.
int getInitialBucketAmount(int k, int limit) {
if (k <= 2)
return 1;
int i = 1;
while (i < k) {
i *= 2;
}
if (i >= limit)
return i >> 1;
else return i;
}
List<_Bucket> generateInitialBuckets(int bucketAmount) {
// for using bitwise AND instead of Modulo in calculations.
int bucketMask = bucketAmount - 1;
// generate buckets with item indexes.
foreachBucketIndex(void f(int index, int bucketIndex)) {
for (int i = 0; i < keyAmount; i++) {
List<int> key = keyProvider.getKey(i);
int bucketIndex = fingerPrint(key) & bucketMask;
f(i, bucketIndex);
}
}
// Generating buckets
List<_Bucket> buckets = new List(bucketAmount);
for (int i = 0; i < buckets.length; i++) {
buckets[i] = new _Bucket(i);
}
foreachBucketIndex((int index, int bucketIndex) => buckets[bucketIndex].add(index));
return buckets;
}
void calculateIndexes(List<_Bucket> buckets, int keyAmount, List<_HashIndexes> indexes) {
// generate a long bit vector with size of hash target size.
_FixedBitVector bitVector = new _FixedBitVector.bitCount(keyAmount);
List<int> hashSeedArray = new List()..insertRange(0, buckets.length, 1);
// we need to collect failed buckets (A failed bucket such that we cannot find empty slots for all bucket keys
// after 255 trials. )
List<_Bucket> failedBuckets = new List()..insertRange(0,buckets.length ~/ 20);
// for each bucket, find a hash function that will map each key in it to an empty slot in bitVector.
int bucketIndex = 0;
for (_Bucket bucket in buckets) {
if (bucket.itemIndexes.length == 0) // because buckets are sorted, we can finish here.
break;
int l = 1;
bool loop = true;
while (loop) {
int j = 0;
List<int> slots = new List(bucket.itemIndexes.length);
for (int keyIndex in bucket.itemIndexes) {
List<int> key = keyProvider.getKey(keyIndex);
slots[j] = _hash(key, l) % keyAmount;
if (bitVector.get(slots[j]))
break;
else {
bitVector.set(slots[j]);
j++;
}
}
// if we fail to place all items in the bucket to the bitvector"s empty slots
if (j < bucket.itemIndexes.length) {
// we reset the occupied slots from bitvector.
for (int k = 0; k < j; k++) {
bitVector.clear(slots[k]);
}
// We reached the HASH_SEED_LIMIT.
// We place a 0 for its hash index value to know later that bucket is left to secondary lookup.
if (l == HASH_SEED_LIMIT) {
failedBuckets.add(bucket);
hashSeedArray[buckets[bucketIndex].id] = 0;
loop = false;
}
} else { // sweet. We have found empty slots in bit vector for all keys of the bucket.
hashSeedArray[buckets[bucketIndex].id] = l;
loop = false;
}
l++;
}
bucketIndex++;
}
if (failedBuckets.length == 0) {
// we are done.
indexes.add(new _HashIndexes(keyAmount, buckets.length, hashSeedArray, new List(0)));
return;
}
// we assign lower average per key per bucket after each iteration to avoid generation failure.
if (averageKeysPerBucket > 1)
averageKeysPerBucket--;
// start calculation for failed buckets.
int failedKeyCount = 0;
for (_Bucket failedBucket in failedBuckets) {
failedKeyCount += failedBucket.itemIndexes.length;
}
int failedBucketAmount = (failedKeyCount / averageKeysPerBucket).toInt();
// this is a worst case scenario. No empty slot find for any buckets and we are already using buckets where bucket Amount>=keyAmount
// In this case we double the bucket size with the hope that it will have better bucket-key distribution.
if (failedKeyCount == keyAmount && averageKeysPerBucket <= 1) {
averageKeysPerBucket = averageKeysPerBucket / 2;
failedBucketAmount *= 2;
}
if (failedBucketAmount == 0)
failedBucketAmount++;
// this time we generate item keyAmount of Buckets
List<_Bucket> nextLevelBuckets = new List(failedBucketAmount);
for (int i = 0; i < failedBucketAmount; i++) {
nextLevelBuckets[i] = new _Bucket(i);
}
// generate secondary buckets with item indexes.
for (_Bucket largeHashIndexBucket in failedBuckets) {
for (int itemIndex in largeHashIndexBucket.itemIndexes) {
List<int> key = keyProvider.getKey(itemIndex);
int secondaryBucketIndex = fingerPrint(key) % failedBucketAmount;
nextLevelBuckets[secondaryBucketIndex].add(itemIndex);
}
}
// sort buckets larger to smaller.
nextLevelBuckets.sort((_Bucket a, _Bucket b) => a.compareTo(b));
List<int> failedHashValues;
int currentLevel = indexes.length;
if (currentLevel == 0) {
// if we are in the first level generate a bit vector with the size of zero indexes of the primary bit vector.
failedHashValues = bitVector.zeroIntIndexes();
} else {
failedHashValues = new List(failedKeyCount);
int k = 0;
for (int i = 0; i < bitVector.size; i++) {
if (!bitVector.get(i))
failedHashValues[k++] = indexes[currentLevel - 1].failedIndexes[i];
}
}
indexes.add(new _HashIndexes(keyAmount, buckets.length, hashSeedArray, failedHashValues));
// recurse for failed buckets.
calculateIndexes(nextLevelBuckets, failedKeyCount, indexes);
}
}
/** A bucket that holds keys. It contains a small array for keys. */
class _Bucket implements Comparable {
int id;
List<int> itemIndexes = new List();
_Bucket(this.id);
void add(int i) { itemIndexes.add(i); }
int compareTo(_Bucket o) => itemIndexes.length.compareTo(o.itemIndexes.length);
}
class _FixedBitVector {
List<int> _words;
int size;
// for fast modulo 32 calculation
static final int mod32Mask = 0x1F;
List<int> _setMasks = new List(32);
List<int> _resetMasks = new List(32);
List<int> _cutMasks = new List(32);
_initialize(int initialCapacity) {
size = initialCapacity;
_words = new List()..insertRange(0, initialCapacity, 0);
for (int i = 0; i < 32; i++) {
_setMasks[i] = 0x1 << i;
_resetMasks[i] = ~_setMasks[i];
_cutMasks[i] = ~(0xfffffffe << i);
}
}
/** Creates a fixed bit vector with capacity of [bitCount]. */
_FixedBitVector.bitCount(int bitCount) {
_initialize(bitCount);
}
bool get (int n) => (_words[n >> 6] & _setMasks[n & mod32Mask]) != 0;
void set (int n) {
_words[n >> 6] |= _setMasks[n & mod32Mask];
}
void clear(int n) {
_words[n >> 6] &= _resetMasks[n & mod32Mask];
}
/** creates an array containing 0 bit indexes. */
List<int> zeroIntIndexes() {
var zeroIndexes = new List<int>();
int j = 0;
for (int i = 0; i < size; i++) {
if (!get(i))
zeroIndexes.add(i);
}
return zeroIndexes;
}
}
class KeyProvider {
List<List<int>> list = new List();
KeyProvider(this.list);
KeyProvider.fromStrings(List<String> vals) {
for(String s in vals) {
list.add(s.charCodes());
}
}
List<int> getKey(int index) => list[index];
int keyAmount() => list.length;
}
void main() {
var keys = ["elma","armut","turp","kiraz","karpuz", "kavun"];
var hash = new Mphf.generate( new KeyProvider.fromStrings(keys));
for(var key in keys) {
print("for key [$key] hash value is: ${hash.get(key.charCodes())}");
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment