Created
September 8, 2012 10:19
-
-
Save ahmetaa/3673231 to your computer and use it in GitHub Desktop.
mphf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** Modified (shortened) Jenkins one-at-a-time style 32-bit hash of [key], mixed with [seed]. */
int _hash(List<int> key, int seed) {
  int result = seed;
  for (int index = 0; index < key.length; index++) {
    result += key[index];
    result += (result << 10);
    result ^= (result >> 6);
  }
  // Keep the value non-negative by dropping the sign bit.
  return result & 0x7fffffff;
}

/** Initial hash value (fingerprint) of [key], computed with a fixed seed. */
int fingerPrint(List<int> key) => _hash(key, 0x9747b28c);
/**
 * A Minimal Perfect Hash function over keys representable as List<int>.
 * Uses a modified implementation of the CHD algorithm, optimized for fast
 * query of hash values. Uses roughly 3.5 bits of memory per key for large
 * key sets.
 */
class Mphf {
  List<_HashIndexes> hashLevelData;

  /** Generates the hash function from all keys supplied by [keyProvider]. */
  Mphf.generate(KeyProvider keyProvider) {
    hashLevelData = new _BucketCalculator(keyProvider, -1).calculate();
  }

  Mphf(this.hashLevelData);

  int get Size => hashLevelData[0].keyAmount;

  int get LevelCount => hashLevelData.length;

  /** Returns the minimal perfect hash value for [key], in [0, keyCount). */
  int get(List<int> key) => getWithFingerPrint(key, fingerPrint(key));

  /**
   * Returns the minimal perfect hash value for [key], in [0, keyCount).
   * Reuses an already computed [fingerPrint] instead of re-hashing the key,
   * a small performance enhancement for callers that have it at hand.
   */
  int getWithFingerPrint(List<int> key, int fingerPrint) {
    int level = 0;
    for (_HashIndexes data in hashLevelData) {
      // Level 0 bucket amount is a power of two, so a mask lookup suffices;
      // deeper levels need the modulo lookup.
      int seed = level == 0
          ? data.getSeedWithMask(fingerPrint)
          : data.getSeed(fingerPrint);
      if (seed != 0) {
        int slot = _hash(key, seed) % data.keyAmount;
        // Keys resolved at level 0 map directly to their slot; keys resolved
        // deeper map through the failed-index table of the previous level.
        return level == 0
            ? slot
            : hashLevelData[level - 1].failedIndexes[slot];
      }
      level++;
    }
    throw new ExpectException("Cannot be here.");
  }

  /** Approximate memory footprint: one byte per seed, four bytes per failed index. */
  num totalBytesUsed() {
    num total = 0;
    for (int i = 0; i < hashLevelData.length; i++) {
      total += hashLevelData[i].bucketHashSeedValues.length;
      total += hashLevelData[i].failedIndexes.length * 4;
    }
    return total;
  }

  double averageBitsPerKey() =>
      (totalBytesUsed() * 8).toDouble() / hashLevelData[0].keyAmount;
}
/** Per-level lookup tables: one hash seed per bucket plus the indexes of keys deferred to deeper levels. */
class _HashIndexes {
  int keyAmount;
  int bucketAmount;
  // Mask for a power-of-two bucket amount; a small optimization used only
  // for the first-level seed lookups.
  int bucketMask;
  List<int> bucketHashSeedValues;
  List<int> failedIndexes;

  _HashIndexes(this.keyAmount, this.bucketAmount, this.bucketHashSeedValues,
      this.failedIndexes) {
    bucketMask = bucketAmount - 1;
  }

  /** Seed lookup via bit mask; valid only when bucketAmount is a power of two. */
  int getSeedWithMask(int fingerPrint) =>
      bucketHashSeedValues[bucketMask & fingerPrint] & 0xff;

  /** Seed lookup via modulo; works for any bucketAmount. */
  int getSeed(int fingerPrint) =>
      bucketHashSeedValues[fingerPrint % bucketAmount] & 0xff;
}
/**
 * Builds the per-level hash tables of the MPHF: distributes keys into
 * buckets, then searches, per bucket, for a hash seed that maps every key
 * of the bucket onto distinct empty slots of a shared bit vector. Buckets
 * that find no such seed are recursed into the next level.
 */
class _BucketCalculator {
  KeyProvider keyProvider;
  int keyAmount;
  double averageKeysPerBucket;
  // A bucket is deferred to the next level after this many seed trials.
  static final int HASH_SEED_LIMIT = 255;
  int bucketBits;

  /** [bucketBits] == -1 means "choose the bucket amount automatically". */
  _BucketCalculator(this.keyProvider, this.bucketBits) {
    averageKeysPerBucket = bucketBits != -1
        ? keyProvider.keyAmount() / (1 << bucketBits)
        : 3.0;
  }

  List<_HashIndexes> calculate() {
    keyAmount = keyProvider.keyAmount();
    // Power-of-two bucket amount lets level 0 use bitwise AND instead of
    // modulo in its seed lookups.
    int bucketAmount;
    if (bucketBits == -1)
      bucketAmount = getInitialBucketAmount((keyAmount / averageKeysPerBucket).toInt(), keyAmount);
    else
      bucketAmount = 1 << bucketBits;
    List<_Bucket> buckets = generateInitialBuckets(bucketAmount);
    // Sort buckets larger to smaller: large buckets are hardest to place, so
    // handle them while the bit vector is still mostly empty. Descending
    // order is also what lets calculateIndexes stop at the first empty
    // bucket. (BUG FIX: the sort used to be ascending, which made the early
    // break fire immediately whenever any bucket was empty.)
    buckets.sort((_Bucket a, _Bucket b) => b.compareTo(a));
    List<_HashIndexes> result = new List();
    calculateIndexes(buckets, keyAmount, result);
    return result;
  }

  // We select the bucket amount as a power of two so modulo operations can
  // be done with &(bucketAmount-1) instead of %bucketAmount, avoiding
  // division. Selects the nearest power of two >= [k] that is below [limit].
  int getInitialBucketAmount(int k, int limit) {
    if (k <= 2)
      return 1;
    int i = 1;
    while (i < k) {
      i *= 2;
    }
    if (i >= limit)
      return i >> 1;
    else return i;
  }

  List<_Bucket> generateInitialBuckets(int bucketAmount) {
    // for using bitwise AND instead of Modulo in calculations.
    int bucketMask = bucketAmount - 1;
    // Visits every key and reports the bucket its fingerprint falls into.
    foreachBucketIndex(void f(int index, int bucketIndex)) {
      for (int i = 0; i < keyAmount; i++) {
        List<int> key = keyProvider.getKey(i);
        int bucketIndex = fingerPrint(key) & bucketMask;
        f(i, bucketIndex);
      }
    }
    // Generating buckets
    List<_Bucket> buckets = new List(bucketAmount);
    for (int i = 0; i < buckets.length; i++) {
      buckets[i] = new _Bucket(i);
    }
    foreachBucketIndex((int index, int bucketIndex) => buckets[bucketIndex].add(index));
    return buckets;
  }

  void calculateIndexes(List<_Bucket> buckets, int keyAmount, List<_HashIndexes> indexes) {
    // One bit per target slot; a set bit means the slot is already taken.
    _FixedBitVector bitVector = new _FixedBitVector.bitCount(keyAmount);
    List<int> hashSeedArray = new List()..insertRange(0, buckets.length, 1);
    // Buckets for which no seed below HASH_SEED_LIMIT places all keys into
    // empty slots. (BUG FIX: this list used to be pre-filled with
    // buckets.length ~/ 20 null entries via insertRange, so it was never
    // empty and iterating it hit nulls once there were 20 or more buckets.)
    List<_Bucket> failedBuckets = new List();
    // For each bucket, find a hash seed that maps each of its keys to an
    // empty slot in bitVector.
    int bucketIndex = 0;
    for (_Bucket bucket in buckets) {
      if (bucket.itemIndexes.length == 0) // buckets are sorted descending; the rest are empty too.
        break;
      int l = 1;
      bool loop = true;
      while (loop) {
        int j = 0;
        List<int> slots = new List(bucket.itemIndexes.length);
        for (int keyIndex in bucket.itemIndexes) {
          List<int> key = keyProvider.getKey(keyIndex);
          slots[j] = _hash(key, l) % keyAmount;
          if (bitVector.get(slots[j]))
            break;
          else {
            bitVector.set(slots[j]);
            j++;
          }
        }
        // If we fail to place all items of the bucket into empty slots,
        // roll back the slots we did claim and try the next seed.
        if (j < bucket.itemIndexes.length) {
          for (int k = 0; k < j; k++) {
            bitVector.clear(slots[k]);
          }
          // We reached HASH_SEED_LIMIT: record seed 0 so lookups know this
          // bucket's keys are resolved at a deeper level.
          if (l == HASH_SEED_LIMIT) {
            failedBuckets.add(bucket);
            hashSeedArray[buckets[bucketIndex].id] = 0;
            loop = false;
          }
        } else { // sweet: empty slots found for all keys of the bucket.
          hashSeedArray[buckets[bucketIndex].id] = l;
          loop = false;
        }
        l++;
      }
      bucketIndex++;
    }
    if (failedBuckets.length == 0) {
      // All keys placed; no further levels needed.
      indexes.add(new _HashIndexes(keyAmount, buckets.length, hashSeedArray, new List(0)));
      return;
    }
    // Lower the average keys-per-bucket after each level to avoid
    // generation failure.
    if (averageKeysPerBucket > 1)
      averageKeysPerBucket--;
    // Start calculation for the failed buckets.
    int failedKeyCount = 0;
    for (_Bucket failedBucket in failedBuckets) {
      failedKeyCount += failedBucket.itemIndexes.length;
    }
    int failedBucketAmount = (failedKeyCount / averageKeysPerBucket).toInt();
    // Worst case: no empty slot was found for any bucket and bucket
    // amount >= key amount already. Double the bucket amount hoping for a
    // better bucket-key distribution.
    if (failedKeyCount == keyAmount && averageKeysPerBucket <= 1) {
      averageKeysPerBucket = averageKeysPerBucket / 2;
      failedBucketAmount *= 2;
    }
    if (failedBucketAmount == 0)
      failedBucketAmount++;
    // Distribute the failed keys over the next level's buckets.
    List<_Bucket> nextLevelBuckets = new List(failedBucketAmount);
    for (int i = 0; i < failedBucketAmount; i++) {
      nextLevelBuckets[i] = new _Bucket(i);
    }
    for (_Bucket largeHashIndexBucket in failedBuckets) {
      for (int itemIndex in largeHashIndexBucket.itemIndexes) {
        List<int> key = keyProvider.getKey(itemIndex);
        int secondaryBucketIndex = fingerPrint(key) % failedBucketAmount;
        nextLevelBuckets[secondaryBucketIndex].add(itemIndex);
      }
    }
    // Sort buckets larger to smaller, as required by the early break above.
    // (BUG FIX: was ascending.)
    nextLevelBuckets.sort((_Bucket a, _Bucket b) => b.compareTo(a));
    List<int> failedHashValues;
    int currentLevel = indexes.length;
    if (currentLevel == 0) {
      // Level 0: the final hash values of the deferred keys are simply the
      // still-empty slot indexes of the primary bit vector.
      failedHashValues = bitVector.zeroIntIndexes();
    } else {
      // Deeper levels: map the empty slots through the previous level's
      // failed-index table.
      failedHashValues = new List(failedKeyCount);
      int k = 0;
      for (int i = 0; i < bitVector.size; i++) {
        if (!bitVector.get(i))
          failedHashValues[k++] = indexes[currentLevel - 1].failedIndexes[i];
      }
    }
    indexes.add(new _HashIndexes(keyAmount, buckets.length, hashSeedArray, failedHashValues));
    // Recurse for the keys that could not be placed at this level.
    calculateIndexes(nextLevelBuckets, failedKeyCount, indexes);
  }
}
/** A bucket of keys, stored as a small growable array of key indexes. */
class _Bucket implements Comparable {
  int id;
  List<int> itemIndexes = new List();

  _Bucket(this.id);

  void add(int i) {
    itemIndexes.add(i);
  }

  /** Orders buckets by how many key indexes they hold. */
  int compareTo(_Bucket o) =>
      itemIndexes.length.compareTo(o.itemIndexes.length);
}
/** A fixed-size bit vector backed by 32-bit words. */
class _FixedBitVector {
  List<int> _words;
  int size;
  // for fast modulo 32 calculation
  static final int mod32Mask = 0x1F;
  List<int> _setMasks = new List(32);
  List<int> _resetMasks = new List(32);
  // Masks keeping the low (i+1) bits; not used by this file's callers.
  List<int> _cutMasks = new List(32);

  _initialize(int initialCapacity) {
    size = initialCapacity;
    // One word per bit over-allocates (only the first (size + 31) >> 5
    // words are ever touched), but it is safe and matches prior behavior.
    _words = new List()..insertRange(0, initialCapacity, 0);
    for (int i = 0; i < 32; i++) {
      _setMasks[i] = 0x1 << i;
      _resetMasks[i] = ~_setMasks[i];
      _cutMasks[i] = ~(0xfffffffe << i);
    }
  }

  /** Creates a fixed bit vector with capacity of [bitCount]. */
  _FixedBitVector.bitCount(int bitCount) {
    _initialize(bitCount);
  }

  // BUG FIX: the word index must be n >> 5 for 32-bit words. The previous
  // n >> 6 (a leftover from a 64-bit-word implementation) made distinct
  // bits n and n ^ 32 share the same word slot AND the same mask, so
  // setting one bit falsely marked the other as occupied.
  bool get(int n) => (_words[n >> 5] & _setMasks[n & mod32Mask]) != 0;

  void set(int n) {
    _words[n >> 5] |= _setMasks[n & mod32Mask];
  }

  void clear(int n) {
    _words[n >> 5] &= _resetMasks[n & mod32Mask];
  }

  /** creates an array containing 0 bit indexes. */
  List<int> zeroIntIndexes() {
    var zeroIndexes = new List<int>();
    for (int i = 0; i < size; i++) {
      if (!get(i))
        zeroIndexes.add(i);
    }
    return zeroIndexes;
  }
}
/** Supplies keys (each a List<int>) to the MPHF generator by index. */
class KeyProvider {
  List<List<int>> list = new List();

  KeyProvider(this.list);

  /** Builds a provider from strings, using their character codes as keys. */
  KeyProvider.fromStrings(List<String> vals) {
    for (int i = 0; i < vals.length; i++) {
      list.add(vals[i].charCodes());
    }
  }

  List<int> getKey(int index) => list[index];

  int keyAmount() => list.length;
}
/** Demo: builds an MPHF over a few Turkish words and prints each key's hash value. */
void main() {
  var keys = ["elma","armut","turp","kiraz","karpuz", "kavun"];
  var hash = new Mphf.generate(new KeyProvider.fromStrings(keys));
  keys.forEach((key) {
    print("for key [$key] hash value is: ${hash.get(key.charCodes())}");
  });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment