Created
September 8, 2012 10:19
-
-
Save ahmetaa/3673231 to your computer and use it in GitHub Desktop.
mphf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** Modified (shortened) Jenkins one-at-a-time style 32-bit hash of [key], mixed with [seed]. */
int _hash(List<int> key, int seed) {
  int result = seed;
  for (int index = 0; index < key.length; index++) {
    result += key[index];
    result += (result << 10);
    result ^= (result >> 6);
  }
  // Keep the value non-negative by dropping the sign bit.
  return result & 0x7fffffff;
}

/** Initial hash value (fingerprint) of [key], computed with a fixed seed. */
int fingerPrint(List<int> key) => _hash(key, 0x9747b28c);
/**
 * A Minimal Perfect Hash function over keys representable as List<int>.
 * Uses a modified implementation of the CHD algorithm, optimized for fast
 * query of hash values. Uses roughly 3.5 bits of memory per key for large
 * key sets.
 */
class Mphf {
  List<_HashIndexes> hashLevelData;

  /** Generates the hash function from all keys supplied by [keyProvider]. */
  Mphf.generate(KeyProvider keyProvider) {
    hashLevelData = new _BucketCalculator(keyProvider, -1).calculate();
  }

  Mphf(this.hashLevelData);

  int get Size => hashLevelData[0].keyAmount;

  int get LevelCount => hashLevelData.length;

  /** Returns the minimal perfect hash value for [key], in [0, keyCount). */
  int get(List<int> key) => getWithFingerPrint(key, fingerPrint(key));

  /**
   * Returns the minimal perfect hash value for [key], in [0, keyCount).
   * Reuses an already computed [fingerPrint] instead of re-hashing the key,
   * a small performance enhancement for callers that have it at hand.
   */
  int getWithFingerPrint(List<int> key, int fingerPrint) {
    int level = 0;
    for (_HashIndexes data in hashLevelData) {
      // Level 0 bucket amount is a power of two, so a mask lookup suffices;
      // deeper levels need the modulo lookup.
      int seed = level == 0
          ? data.getSeedWithMask(fingerPrint)
          : data.getSeed(fingerPrint);
      if (seed != 0) {
        int slot = _hash(key, seed) % data.keyAmount;
        // Keys resolved at level 0 map directly to their slot; keys resolved
        // deeper map through the failed-index table of the previous level.
        return level == 0
            ? slot
            : hashLevelData[level - 1].failedIndexes[slot];
      }
      level++;
    }
    throw new ExpectException("Cannot be here.");
  }

  /** Approximate memory footprint: one byte per seed, four bytes per failed index. */
  num totalBytesUsed() {
    num total = 0;
    for (int i = 0; i < hashLevelData.length; i++) {
      total += hashLevelData[i].bucketHashSeedValues.length;
      total += hashLevelData[i].failedIndexes.length * 4;
    }
    return total;
  }

  double averageBitsPerKey() =>
      (totalBytesUsed() * 8).toDouble() / hashLevelData[0].keyAmount;
}
/** Per-level lookup tables: one hash seed per bucket plus the indexes of keys deferred to deeper levels. */
class _HashIndexes {
  int keyAmount;
  int bucketAmount;
  // Mask for a power-of-two bucket amount; a small optimization used only
  // for the first-level seed lookups.
  int bucketMask;
  List<int> bucketHashSeedValues;
  List<int> failedIndexes;

  _HashIndexes(this.keyAmount, this.bucketAmount, this.bucketHashSeedValues,
      this.failedIndexes) {
    bucketMask = bucketAmount - 1;
  }

  /** Seed lookup via bit mask; valid only when bucketAmount is a power of two. */
  int getSeedWithMask(int fingerPrint) =>
      bucketHashSeedValues[bucketMask & fingerPrint] & 0xff;

  /** Seed lookup via modulo; works for any bucketAmount. */
  int getSeed(int fingerPrint) =>
      bucketHashSeedValues[fingerPrint % bucketAmount] & 0xff;
}
/**
 * Builds the per-level hash tables of the MPHF: distributes keys into
 * buckets, then searches, per bucket, for a hash seed that maps every key
 * of the bucket onto distinct empty slots of a shared bit vector. Buckets
 * that find no such seed are recursed into the next level.
 */
class _BucketCalculator {
  KeyProvider keyProvider;
  int keyAmount;
  double averageKeysPerBucket;
  // A bucket is deferred to the next level after this many seed trials.
  static final int HASH_SEED_LIMIT = 255;
  int bucketBits;

  /** [bucketBits] == -1 means "choose the bucket amount automatically". */
  _BucketCalculator(this.keyProvider, this.bucketBits) {
    averageKeysPerBucket = bucketBits != -1
        ? keyProvider.keyAmount() / (1 << bucketBits)
        : 3.0;
  }

  List<_HashIndexes> calculate() {
    keyAmount = keyProvider.keyAmount();
    // Power-of-two bucket amount lets level 0 use bitwise AND instead of
    // modulo in its seed lookups.
    int bucketAmount;
    if (bucketBits == -1)
      bucketAmount = getInitialBucketAmount((keyAmount / averageKeysPerBucket).toInt(), keyAmount);
    else
      bucketAmount = 1 << bucketBits;
    List<_Bucket> buckets = generateInitialBuckets(bucketAmount);
    // Sort buckets larger to smaller: large buckets are hardest to place, so
    // handle them while the bit vector is still mostly empty. Descending
    // order is also what lets calculateIndexes stop at the first empty
    // bucket. (BUG FIX: the sort used to be ascending, which made the early
    // break fire immediately whenever any bucket was empty.)
    buckets.sort((_Bucket a, _Bucket b) => b.compareTo(a));
    List<_HashIndexes> result = new List();
    calculateIndexes(buckets, keyAmount, result);
    return result;
  }

  // We select the bucket amount as a power of two so modulo operations can
  // be done with &(bucketAmount-1) instead of %bucketAmount, avoiding
  // division. Selects the nearest power of two >= [k] that is below [limit].
  int getInitialBucketAmount(int k, int limit) {
    if (k <= 2)
      return 1;
    int i = 1;
    while (i < k) {
      i *= 2;
    }
    if (i >= limit)
      return i >> 1;
    else return i;
  }

  List<_Bucket> generateInitialBuckets(int bucketAmount) {
    // for using bitwise AND instead of Modulo in calculations.
    int bucketMask = bucketAmount - 1;
    // Visits every key and reports the bucket its fingerprint falls into.
    foreachBucketIndex(void f(int index, int bucketIndex)) {
      for (int i = 0; i < keyAmount; i++) {
        List<int> key = keyProvider.getKey(i);
        int bucketIndex = fingerPrint(key) & bucketMask;
        f(i, bucketIndex);
      }
    }
    // Generating buckets
    List<_Bucket> buckets = new List(bucketAmount);
    for (int i = 0; i < buckets.length; i++) {
      buckets[i] = new _Bucket(i);
    }
    foreachBucketIndex((int index, int bucketIndex) => buckets[bucketIndex].add(index));
    return buckets;
  }

  void calculateIndexes(List<_Bucket> buckets, int keyAmount, List<_HashIndexes> indexes) {
    // One bit per target slot; a set bit means the slot is already taken.
    _FixedBitVector bitVector = new _FixedBitVector.bitCount(keyAmount);
    List<int> hashSeedArray = new List()..insertRange(0, buckets.length, 1);
    // Buckets for which no seed below HASH_SEED_LIMIT places all keys into
    // empty slots. (BUG FIX: this list used to be pre-filled with
    // buckets.length ~/ 20 null entries via insertRange, so it was never
    // empty and iterating it hit nulls once there were 20 or more buckets.)
    List<_Bucket> failedBuckets = new List();
    // For each bucket, find a hash seed that maps each of its keys to an
    // empty slot in bitVector.
    int bucketIndex = 0;
    for (_Bucket bucket in buckets) {
      if (bucket.itemIndexes.length == 0) // buckets are sorted descending; the rest are empty too.
        break;
      int l = 1;
      bool loop = true;
      while (loop) {
        int j = 0;
        List<int> slots = new List(bucket.itemIndexes.length);
        for (int keyIndex in bucket.itemIndexes) {
          List<int> key = keyProvider.getKey(keyIndex);
          slots[j] = _hash(key, l) % keyAmount;
          if (bitVector.get(slots[j]))
            break;
          else {
            bitVector.set(slots[j]);
            j++;
          }
        }
        // If we fail to place all items of the bucket into empty slots,
        // roll back the slots we did claim and try the next seed.
        if (j < bucket.itemIndexes.length) {
          for (int k = 0; k < j; k++) {
            bitVector.clear(slots[k]);
          }
          // We reached HASH_SEED_LIMIT: record seed 0 so lookups know this
          // bucket's keys are resolved at a deeper level.
          if (l == HASH_SEED_LIMIT) {
            failedBuckets.add(bucket);
            hashSeedArray[buckets[bucketIndex].id] = 0;
            loop = false;
          }
        } else { // sweet: empty slots found for all keys of the bucket.
          hashSeedArray[buckets[bucketIndex].id] = l;
          loop = false;
        }
        l++;
      }
      bucketIndex++;
    }
    if (failedBuckets.length == 0) {
      // All keys placed; no further levels needed.
      indexes.add(new _HashIndexes(keyAmount, buckets.length, hashSeedArray, new List(0)));
      return;
    }
    // Lower the average keys-per-bucket after each level to avoid
    // generation failure.
    if (averageKeysPerBucket > 1)
      averageKeysPerBucket--;
    // Start calculation for the failed buckets.
    int failedKeyCount = 0;
    for (_Bucket failedBucket in failedBuckets) {
      failedKeyCount += failedBucket.itemIndexes.length;
    }
    int failedBucketAmount = (failedKeyCount / averageKeysPerBucket).toInt();
    // Worst case: no empty slot was found for any bucket and bucket
    // amount >= key amount already. Double the bucket amount hoping for a
    // better bucket-key distribution.
    if (failedKeyCount == keyAmount && averageKeysPerBucket <= 1) {
      averageKeysPerBucket = averageKeysPerBucket / 2;
      failedBucketAmount *= 2;
    }
    if (failedBucketAmount == 0)
      failedBucketAmount++;
    // Distribute the failed keys over the next level's buckets.
    List<_Bucket> nextLevelBuckets = new List(failedBucketAmount);
    for (int i = 0; i < failedBucketAmount; i++) {
      nextLevelBuckets[i] = new _Bucket(i);
    }
    for (_Bucket largeHashIndexBucket in failedBuckets) {
      for (int itemIndex in largeHashIndexBucket.itemIndexes) {
        List<int> key = keyProvider.getKey(itemIndex);
        int secondaryBucketIndex = fingerPrint(key) % failedBucketAmount;
        nextLevelBuckets[secondaryBucketIndex].add(itemIndex);
      }
    }
    // Sort buckets larger to smaller, as required by the early break above.
    // (BUG FIX: was ascending.)
    nextLevelBuckets.sort((_Bucket a, _Bucket b) => b.compareTo(a));
    List<int> failedHashValues;
    int currentLevel = indexes.length;
    if (currentLevel == 0) {
      // Level 0: the final hash values of the deferred keys are simply the
      // still-empty slot indexes of the primary bit vector.
      failedHashValues = bitVector.zeroIntIndexes();
    } else {
      // Deeper levels: map the empty slots through the previous level's
      // failed-index table.
      failedHashValues = new List(failedKeyCount);
      int k = 0;
      for (int i = 0; i < bitVector.size; i++) {
        if (!bitVector.get(i))
          failedHashValues[k++] = indexes[currentLevel - 1].failedIndexes[i];
      }
    }
    indexes.add(new _HashIndexes(keyAmount, buckets.length, hashSeedArray, failedHashValues));
    // Recurse for the keys that could not be placed at this level.
    calculateIndexes(nextLevelBuckets, failedKeyCount, indexes);
  }
}
/** A bucket of keys, stored as a small growable array of key indexes. */
class _Bucket implements Comparable {
  int id;
  List<int> itemIndexes = new List();

  _Bucket(this.id);

  void add(int i) {
    itemIndexes.add(i);
  }

  /** Orders buckets by how many key indexes they hold. */
  int compareTo(_Bucket o) =>
      itemIndexes.length.compareTo(o.itemIndexes.length);
}
/** A fixed-size bit vector backed by 32-bit words. */
class _FixedBitVector {
  List<int> _words;
  int size;
  // for fast modulo 32 calculation
  static final int mod32Mask = 0x1F;
  List<int> _setMasks = new List(32);
  List<int> _resetMasks = new List(32);
  // Masks keeping the low (i+1) bits; not used by this file's callers.
  List<int> _cutMasks = new List(32);

  _initialize(int initialCapacity) {
    size = initialCapacity;
    // One word per bit over-allocates (only the first (size + 31) >> 5
    // words are ever touched), but it is safe and matches prior behavior.
    _words = new List()..insertRange(0, initialCapacity, 0);
    for (int i = 0; i < 32; i++) {
      _setMasks[i] = 0x1 << i;
      _resetMasks[i] = ~_setMasks[i];
      _cutMasks[i] = ~(0xfffffffe << i);
    }
  }

  /** Creates a fixed bit vector with capacity of [bitCount]. */
  _FixedBitVector.bitCount(int bitCount) {
    _initialize(bitCount);
  }

  // BUG FIX: the word index must be n >> 5 for 32-bit words. The previous
  // n >> 6 (a leftover from a 64-bit-word implementation) made distinct
  // bits n and n ^ 32 share the same word slot AND the same mask, so
  // setting one bit falsely marked the other as occupied.
  bool get(int n) => (_words[n >> 5] & _setMasks[n & mod32Mask]) != 0;

  void set(int n) {
    _words[n >> 5] |= _setMasks[n & mod32Mask];
  }

  void clear(int n) {
    _words[n >> 5] &= _resetMasks[n & mod32Mask];
  }

  /** creates an array containing 0 bit indexes. */
  List<int> zeroIntIndexes() {
    var zeroIndexes = new List<int>();
    for (int i = 0; i < size; i++) {
      if (!get(i))
        zeroIndexes.add(i);
    }
    return zeroIndexes;
  }
}
/** Supplies keys (each a List<int>) to the MPHF generator by index. */
class KeyProvider {
  List<List<int>> list = new List();

  KeyProvider(this.list);

  /** Builds a provider from strings, using their character codes as keys. */
  KeyProvider.fromStrings(List<String> vals) {
    for (int i = 0; i < vals.length; i++) {
      list.add(vals[i].charCodes());
    }
  }

  List<int> getKey(int index) => list[index];

  int keyAmount() => list.length;
}
/** Demo: builds an MPHF over a few Turkish words and prints each key's hash value. */
void main() {
  var keys = ["elma","armut","turp","kiraz","karpuz", "kavun"];
  var hash = new Mphf.generate(new KeyProvider.fromStrings(keys));
  keys.forEach((key) {
    print("for key [$key] hash value is: ${hash.get(key.charCodes())}");
  });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment