@coxsim (created May 5, 2011)
Multi-level dictionary to avoid allocations on the LOH (large object heap)
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Runtime.ConstrainedExecution;
using System.Security;
using C5Lib = C5;
namespace Collections
{
public static class LargeDictionary
{
private const int LOH_THRESHOLD_IN_BYTES = 85000;
private static int DetermineMaximumDictionaryEntries()
{
// It turns out it's actually impossible (by design) to determine the size of a struct
// such as Dictionary's internal entry. The best we can do is to assume the largest
// plausible size and just accept the wasted space.
// (See http://stackoverflow.com/questions/3361986/how-to-check-the-number-of-bytes-consumed-by-my-structure#3362736)
//
const int dictEntrySizeInBytes = 32;
var realMaximumCount = (int)Math.Floor(LOH_THRESHOLD_IN_BYTES / (double)dictEntrySizeInBytes);
// find the next lowest prime to the number of entries
//
return LowerPrime(realMaximumCount);
}
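// Worked example (illustrative, assuming the 32-byte entry size above): the LOH threshold
// gives floor(85000 / 32) = 2656 candidate entries, and the largest prime below 2656 is
// 2647, so MAX_DICTIONARY_SIZE works out to 2647 under that assumption.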
/// <summary>Get the largest prime that is strictly lower than <code>num</code>.</summary>
private static int LowerPrime(int num)
{
var nextLowestPrime = num-1;
while (!HashHelpers.IsPrime(nextLowestPrime))
nextLowestPrime--;
return nextLowestPrime;
}
public static readonly int MAX_DICTIONARY_SIZE = DetermineMaximumDictionaryEntries();
public static readonly int HALF_MAX_DICTIONARY_SIZE = (int)Math.Floor(MAX_DICTIONARY_SIZE / 2d);
public static readonly int PRIME_BELOW_HALF_MAX_DICTIONARY_SIZE = LowerPrime(HALF_MAX_DICTIONARY_SIZE);
public static readonly int PRIME_ABOVE_HALF_MAX_DICTIONARY_SIZE = HashHelpers.GetPrime(HALF_MAX_DICTIONARY_SIZE);
public static IDictionary<TKey, TValue> CreateDictionary<TKey, TValue>(int expectedCount)
{
return new ConsistentHashLargeDictionary<TKey, TValue>(expectedCount);
}
public static IDictionary<TKey, TValue> CreateDictionary<TKey, TValue>(int expectedCount, IEqualityComparer<TKey> equalityComparer)
{
return new ConsistentHashLargeDictionary<TKey, TValue>(expectedCount, equalityComparer);
}
}
/// <summary>
/// Dictionary that uses consistent hashing to maintain a set of sub-dictionaries, none of
/// which should ever grow large enough to cause allocations on the large object heap.
/// </summary>
/// <typeparam name="TKey"></typeparam>
/// <typeparam name="TValue"></typeparam>
/// <remarks>
/// Consistent hashing is used to reduce the number of nodes to rehash when the number of
/// dictionaries needs to change.
///
/// All values are stored in buckets spread evenly (to begin with) at points around a circle.
/// The points on the circle are all the integers between <code>MIN_HASH</code> and
/// <code>MAX_HASH</code>. The hash-code of a key represents a point on a circle. The key's
/// value is stored in the closest previous bucket around the circle.
///
/// See http://en.wikipedia.org/wiki/Consistent_hashing
/// </remarks>
public class ConsistentHashLargeDictionary<TKey, TValue> : IDictionary<TKey, TValue>
{
private const int MIN_HASH = int.MinValue;
private const int MAX_HASH = int.MaxValue;
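// Illustrative example of the bucket placement described in the remarks above: if the
// circle has nodes at the points { int.MinValue, 0 }, a key hashing to 12345 is stored
// in the node at 0 (its weak predecessor on the circle), while a key hashing to -500 is
// stored in the node at int.MinValue.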
private readonly IEqualityComparer<TKey> equalityComparer;
private readonly C5Lib.TreeDictionary<int, IDictionary<TKey, TValue>> circle = new C5Lib.TreeDictionary<int, IDictionary<TKey, TValue>>();
private C5Lib.KeyValuePair<int, IDictionary<TKey, TValue>> firstNode;
public ConsistentHashLargeDictionary(int initialCapacity) : this(initialCapacity, EqualityComparer<TKey>.Default)
{
}
private static int TryGetCollectionCount<T>(IEnumerable<T> enumerable)
{
var collection = enumerable as ICollection<T>;
return (collection != null) ? collection.Count : 0;
}
public ConsistentHashLargeDictionary(IEnumerable<KeyValuePair<TKey, TValue>> dictionary, IEqualityComparer<TKey> equalityComparer)
: this(TryGetCollectionCount(dictionary), equalityComparer)
{
foreach(var item in dictionary)
{
Add(item.Key, item.Value);
}
}
public ConsistentHashLargeDictionary(int initialCapacity, IEqualityComparer<TKey> equalityComparer)
{
this.equalityComparer = equalityComparer;
var nodeCount = initialCapacity > 0
? (int)Math.Ceiling(initialCapacity / (double)LargeDictionary.MAX_DICTIONARY_SIZE)
: 1;
// split the circle into 'nodeCount' dictionaries spread equally apart
//
// optimisation for small dictionaries: just create the single node at the minimum point
// on the circle
//
if (nodeCount == 1)
{
circle[MIN_HASH] = new Dictionary<TKey, TValue>(initialCapacity, equalityComparer);
}
else
{
// for larger dictionaries, create the points on the circle, but leave the creation
// of each dictionary until we actually need to add to it
//
var hashGap = (MAX_HASH - (long)MIN_HASH)/nodeCount;
// need to use a long for the hash to avoid wrapping around on the last gap. It's safe
// to cast as the hash is always bounded by MIN_HASH and MAX_HASH, which are ints.
//
for (long hash = MIN_HASH; hash < MAX_HASH; hash += hashGap)
{
circle[(int) hash] = null;
}
}
firstNode = circle.First();
}
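// Worked example (illustrative numbers): with initialCapacity = 10000 and
// MAX_DICTIONARY_SIZE = 2647, nodeCount = ceil(10000 / 2647) = 4, so hashGap is roughly
// 2^30 and empty points are created starting at int.MinValue and stepping by hashGap,
// each one populated lazily by the first Add that lands in it.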
private C5Lib.KeyValuePair<int, IDictionary<TKey, TValue>> GetNode(TKey key)
{
// optimisation for small dictionaries
//
if (circle.Count == 1)
return firstNode;
var hash = equalityComparer.GetHashCode(key);
return GetNode(hash);
}
/// <summary>
/// Get the node that may hold a key.
/// </summary>
/// <param name="key">key to locate node for</param>
/// <returns>node that might hold the key</returns>
/// <remarks>This will not create any nodes on the circle.</remarks>
private C5Lib.KeyValuePair<int, IDictionary<TKey, TValue>> GetNode(int hash)
{
// optimisation for small dictionaries
//
if (circle.Count == 1)
return firstNode;
C5Lib.KeyValuePair<int, IDictionary<TKey, TValue>> node;
if (!circle.TryWeakPredecessor(hash, out node))
{
// this means something very bad has happened as there should always be a node at the
// lowest point on the circle
//
throw new Exception("Root node is missing!");
}
return node;
}
/// <summary>
/// Get the dictionary that may hold a key.
/// </summary>
/// <param name="key">key to locate dictionary for</param>
/// <returns>dictionary that might hold the key</returns>
/// <remarks>This will not create any nodes on the circle.</remarks>
private IDictionary<TKey, TValue> GetDictionary(TKey key)
{
return GetNode(key).Value;
}
private void Add(TKey key, TValue value, bool adding)
{
var hash = equalityComparer.GetHashCode(key);
var node = GetNode(hash);
if (node.Value == null)
{
// this is the first entry to land in this bucket, so create its sub-dictionary now
//
CreateSubDictionary(node.Key, key, value);
return;
}
var dictionary = node.Value;
// If we started as a 'small' dictionary then the capacity will not be the maximum.
// (For dictionaries with more than one node, i.e. 'large' dictionaries, the individual
// dictionaries will always be at maximum capacity).
//
// So, this checks if the Add will cause the smaller dictionary's capacity to go beyond
// the maximum size.
//
if (circle.Count == 1 && dictionary.Count == LargeDictionary.PRIME_ABOVE_HALF_MAX_DICTIONARY_SIZE)
{
// In this case, we need to take the hit and replace the dictionary with one at the
// maximum capacity. This is because Dictionary's own capacity doubling would take us
// over the maximum at this point (count == prime above half the maximum capacity).
//
var newDictionary = new Dictionary<TKey, TValue>(LargeDictionary.MAX_DICTIONARY_SIZE, equalityComparer);
foreach (var entry in dictionary)
newDictionary.Add(entry.Key, entry.Value);
circle[node.Key] = newDictionary;
if (node.Key == MIN_HASH)
firstNode.Value = newDictionary;
dictionary = newDictionary;
}
else
{
// We can assume here that the dictionary's capacity is at the maximum, either
// because it is part of a 'large' dictionary where all the sub dictionaries
// started off as maximum, or because it was once a 'small' dictionary that got
// replaced with a maximum capacity dictionary above.
//
// Now check if the Add would cause an already maximum capacity dictionary to be expanded.
//
// Check whether the maximum size has been reached and, if so, split the node.
while (dictionary.Count >= LargeDictionary.MAX_DICTIONARY_SIZE)
{
// If so, keep splitting the offending node until there's space. If the hash codes
// are well distributed this should only happen once. However, if all of the entries
// in the offending dictionary have hash codes in the lower half of the split, the
// node will be split again.
//
var newNode = SplitNode(node);
if (hash >= newNode.Key)
{
node = newNode;
}
else
{
// SplitNode may have replaced the dictionary registered at node.Key (when every
// entry moved to the new node), so re-read it rather than keep a stale reference.
node = new C5Lib.KeyValuePair<int, IDictionary<TKey, TValue>>(node.Key, circle[node.Key]);
}
dictionary = node.Value;
}
}
if (adding)
dictionary.Add(key, value);
else
dictionary[key] = value;
Debug.Assert(dictionary.Count <= LargeDictionary.MAX_DICTIONARY_SIZE);
}
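// Worked example of the small-dictionary case above (illustrative numbers): with
// MAX_DICTIONARY_SIZE = 2647, PRIME_ABOVE_HALF_MAX_DICTIONARY_SIZE is 1327; once a
// single-node dictionary reaches 1327 entries, letting Dictionary double itself would
// jump to a capacity beyond 2647, so it is swapped for a 2647-capacity dictionary instead.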
/// <summary>Create a new sub dictionary for a single entry.</summary>
private void CreateSubDictionary(int circleKey, TKey initialKey, TValue initialValue)
{
var newDictionary = new Dictionary<TKey, TValue>(LargeDictionary.MAX_DICTIONARY_SIZE, equalityComparer) { { initialKey, initialValue } };
circle[circleKey] = newDictionary;
if (circleKey == MIN_HASH)
firstNode.Value = newDictionary;
}
/// <summary>
/// Splits the dictionary at the given node.
/// </summary>
/// <returns>Dictionary at the new node</returns>
private C5Lib.KeyValuePair<int, IDictionary<TKey, TValue>> SplitNode(C5Lib.KeyValuePair<int, IDictionary<TKey, TValue>> node)
{
C5Lib.KeyValuePair<int, IDictionary<TKey, TValue>> nextNode;
var nextHash = circle.TrySuccessor(node.Key, out nextNode) ? nextNode.Key : MAX_HASH;
var midHash = node.Key + (int)((nextHash - (long)node.Key) / 2);
Debug.Assert(node.Key < nextHash && node.Key < midHash && midHash < nextHash);
if (circle.Contains(midHash))
{
throw new Exception("Run out of nodes. Hash-code is not evenly distributed enough.");
}
// now take (ideally half) the keys from the old node and insert them into the new node
//
var dictionary = node.Value;
var entriesToMove = dictionary.Where(kv => equalityComparer.GetHashCode(kv.Key) >= midHash).ToList();
var emptyDictionary = new Dictionary<TKey, TValue>(LargeDictionary.MAX_DICTIONARY_SIZE, equalityComparer);
IDictionary<TKey, TValue> dictionaryForMidNode;
if (entriesToMove.Count == dictionary.Count)
{
// optimisation where *all* of the keys need moving
//
circle[node.Key] = emptyDictionary;
circle[midHash] = dictionary;
dictionaryForMidNode = dictionary;
}
else
{
foreach (var entryToMove in entriesToMove)
{
var removed = dictionary.Remove(entryToMove.Key);
Debug.Assert(removed);
emptyDictionary.Add(entryToMove.Key, entryToMove.Value);
}
circle[midHash] = emptyDictionary;
dictionaryForMidNode = emptyDictionary;
}
return new C5Lib.KeyValuePair<int, IDictionary<TKey, TValue>>(midHash, dictionaryForMidNode);
}
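// Worked example (illustrative numbers): splitting a full node at point 0 whose successor
// sits at 2^30 puts the new node at midHash = 2^29; every entry whose key hashes to 2^29
// or above moves to the new node, and entries hashing below 2^29 stay where they are.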
#region Implementation of IDictionary<TKey,TValue>
public bool ContainsKey(TKey key)
{
// ReSharper disable CompareNonConstrainedGenericWithNull
Debug.Assert(key != null);
// ReSharper restore CompareNonConstrainedGenericWithNull
var dictionary = GetDictionary(key);
return dictionary != null && dictionary.ContainsKey(key);
}
public void Add(TKey key, TValue value)
{
Add(key, value, true);
}
public bool Remove(TKey key)
{
var dictionary = GetDictionary(key);
return dictionary != null && dictionary.Remove(key);
}
public bool TryGetValue(TKey key, out TValue value)
{
var dictionary = GetDictionary(key);
if (dictionary == null)
{
value = default(TValue);
return false;
}
return dictionary.TryGetValue(key, out value);
}
public TValue this[TKey key]
{
get
{
var dictionary = GetDictionary(key);
if (dictionary == null)
throw new KeyNotFoundException();
return dictionary[key];
}
set
{
Add(key, value, false);
}
}
public ICollection<TKey> Keys
{
get
{
return circle.Where(kvp => kvp.Value != null).SelectMany(kvp => kvp.Value.Keys).ToList();
}
}
public ICollection<TValue> Values
{
get
{
return circle.Where(kvp => kvp.Value != null).SelectMany(kvp => kvp.Value.Values).ToList();
}
}
#endregion
#region Implementation of IEnumerable
public IEnumerator<KeyValuePair<TKey, TValue>> GetEnumerator()
{
return circle.Where(kvp => kvp.Value != null).SelectMany(kvp => kvp.Value).GetEnumerator();
}
IEnumerator IEnumerable.GetEnumerator()
{
return GetEnumerator();
}
#endregion
#region Implementation of ICollection<KeyValuePair<TKey,TValue>>
public void Add(KeyValuePair<TKey, TValue> item)
{
Add(item.Key, item.Value);
}
public void Clear()
{
foreach (var dict in circle.Values.Where(dict => dict != null))
dict.Clear();
}
public bool Contains(KeyValuePair<TKey, TValue> item)
{
return ContainsKey(item.Key);
}
public void CopyTo(KeyValuePair<TKey, TValue>[] array, int arrayIndex)
{
foreach (var dict in circle.Where(kvp => kvp.Value != null).Select(kvp => kvp.Value))
{
dict.CopyTo(array, arrayIndex);
arrayIndex += dict.Count;
}
}
public bool Remove(KeyValuePair<TKey, TValue> item)
{
return Remove(item.Key);
}
internal int PartitionCount
{
get { return circle.Count; }
}
public int Count
{
get { return circle.Where(kvp => kvp.Value != null).Sum(kvp => kvp.Value.Count); }
}
public bool IsReadOnly
{
get { return false; }
}
#endregion
}
// Copied directly from disassembled System.Collections.HashHelpers
internal static class HashHelpers
{
internal static readonly int[] PRIMES;
[SecuritySafeCritical, ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
internal static bool IsPrime(int candidate)
{
if ((candidate & 1) != 0)
{
var num = (int)Math.Sqrt(candidate);
for (var i = 3; i <= num; i += 2)
{
if (candidate % i == 0)
{
return false;
}
}
return true;
}
return candidate == 2;
}
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
internal static int GetPrime(int min)
{
if (min < 0)
{
throw new ArgumentException("Arg_HTCapacityOverflow");
}
foreach (var num in PRIMES.Where(num => num >= min))
{
return num;
}
for (var j = min | 1; j < 2147483647; j += 2)
{
if (IsPrime(j))
{
return j;
}
}
return min;
}
static HashHelpers()
{
PRIMES = new int[]
{
3,
7,
11,
17,
23,
29,
37,
47,
59,
71,
89,
107,
131,
163,
197,
239,
293,
353,
431,
521,
631,
761,
919,
1103,
1327,
1597,
1931,
2333,
2801,
3371,
4049,
4861,
5839,
7013,
8419,
10103,
12143,
14591,
17519,
21023,
25229,
30293,
36353,
43627,
52361,
62851,
75431,
90523,
108631,
130363,
156437,
187751,
225307,
270371,
324449,
389357,
467237,
560689,
672827,
807403,
968897,
1162687,
1395263,
1674319,
2009191,
2411033,
2893249,
3471899,
4166287,
4999559,
5999471,
7199369
};
}
}
}
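Usage is the same as any IDictionary<TKey, TValue>; only construction goes through the LargeDictionary factory. A minimal, self-contained sketch (the long/string key and value types and the five-million expected count are illustrative choices, not taken from the gist):

using System;
using System.Collections.Generic;
using Collections;

class LargeDictionaryExample
{
    static void Main()
    {
        // An expected count large enough that a single Dictionary<long, string> would
        // otherwise place its internal bucket/entry arrays on the large object heap.
        IDictionary<long, string> dict = LargeDictionary.CreateDictionary<long, string>(5000000);

        for (long i = 0; i < 5000000; i++)
            dict[i] = "value " + i;

        string value;
        if (dict.TryGetValue(42L, out value))
            Console.WriteLine(value); // prints "value 42"
    }
}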
@manoj-mittal1979 commented:

Hi,
Do you have a 64-bit implementation of LargeDictionary as well?

Thanks,
Manoj.
