Skip to content

Instantly share code, notes, and snippets.

@musoftware
Last active August 16, 2017 17:33
Show Gist options
  • Save musoftware/114af452665309c7070aba4aa82447c1 to your computer and use it in GitHub Desktop.
Save musoftware/114af452665309c7070aba4aa82447c1 to your computer and use it in GitHub Desktop.
C# BigHashSet Class Greater than 2Gb.
public class BigHashSet<T> : IEnumerable<T>
{
private const ulong StartSize = 1 << 19;
private BigArray<ulong> _hashBuckets;
private ulong _hashCode;
private BigArray<Slot> _slots;
public ulong Count { get; private set; }
public ulong Length {get{
return _hashBuckets.Length;
}}
public ulong DuplicateAddAttempts { get; private set; }
public ulong Resizes { get; private set; }
public ulong LastFindItemComparisons { get; private set; }
public float GrowthDelta = 2.0f;
public bool IsDirty {
get{
return Count > GetTrueCount() || Count < _hashBuckets.Length;
}
}
public T this[ulong index]
{
get
{
if (index > Count)
throw new Exception("Getter: Index out of bounds " + index + " must be less than " + Count);
return _slots[index].value;
}
}
public BigHashSet(ulong size = 0)
{
Initialize(size);
}
public BigHashSet(IEnumerable<T> collection)
{
if (collection == null)
Initialize();
else
{
var coll = collection as ICollection<T>;
if (coll == null)
{
Initialize();
return;
}
Initialize((ulong) coll.Count);
foreach (var item in coll)
Add(item);
}
}
IEnumerator<T> IEnumerable<T>.GetEnumerator()
{
return new Enumerator(this);
}
IEnumerator IEnumerable.GetEnumerator()
{
return new Enumerator(this);
}
public Enumerator GetEnumerator()
{
return new Enumerator(this);
}
public bool Add(T item)
{
return Insert(item);
}
public bool AddRange(IEnumerable<T> collection)
{
var coll = collection as ICollection<T>;
if (coll == null)
return false;
foreach (var item in coll)
Add(item);
return true;
}
public bool Contains(T item)
{
return FindItem(item) != ulong.MaxValue;
}
public bool ContainsAllElements(IEnumerable<T> elemcol)
{
return elemcol.All(Contains);
}
public void Clear()
{
if (Count == 0) return;
Initialize();
}
public void Release()
{
if (Count == 0) return;
Clear();
Compact();
}
public void Compact()
{
if (Count == 0) return;
var tc = GetTrueCount();
var tset = new BigHashSet<T>(tc);
for (ulong i = 0; i < tc; i++)
if (_slots[i].hashCode > 0)
tset.Add(_slots[i].value);
Count = tc;
_slots = tset._slots;
_hashBuckets = tset._hashBuckets;
}
private bool Insert(T item)
{
if (ItemExistsGenerateHashCode(item) == ulong.MaxValue)
{
try
{
if (Count >= _slots.Length)
Resize();
}
catch (Exception e)
{
throw new Exception("Error resizing hash bucket list " + e.Message);
}
var hashPos = _hashCode % _hashBuckets.Length;
var nextPos = _hashBuckets[hashPos];
_hashBuckets[hashPos] = Count;
var news = new Slot();
news.next = nextPos;
news.value = item;
news.hashCode = _hashCode;
_slots[Count] = news;
Count++;
return true;
}
DuplicateAddAttempts++;
return false;
}
private void Resize(ulong newsize = 0)
{
Resizes++;
if (newsize == 0)
newsize = GetNewSize();
var newhashebuckets = new BigArray<ulong>(newsize);
var newslots = new BigArray<Slot>(newsize);
for (ulong i = 0; i < _slots.Length; i++)
newslots[i] = _slots[i];
for (ulong i = 0; i < newhashebuckets.Length; i++)
newhashebuckets[i] = ulong.MaxValue;
for (ulong i = 0; i < Count; i++)
{
var pos = newslots[i].hashCode % newsize;
var prevpos = newhashebuckets[pos];
newhashebuckets[pos] = i;
if (prevpos != ulong.MaxValue)
{
var news = new Slot();
news.next = prevpos;
news.value = newslots[i].value;
news.hashCode = newslots[i].hashCode;
newslots[i] = news;
}
}
_hashBuckets = newhashebuckets;
_slots = newslots;
}
private ulong GetNewSize()
{
return (ulong) (Count * GrowthDelta);
}
public ulong FindItem(T item)
{
var hashCode = (uint) item.GetHashCode();
LastFindItemComparisons = 0;
for (var position = _hashBuckets[hashCode % _hashBuckets.Length]; position != ulong.MaxValue; position = _slots[position].next)
{
LastFindItemComparisons++;
if (_slots[position].hashCode == hashCode && _slots[position].value.Equals(item))
return position;
}
return ulong.MaxValue;
}
private ulong ItemExistsGenerateHashCode(T item)
{
_hashCode = (uint) item.GetHashCode();
for (var position = _hashBuckets[_hashCode % _hashBuckets.Length]; position != ulong.MaxValue; position = _slots[position].next)
if (_slots[position].hashCode == _hashCode && _slots[position].value.Equals(item))
return position;
return ulong.MaxValue;
}
private void Initialize(ulong size = 0)
{
if (size == 0)
size = StartSize;
_hashBuckets = new BigArray<ulong>(size);
_slots = new BigArray<Slot>(size);
Count = 0;
for (ulong i = 0; i < _hashBuckets.Length; i++)
_hashBuckets[i] = ulong.MaxValue;
}
public void CopyTo(T[] array)
{
CopyTo(array, 0, Count);
}
public void CopyTo(T[] array, ulong arrayIndex, ulong count)
{
ulong Copied = 0;
for (ulong i = 0; i < Count && Copied < count; i++)
if (_slots[i].hashCode > 0)
array[arrayIndex + Copied++] = _slots[i].value;
}
private ulong GetTrueCount()
{
ulong tCount = 0;
for (ulong i = 0; i < Count; i++)
if (_slots[i].hashCode > 0)
tCount++;
return tCount;
}
public T[] ToArray()
{
var newArray = new T[GetTrueCount()];
CopyTo(newArray);
return newArray;
}
private struct Slot
{
public ulong hashCode;
public ulong next;
public T value;
}
[Serializable]
public struct Enumerator : IEnumerator<T>
{
private readonly BigHashSet<T> set;
private ulong index;
public T Current { get; private set; }
object IEnumerator.Current
{
get
{
if (index == 0 || index == set.Count + 1)
throw new InvalidOperationException("Enumerator out of range: " + index);
return Current;
}
}
internal Enumerator(BigHashSet<T> set)
{
this.set = set;
index = 0;
Current = default(T);
}
public void Dispose()
{
}
public bool MoveNext()
{
if (index < set.Count)
{
Current = set._slots[index].value;
index++;
return true;
}
index = set.Count + 1;
Current = default(T);
return false;
}
void IEnumerator.Reset()
{
index = 0;
Current = default(T);
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment