Skip to content

Instantly share code, notes, and snippets.

@sixeyed
Last active August 29, 2015 14:05
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sixeyed/5a1a9edac96132ade4ec to your computer and use it in GitHub Desktop.
Save sixeyed/5a1a9edac96132ade4ec to your computer and use it in GitHub Desktop.
MD5 hash distribution in C# - allocating to 1/200 for 0.5% splits
//LinqPad
File.Delete("c:\\md5-list.txt");
var hashes = new List<string>();
var bytes = new byte[16];
using (var rng = new RNGCryptoServiceProvider())
{
for (int i=0; i<500000; i++)
{
rng.GetBytes(bytes);
var md5 = BitConverter.ToString(bytes).Replace("-", "").ToLower();
hashes.Add(md5);
}
}
File.WriteAllLines("c:\\md5-list.txt", hashes.ToArray());
// from:
// http://www.codeproject.com/Articles/21508/Distributed-Caching-Using-a-Hash-Algorithm
public class KeyDistributor<TKey>
{
private int _numBuckets;
private int _pageSize;
private SHA1 _sha;
public int CalculateBucketIndex(TKey key)
{
if (_numBuckets == 1)
{
return 0;
}
// Create a SHA1 hash of the provided key
byte[] result;
byte[] data = BitConverter.GetBytes(key.GetHashCode());
result = _sha.ComputeHash(data);
// Split the hash into 4 integer numbers
uint n1 = BitConverter.ToUInt32(result, 0);
uint n2 = BitConverter.ToUInt32(result, 4);
uint n3 = BitConverter.ToUInt32(result, 8);
uint n4 = BitConverter.ToUInt32(result, 12);
uint n5 = BitConverter.ToUInt32(result, 16);
// Apply XOR to derive a combined number
uint index = n1 ^ n2 ^ n3 ^ n4 ^ n5;
// Calculate the bucket index
int bucketIndex = Convert.ToInt32(Math.Floor((double)(index / _pageSize)));
return bucketIndex;
}
public KeyDistributor(int numBuckets)
{
_numBuckets = numBuckets;
if (_numBuckets > 1)
{
// Based on the number of buckets calculate the page size.
// It will be to link a given key to a specific bucket
_pageSize = Convert.ToInt32
(Math.Ceiling((double)(uint.MaxValue / numBuckets)));
_sha = new SHA1Managed();
}
}
}
void Main()
{
var hashes = new List<string>(File.ReadAllLines("c:\\md5-list.txt"));
int bucketCount = 200;
//init buckets:
var buckets = new int[bucketCount];
for (int i=0; i<bucketCount; i++)
{
buckets[i] = 0;
}
var distributor = new KeyDistributor<string>(bucketCount);
//distribute hashes:
foreach (var hash in hashes)
{
var bucketIndex = distributor.CalculateBucketIndex(hash);
buckets[bucketIndex] = buckets[bucketIndex]+1;
}
//show distribution
for(int i=0; i<bucketCount; i++)
{
string.Format("Bucket: {0}, count: {1}", i, buckets[i]).Dump();
}
//pick some samples:
string.Format("Hash: {0}, bucket: {1}", hashes[501], distributor.CalculateBucketIndex(hashes[501])).Dump();
string.Format("Hash: {0}, bucket: {1}", hashes[864], distributor.CalculateBucketIndex(hashes[864])).Dump();
string.Format("Hash: {0}, bucket: {1}", hashes[2541], distributor.CalculateBucketIndex(hashes[2541])).Dump();
string.Format("Hash: {0}, bucket: {1}", hashes[10876], distributor.CalculateBucketIndex(hashes[10876])).Dump();
string.Format("Hash: {0}, bucket: {1}", hashes[450010], distributor.CalculateBucketIndex(hashes[450010])).Dump();
}
@sixeyed
Copy link
Author

sixeyed commented Aug 8, 2014

Sample use of Ivan Latunov's KeyDistributor to distribute 500K MD5 hashes among 200 buckets. Allows allocation of a single hash to a bucket which is 0.5% of the population.

Run GenerateMD5Hashes.linq in LinqPad to generate the hashes and write them to a file (~2 seconds); then repeatedly run TestDistribution.linq (~1 second per run).

You should find the buckets have ~2,500 entries in each, and on repeated runs the output should be the same.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment