Skip to content

Instantly share code, notes, and snippets.

@prateek
Created December 17, 2022 01:37
Show Gist options
  • Save prateek/3ea7d540f1cb751bad44f6cab5338996 to your computer and use it in GitHub Desktop.
Save prateek/3ea7d540f1cb751bad44f6cab5338996 to your computer and use it in GitHub Desktop.
TestKeysDistribution demonstrates why you have to be careful when picking hash functions while double hashing.
package main
import (
"fmt"
"hash/crc32"
"math"
"testing"
)
/*
TestKeysDistribution demonstrates why you have to be careful when picking
hash functions while double hashing.
❯ GO111MODULE=off go run . -test.v
=== RUN TestKeysDistribution
=== RUN TestKeysDistribution/different_hash_functions
=== RUN TestKeysDistribution/same_hash_functions
main.go:51: instance 0 used 8192 out 16384 hash slots
main.go:59: instance 0 had 61.035400 keys/slot which is not within 10% of 30.517578
main.go:51: instance 1 used 8192 out 16384 hash slots
main.go:59: instance 1 had 61.034912 keys/slot which is not within 10% of 30.517578
--- FAIL: TestKeysDistribution (0.18s)
--- PASS: TestKeysDistribution/different_hash_functions (0.09s)
--- FAIL: TestKeysDistribution/same_hash_functions (0.09s)
FAIL
exit status 1
*/
const (
_numTestKeys = 1000 * 1000
_numRedisInstances = 2
_numHashSlots = 16384
)
func TestKeysDistribution(t *testing.T) {
for _, tt := range []struct {
name string
pickInstanceHashFn func(key []byte) uint32
pickSlotHashFn func(key []byte) uint32
}{
{"different hash functions", crc32.ChecksumIEEE, _crc32Castagnoli},
{"same hash functions", crc32.ChecksumIEEE, crc32.ChecksumIEEE},
} {
t.Run(tt.name, func(t *testing.T) {
instanceDispersionCounts := make(map[int]*dispersionCounter)
for i := 0; i < _numRedisInstances; i++ {
instanceDispersionCounts[i] = newDispersionCounter(_numHashSlots)
}
for _, key := range _randomKeys {
i := int(tt.pickInstanceHashFn([]byte(key)) % _numRedisInstances)
slot := tt.pickSlotHashFn([]byte(key)) % _numHashSlots
instanceDispersionCounts[i].incr(slot)
}
// assert properties on the distribution
for i, instance := range instanceDispersionCounts {
summary := instance.summary()
// each slot in each instance must be used
if slotsUsed := int(summary.count); slotsUsed < _numHashSlots {
t.Errorf("instance %d used %d out %d hash slots", i, slotsUsed, _numHashSlots)
}
// if we have a uniform distribution, we expect to divide the keys roughly equally
expectedNumKeysPerSlot := float64(_numTestKeys) / float64(_numHashSlots) / float64(_numRedisInstances)
// assert we're within 10% of fully uniform distribution
if summary.mean < 0.9*expectedNumKeysPerSlot || summary.mean > 1.1*expectedNumKeysPerSlot {
t.Errorf("instance %d had %f keys/slot which is not within 10%% of %f", i, summary.mean, expectedNumKeysPerSlot)
}
}
})
}
}
type dispersionCounter struct {
numSlots uint32
counts map[uint32]int
}
func newDispersionCounter(numSlots uint32) *dispersionCounter {
return &dispersionCounter{
numSlots: numSlots,
counts: make(map[uint32]int, numSlots),
}
}
func (d *dispersionCounter) incr(slot uint32) {
if slot >= d.numSlots {
panic("slot >= numSlots")
}
d.counts[slot] = d.counts[slot] + 1
}
func (d *dispersionCounter) summary() summaryStats {
summary := summaryStats{
max: -1 * math.MaxFloat64,
min: math.MaxFloat64,
count: 0,
}
sum := 0.0
for _, count := range d.counts {
c := float64(count)
summary.max = math.Max(summary.max, c)
summary.min = math.Min(summary.min, c)
summary.count = summary.count + 1
sum += c
}
if summary.count > 0 {
summary.mean = sum / summary.count
}
return summary
}
type summaryStats struct {
count float64
mean float64
max float64
min float64
}
func newRandomKeys(num int) [][]byte {
ret := make([][]byte, 0, num)
for i := 0; i < num; i++ {
ret = append(ret, []byte(fmt.Sprintf("random-string-id-%d", i)))
}
return ret
}
var (
_randomKeys = newRandomKeys(_numTestKeys)
_castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
_crc32Castagnoli = func(b []byte) uint32 {
return crc32.Checksum(b, _castagnoliTable)
}
)
func matchString(a, b string) (bool, error) {
return a == b, nil
}
func main() {
testSuite := []testing.InternalTest{
{
Name: "TestKeysDistribution",
F: TestKeysDistribution,
},
}
testing.Main(matchString, testSuite, nil, nil)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment