Skip to content

Instantly share code, notes, and snippets.

@anatolebeuzon
Last active February 13, 2022 11:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anatolebeuzon/e25d0ddf9884e294e55a1aa46f9c9269 to your computer and use it in GitHub Desktop.
Save anatolebeuzon/e25d0ddf9884e294e55a1aa46f9c9269 to your computer and use it in GitHub Desktop.
HyperLogLog large range correction bias

This is an experimental tool to evaluate the quality of the estimations produced by the HyperLogLog algorithm described in the Flajolet et al. paper, with and without the large range correction (§4 of the paper).

This correction is only enabled for cardinalities beyond 143M, so any input below that will return the same result twice.

Usage

go mod tidy
go run main.go -count 3000000000

Sample results

$ go run main.go -card 150000000 # 150M
Without correction: 159441109 (inaccuracy: +6%)
With correction:    162475901 (inaccuracy: +8%)

$ go run main.go -card 500000000 # 500M
Without correction: 532421332 (inaccuracy: +6%)
With correction:    568430738 (inaccuracy: +14%)

$ go run main.go -card 1000000000 # 1G
Without correction: 1014882872 (inaccuracy: +1%)
With correction:    1157814863 (inaccuracy: +16%)

$ go run main.go -card 2000000000 # 2G
Without correction: 1908444304 (inaccuracy: -5%)
With correction:    2523750484 (inaccuracy: +26%)

$ go run main.go -card 2500000000 # 2.5G
Without correction: 2347386494 (inaccuracy: -6%)
With correction:    3396700455 (inaccuracy: +36%)

$ go run main.go -card 3000000000 # 3G
Without correction: 2766452874 (inaccuracy: -8%)
With correction:    4437335342 (inaccuracy: +48%)

$ go run main.go -card 4000000000 # 4G
Without correction: 3455803455 (inaccuracy: -14%)
With correction:    7012793615 (inaccuracy: +75%)

$ go run main.go -card 5000000000 # 5G
Without correction: 3948874477  (inaccuracy: -21%)
With correction:    10816841717 (inaccuracy: +116%)

$ go run main.go -card 6000000000 # 6G
Without correction: 4466485931  (inaccuracy: -26%)
With correction:    0           (inaccuracy: -100%) # exceeded the domain of the log function

HLL implementation

The DataDog/hyperloglog implementation is used, with a minor modification to be able to estimate with and without the large range correction:

 	} else if estimate > 1.0/30.0*exp32 {
-		// Large range correction
-		estimate = -exp32 * math.Log(1-estimate/exp32)
+		if withLargeRangeCorrection {
+			estimate = -exp32 * math.Log(1-estimate/exp32)
+		}
 	}
 	return uint64(estimate)
 }
module github.com/DataDog/experimental/users/anatole.beuzon/large-card-hll
go 1.17
require github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316
require (
github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49 // indirect
github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724 // indirect
)
github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316 h1:CTR6ylcEcfvmr6xAJwLfKfud8EtIe5eBrVh5ClYj5ow=
github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316/go.mod h1:hFPkswc42pKhRbeKDKXy05mRi7J1kJ2vMNbvd9erH0M=
github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49 h1:EbzDX8HPk5uE2FsJYxD74QmMw0/3CqSKhEr6teh0ncQ=
github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49/go.mod h1:SvsjzyJlSg0rKsqYgdcFxeEVflx3ZNAyFfkUHP0TxXg=
github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724 h1:1/c0u68+2LRI+XSpduQpV9BnKx1k1P6GTb3MVxCE3w4=
github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724/go.mod h1:pTiKQhUCcxt2eQMAnv48oc5nAsmelPm573z44h6PSXc=
package main
import (
"flag"
"fmt"
"math"
"math/rand"
"os"
"runtime"
"sync"
hll "github.com/DataDog/hyperloglog"
)
func main() {
var target int
flag.IntVar(&target, "card", 0, "target cardinality")
flag.Parse()
if target == 0 {
flag.PrintDefaults()
os.Exit(1)
}
// Map
procs := runtime.GOMAXPROCS(0)
wg := sync.WaitGroup{}
counters := make([]*hll.HyperLogLog, procs)
for i := 0; i < procs; i++ {
wg.Add(1)
go func(i int) {
counters[i] = genCounts(target/procs, i, i == 0)
wg.Done()
}(i)
}
wg.Wait()
// Reduce
finalCounter := newCounter()
for _, c := range counters {
err := finalCounter.Merge(c)
if err != nil {
panic(err)
}
}
resWithoutCorrection := count(finalCounter, false)
resWithCorrection := count(finalCounter, true)
fmt.Printf("Without correction:\t%d\t(inaccuracy: %+.0f%%)\n", resWithoutCorrection, pctDist(target, resWithoutCorrection))
fmt.Printf("With correction:\t%d\t(inaccuracy: %+.0f%%)\n", resWithCorrection, pctDist(target, resWithCorrection))
}
func newCounter() *hll.HyperLogLog {
const power = 10 // will use 2^10 registers
c, err := hll.New(uint(1) << power)
if err != nil {
panic(err)
}
return c
}
// genCounts will generate n random numbers and add them to a counter, then return that counter.
func genCounts(n int, seed int, printProgress bool) *hll.HyperLogLog {
r := rand.New(rand.NewSource(int64(seed)))
c := newCounter()
for i := 0; i < n; i++ {
if printProgress && i%(n/100) == 0 {
fmt.Printf("%d%%\r", 100*i/n)
}
// We MUST use 64-bits inputs so that the domain of the hash function (2**64
// possibilities) is larger than its output domain (2**32), to properly model
// a real application.
//
// Otherwise with hll.Murmur32(r.Uint32()) there would be virtually no hash
// collisions, which is not realistic.
c.Add(hll.Murmur64(r.Uint64()))
}
return c
}
func pctDist(target int, actual uint64) float64 {
return 100 * float64(int64(actual)-int64(target)) / float64(target)
}
// Same as github.com/DataDog/hyperloglog's (*hll.HyperLogLog).Count(), but with an extra
// argument to be able to toggle the large range correction on/off.
func count(h *hll.HyperLogLog, withLargeRangeCorrection bool) uint64 {
exp32 := math.Pow(2, 32)
sum := 0.0
m := float64(h.M)
for _, val := range h.Registers {
sum += 1.0 / math.Pow(2.0, float64(val))
}
estimate := h.Alpha * m * m / sum
if estimate <= 5.0/2.0*m {
// Small range correction
v := 0
for _, r := range h.Registers {
if r == 0 {
v++
}
}
if v > 0 {
estimate = m * math.Log(m/float64(v))
}
} else if estimate > 1.0/30.0*exp32 {
if withLargeRangeCorrection {
estimate = -exp32 * math.Log(1-estimate/exp32)
}
}
return uint64(estimate)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment