anatolebeuzon/README.md

## README.md

      
    Raw
  

              README.md
            
          
    This is an experimental tool to evaluate the quality of the estimations produced by the HyperLogLog algorithm described in the Flajolet et al. paper, with and without the large range correction (§4 of the paper).
This correction is only enabled for cardinalities beyond 143M, so any input below that will return the same result twice.
Usage

go mod tidy
go run main.go -count 3000000000
Sample results

$ go run main.go -card 150000000 # 150M
Without correction: 159441109 (inaccuracy: +6%)
With correction:    162475901 (inaccuracy: +8%)

$ go run main.go -card 500000000 # 500M
Without correction: 532421332 (inaccuracy: +6%)
With correction:    568430738 (inaccuracy: +14%)

$ go run main.go -card 1000000000 # 1G
Without correction: 1014882872 (inaccuracy: +1%)
With correction:    1157814863 (inaccuracy: +16%)

$ go run main.go -card 2000000000 # 2G
Without correction: 1908444304 (inaccuracy: -5%)
With correction:    2523750484 (inaccuracy: +26%)

$ go run main.go -card 2500000000 # 2.5G
Without correction: 2347386494 (inaccuracy: -6%)
With correction:    3396700455 (inaccuracy: +36%)

$ go run main.go -card 3000000000 # 3G
Without correction: 2766452874 (inaccuracy: -8%)
With correction:    4437335342 (inaccuracy: +48%)

$ go run main.go -card 4000000000 # 4G
Without correction: 3455803455 (inaccuracy: -14%)
With correction:    7012793615 (inaccuracy: +75%)

$ go run main.go -card 5000000000 # 5G
Without correction: 3948874477  (inaccuracy: -21%)
With correction:    10816841717 (inaccuracy: +116%)

$ go run main.go -card 6000000000 # 6G
Without correction: 4466485931  (inaccuracy: -26%)
With correction:    0           (inaccuracy: -100%) # exceeded the domain of the log function
HLL implementation

The DataDog/hyperloglog implementation is used, with a minor modification to be able to estimate with and without the large range correction:
 	} else if estimate > 1.0/30.0*exp32 {
-		// Large range correction
-		estimate = -exp32 * math.Log(1-estimate/exp32)
+		if withLargeRangeCorrection {
+			estimate = -exp32 * math.Log(1-estimate/exp32)
+		}
 	}
 	return uint64(estimate)
 }

  
## go.mod
module github.com/DataDog/experimental/users/anatole.beuzon/large-card-hll

go 1.17

require github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316

require (
	github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49 // indirect
	github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724 // indirect
)

## go.sum
github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316 h1:CTR6ylcEcfvmr6xAJwLfKfud8EtIe5eBrVh5ClYj5ow=
github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316/go.mod h1:hFPkswc42pKhRbeKDKXy05mRi7J1kJ2vMNbvd9erH0M=
github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49 h1:EbzDX8HPk5uE2FsJYxD74QmMw0/3CqSKhEr6teh0ncQ=
github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49/go.mod h1:SvsjzyJlSg0rKsqYgdcFxeEVflx3ZNAyFfkUHP0TxXg=
github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724 h1:1/c0u68+2LRI+XSpduQpV9BnKx1k1P6GTb3MVxCE3w4=
github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724/go.mod h1:pTiKQhUCcxt2eQMAnv48oc5nAsmelPm573z44h6PSXc=

## main.go
package main

import (
	"flag"
	"fmt"
	"math"
	"math/rand"
	"os"
	"runtime"
	"sync"

	hll "github.com/DataDog/hyperloglog"
)

func main() {
	var target int
	flag.IntVar(&target, "card", 0, "target cardinality")
	flag.Parse()
	if target == 0 {
		flag.PrintDefaults()
		os.Exit(1)
	}

	// Map
	procs := runtime.GOMAXPROCS(0)
	wg := sync.WaitGroup{}
	counters := make([]*hll.HyperLogLog, procs)
	for i := 0; i < procs; i++ {
		wg.Add(1)
		go func(i int) {
			counters[i] = genCounts(target/procs, i, i == 0)
			wg.Done()
		}(i)
	}
	wg.Wait()

	// Reduce
	finalCounter := newCounter()
	for _, c := range counters {
		err := finalCounter.Merge(c)
		if err != nil {
			panic(err)
		}
	}

	resWithoutCorrection := count(finalCounter, false)
	resWithCorrection := count(finalCounter, true)
	fmt.Printf("Without correction:\t%d\t(inaccuracy: %+.0f%%)\n", resWithoutCorrection, pctDist(target, resWithoutCorrection))
	fmt.Printf("With correction:\t%d\t(inaccuracy: %+.0f%%)\n", resWithCorrection, pctDist(target, resWithCorrection))
}

func newCounter() *hll.HyperLogLog {
	const power = 10 // will use 2^10 registers
	c, err := hll.New(uint(1) << power)
	if err != nil {
		panic(err)
	}
	return c
}

// genCounts will generate n random numbers and add them to a counter, then return that counter.
func genCounts(n int, seed int, printProgress bool) *hll.HyperLogLog {
	r := rand.New(rand.NewSource(int64(seed)))
	c := newCounter()
	for i := 0; i < n; i++ {
		if printProgress && i%(n/100) == 0 {
			fmt.Printf("%d%%\r", 100*i/n)
		}

		// We MUST use 64-bits inputs so that the domain of the hash function (2**64
		// possibilities) is larger than its output domain (2**32), to properly model
		// a real application.
		//
		// Otherwise with hll.Murmur32(r.Uint32()) there would be virtually no hash
		// collisions, which is not realistic.
		c.Add(hll.Murmur64(r.Uint64()))
	}
	return c
}

func pctDist(target int, actual uint64) float64 {
	return 100 * float64(int64(actual)-int64(target)) / float64(target)
}

// Same as github.com/DataDog/hyperloglog's (*hll.HyperLogLog).Count(), but with an extra
// argument to be able to toggle the large range correction on/off.
func count(h *hll.HyperLogLog, withLargeRangeCorrection bool) uint64 {
	exp32 := math.Pow(2, 32)

	sum := 0.0
	m := float64(h.M)
	for _, val := range h.Registers {
		sum += 1.0 / math.Pow(2.0, float64(val))
	}
	estimate := h.Alpha * m * m / sum
	if estimate <= 5.0/2.0*m {
		// Small range correction
		v := 0
		for _, r := range h.Registers {
			if r == 0 {
				v++
			}
		}
		if v > 0 {
			estimate = m * math.Log(m/float64(v))
		}
	} else if estimate > 1.0/30.0*exp32 {
		if withLargeRangeCorrection {
			estimate = -exp32 * math.Log(1-estimate/exp32)
		}
	}
	return uint64(estimate)
}
	module github.com/DataDog/experimental/users/anatole.beuzon/large-card-hll

	go 1.17

	require github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316

	require (
	github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49 // indirect
	github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724 // indirect
	)
	github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316 h1:CTR6ylcEcfvmr6xAJwLfKfud8EtIe5eBrVh5ClYj5ow=
	github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316/go.mod h1:hFPkswc42pKhRbeKDKXy05mRi7J1kJ2vMNbvd9erH0M=
	github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49 h1:EbzDX8HPk5uE2FsJYxD74QmMw0/3CqSKhEr6teh0ncQ=
	github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49/go.mod h1:SvsjzyJlSg0rKsqYgdcFxeEVflx3ZNAyFfkUHP0TxXg=
	github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724 h1:1/c0u68+2LRI+XSpduQpV9BnKx1k1P6GTb3MVxCE3w4=
	github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724/go.mod h1:pTiKQhUCcxt2eQMAnv48oc5nAsmelPm573z44h6PSXc=
	package main

	import (
	"flag"
	"fmt"
	"math"
	"math/rand"
	"os"
	"runtime"
	"sync"

	hll "github.com/DataDog/hyperloglog"
	)

	func main() {
	var target int
	flag.IntVar(&target, "card", 0, "target cardinality")
	flag.Parse()
	if target == 0 {
	flag.PrintDefaults()
	os.Exit(1)
	}

	// Map
	procs := runtime.GOMAXPROCS(0)
	wg := sync.WaitGroup{}
	counters := make([]*hll.HyperLogLog, procs)
	for i := 0; i < procs; i++ {
	wg.Add(1)
	go func(i int) {
	counters[i] = genCounts(target/procs, i, i == 0)
	wg.Done()
	}(i)
	}
	wg.Wait()

	// Reduce
	finalCounter := newCounter()
	for _, c := range counters {
	err := finalCounter.Merge(c)
	if err != nil {
	panic(err)
	}
	}

	resWithoutCorrection := count(finalCounter, false)
	resWithCorrection := count(finalCounter, true)
	fmt.Printf("Without correction:\t%d\t(inaccuracy: %+.0f%%)\n", resWithoutCorrection, pctDist(target, resWithoutCorrection))
	fmt.Printf("With correction:\t%d\t(inaccuracy: %+.0f%%)\n", resWithCorrection, pctDist(target, resWithCorrection))
	}

	func newCounter() *hll.HyperLogLog {
	const power = 10 // will use 2^10 registers
	c, err := hll.New(uint(1) << power)
	if err != nil {
	panic(err)
	}
	return c
	}

	// genCounts will generate n random numbers and add them to a counter, then return that counter.
	func genCounts(n int, seed int, printProgress bool) *hll.HyperLogLog {
	r := rand.New(rand.NewSource(int64(seed)))
	c := newCounter()
	for i := 0; i < n; i++ {
	if printProgress && i%(n/100) == 0 {
	fmt.Printf("%d%%\r", 100*i/n)
	}

	// We MUST use 64-bits inputs so that the domain of the hash function (2**64
	// possibilities) is larger than its output domain (2**32), to properly model
	// a real application.
	//
	// Otherwise with hll.Murmur32(r.Uint32()) there would be virtually no hash
	// collisions, which is not realistic.
	c.Add(hll.Murmur64(r.Uint64()))
	}
	return c
	}

	func pctDist(target int, actual uint64) float64 {
	return 100 * float64(int64(actual)-int64(target)) / float64(target)
	}

	// Same as github.com/DataDog/hyperloglog's (*hll.HyperLogLog).Count(), but with an extra
	// argument to be able to toggle the large range correction on/off.
	func count(h *hll.HyperLogLog, withLargeRangeCorrection bool) uint64 {
	exp32 := math.Pow(2, 32)

	sum := 0.0
	m := float64(h.M)
	for _, val := range h.Registers {
	sum += 1.0 / math.Pow(2.0, float64(val))
	}
	estimate := h.Alpha * m * m / sum
	if estimate <= 5.0/2.0*m {
	// Small range correction
	v := 0
	for _, r := range h.Registers {
	if r == 0 {
	v++
	}
	}
	if v > 0 {
	estimate = m * math.Log(m/float64(v))
	}
	} else if estimate > 1.0/30.0*exp32 {
	if withLargeRangeCorrection {
	estimate = -exp32 * math.Log(1-estimate/exp32)
	}
	}
	return uint64(estimate)
	}