Skip to content

Instantly share code, notes, and snippets.

@jdp
Last active December 20, 2015 07:48
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jdp/6095563 to your computer and use it in GitHub Desktop.
Save jdp/6095563 to your computer and use it in GitHub Desktop.
LogLog and HyperLogLog estimators in Golang
package main
import (
// "encoding/binary"
"crypto/rand"
"fmt"
"hash"
"hash/fnv"
"io"
"io/ioutil"
"math"
"strings"
)
var (
lsbp32bp = [...]uint32{0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17,
4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}
lsbp64bp = [...]uint64{0, 1, 2, 7, 3, 13, 8, 19, 4, 25, 14, 28, 9, 34, 20,
40, 5, 17, 26, 38, 15, 46, 29, 48, 10, 31, 35, 54, 21, 50, 41, 57, 63,
6, 12, 18, 24, 27, 33, 39, 16, 37, 45, 47, 30, 53, 49, 56, 62, 11, 23,
32, 36, 44, 52, 55, 61, 22, 43, 51, 60, 42, 59, 58}
)
type estimator interface {
Write([]byte) (int, error)
Get() float64
}
func rank(n uint64) uint64 {
if n == 0 {
return 64
}
var p uint64
for (n>>p)&1 == 0 {
p += 1
}
return p
}
func lsbp32(v uint32) uint32 {
return lsbp32bp[((v&-v)*0x077cb531)>>27]
}
func lsbp64(v uint64) uint64 {
return lsbp64bp[((v&-v)*0x218a392cd3d5dbf)>>58]
}
type LLEstimator struct {
k, m uint64
hash hash.Hash64
buckets []uint64
}
func NewLLEstimator(k uint64) (lle *LLEstimator) {
m := uint64(math.Pow(2, float64(k)))
buckets := make([]uint64, m)
h := fnv.New64a()
lle = &LLEstimator{k: k, m: m, hash: h, buckets: buckets}
return
}
func (e *LLEstimator) alpha() (alpha float64) {
switch e.m {
case 16:
alpha = 0.75207
case 32:
alpha = 0.77308
case 64:
alpha = 0.78356
default:
alpha = 0.79402
}
return
}
func (e *LLEstimator) Write(p []byte) (n int, err error) {
if n, err = e.hash.Write(p); err != nil {
return
}
hashSum := e.hash.Sum64()
e.hash.Reset()
j := hashSum & (e.m - 1)
r := rank(hashSum >> e.k)
if r > e.buckets[j] {
e.buckets[j] = r
}
return
}
func (e *LLEstimator) Get() float64 {
var bucketval float64
for b := range e.buckets {
bucketval += float64(e.buckets[b])
}
bucketval /= float64(e.m)
return e.alpha() * float64(e.m) * math.Pow(2, bucketval)
}
type HLLEstimator struct {
b uint64
m float64
hash hash.Hash64
buckets []uint64
}
func NewHLLEstimator(b uint64) *HLLEstimator {
m := math.Pow(2, float64(b))
buckets := make([]uint64, uint(m))
h := fnv.New64a()
return &HLLEstimator{b: b, m: m, buckets: buckets, hash: h}
}
func (e *HLLEstimator) alpha() (alpha float64) {
switch e.m {
case 16.0:
alpha = 0.673
case 32.0:
alpha = 0.697
case 64.0:
alpha = 0.709
default:
alpha = 0.7213 / (1 + 1.079/e.m)
}
return
}
func (e *HLLEstimator) rho(v uint64) (r uint64) {
m := uint64(math.Pow(2, float64(e.b)-1))
lsbp := lsbp64(v) + 1
if m < lsbp {
r = m
} else {
r = lsbp
}
return
}
func (e *HLLEstimator) Write(p []byte) (n int, err error) {
if n, err = e.hash.Write(p); err != nil {
return
}
hashSum := e.hash.Sum64()
e.hash.Reset()
i := hashSum & ((1 << e.b) - 1)
r := e.rho(hashSum >> e.b)
if r > e.buckets[i] {
e.buckets[i] = r
}
return
}
func (e *HLLEstimator) Get() (dv float64) {
var bucketsum float64
for i := range e.buckets {
bucketsum += math.Pow(2, -float64(e.buckets[i]))
}
est := e.alpha() * math.Pow(e.m, 2) / bucketsum
if est < 5.0/2*e.m {
var v int
for i := range e.buckets {
if e.buckets[i] == 0 {
v += 1
}
}
if v == 0 {
dv = est
} else {
dv = e.m * math.Log(e.m/float64(v))
}
} else if est <= (1.0 / 30 * math.Pow(2, 32)) {
dv = est
} else if est > (1.0 / 30 * math.Pow(2, 32)) {
dv = -math.Pow(2, 32) * math.Log(1.0-est/math.Pow(2, 32))
}
return
}
func getRandomBytes() []byte {
c := 16
b := make([]byte, c)
if _, err := io.ReadFull(rand.Reader, b); err != nil {
fmt.Println("error:", err)
return b
}
return b
}
func estimateDiscreteTimestamps(path string) (real int, est float64, err error) {
content, err := ioutil.ReadFile(path)
if err != nil {
return
}
lines := strings.Split(string(content), "\n")
uniques := map[string]bool{}
lle := NewHLLEstimator(6)
for i := range lines {
uniques[lines[i]] = true
io.WriteString(lle, lines[i])
}
real = len(uniques)
est = lle.Get()
return
}
func main() {
// real, est, _ := estimateDiscreteTimestamps("timeseries.txt")
// fmt.Println("real:", real)
// fmt.Println("estimated:", est)
e := NewHLLEstimator(6)
uniques := map[string]bool{}
for i := 0; i < 1000000; i++ {
b := getRandomBytes()
uniques[string(b)] = true
e.Write(b)
}
est := e.Get()
fmt.Printf("estimated: %f (%f%% err)\n", est, math.Abs(100.0-(est/float64(len(uniques))*100)))
fmt.Println("real:", len(uniques))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment