Skip to content

Instantly share code, notes, and snippets.

@athomason
Last active October 30, 2019 09:25
Show Gist options
  • Save athomason/650eaa8deacbc412f8ec7d226ba8e23e to your computer and use it in GitHub Desktop.
Save athomason/650eaa8deacbc412f8ec7d226ba8e23e to your computer and use it in GitHub Desktop.
Cardinality estimator
// An implementation of https://arxiv.org/abs/1512.07901
//
// Simple set cardinality estimation through random sampling
//
// We present a simple algorithm that estimates the cardinality n of a set V
// when allowed to sample elements of V uniformly and independently at random.
// Our algorithm with probability (1−δ) returns a (1±ε)−approximation of n...
package main
import (
"flag"
"fmt"
"math"
"math/rand"
"time"
)
// ApproximateCardinality estimates the cardinality of the set that is probed
// by sampler. With error rate δ, it returns a value within a factor 1±ε of the
// actual cardinality.
func ApproximateCardinality(sampler func() int, ε float64, δ float64, realN int) int {
var (
seen = make(map[int]struct{}) // 1: 𝑆 ← ∅
weightedSamples = 0 // 2: 𝑤 ← 0
repeats = 0 // 3: 𝑟 ← 0
threshold = int(math.Ceil((2 + 4.4*ε) / (ε * ε) * math.Log(3/δ))) // 4: 𝑘 ← ⌈(2+4.4𝜖)/𝜖²⋅ln(3/δ)⌉
)
for i := 0; ; i++ { // 5: 𝐫𝐞𝐩𝐞𝐚𝐭
weightedSamples += len(seen) // 6: 𝑤 ← 𝑤 + |𝑆|
e := sampler() // 7: 𝑒 ← 𝐒𝐀𝐌𝐏𝐋𝐄(𝑉)
if _, ok := seen[e]; ok {
repeats++ // 8: 𝑟 ← 𝑟 + |𝑆 ∩ 𝑒|
} else {
seen[e] = struct{}{} // 9: 𝑆 ← 𝑆 ∪ 𝑒
}
if repeats >= threshold { // 10: 𝐮𝐧𝐭𝐢𝐥 𝑟≥𝑘
if false {
// CARDAPPROX... invokes 𝐒𝐀𝐌𝐏𝐋𝐄(𝑉) at most min(𝑛, 2⌈√𝑘𝑛⌉) + 𝑘 times
maxSamples := int(
math.Min(
float64(realN),
2*math.Ceil(math.Sqrt(float64(threshold*realN))),
),
) + threshold
fmt.Printf("took %d, expected %d\n", i, maxSamples)
}
return int(float64(weightedSamples) / float64(repeats)) // 11: 𝐫𝐞𝐭𝐮𝐫𝐧 𝑤/𝑟
}
}
}
func main() {
var (
maxN = flag.Int("n", 64, "maximum cardinality")
trials = flag.Int("trials", 1000, "trials")
ε = flag.Float64("epsilon", 1./64, "epsilon") // accurate to one host in a mega64
δ = flag.Float64("delta", .01, "probability bound") // with 99% probability
)
flag.Parse()
rand.Seed(time.Now().UnixNano())
mispredictions := 0 // count errors
for i := 0; i < *trials; i++ { // through many attempts
n := 1 + rand.Intn(*maxN) // pick N≥1
sampler := func() int { return rand.Intn(n) } // returns N distinct values
guess := ApproximateCardinality(sampler, *ε, *δ, n)
fmt.Printf("actual=%d -> guess=%d (%d)\n", n, guess, n-guess)
if n != guess {
mispredictions++
}
}
// error rate should be less than δ
fmt.Printf("error rate: %d/%d = %.1f%%\n",
mispredictions, *trials, 100*float64(mispredictions)/float64(*trials))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment