Skip to content

Instantly share code, notes, and snippets.

@drdee
Created July 30, 2014 16:52
Show Gist options
  • Save drdee/32b880a24bd2dd2d8d46 to your computer and use it in GitHub Desktop.
Save drdee/32b880a24bd2dd2d8d46 to your computer and use it in GitHub Desktop.
Cardinality estimator
import math
import mmh3
from functools import partial
from itertools import izip
def estimateCardinality(self, significant_bits)
'''
Taken and slightly adapted from http://blog.notdot.net/2012/09/Dam-Cool-Algorithms-Cardinality-Estimation
Estimates the number of unique elements in the input set values.
significant_bits: The number of bits of hash to use as a bucket number; there will be 2**k buckets.
key: by default this function will estimate the cardinality of the rdd, i.e how many records
there are, but you can also pass an optional key to estimate the cardinality of a particular field.
'''
def _trailingZeroes(num):
"""Counts the number of trailing 0 bits in num."""
if num == 0:
return 32
p = 0
while (num >> p) & 1 == 0:
p += 1
return p
def runEstimator(row, num_buckets, significant_bits):
max_zeroes = [0] * num_buckets
for record in records:
h = mmh3.hash(unicode(record))
bucket = h & (num_buckets - 1)
bucket_hash = h >> significant_bits
max_zeroes[bucket] = max(max_zeroes[bucket], _trailingZeroes(bucket_hash))
yield max_zeroes
max_buckets = []
num_buckets = 2 ** significant_bits
max_zeroes = self.mapPartitions(partial(runEstimator, num_buckets=num_buckets, significant_bits=significant_bits)).collect()
for x in izip(*max_zeroes):
max_buckets.append(max(x))
return math.ceil(2 ** (float(sum(max_buckets)) / num_buckets) * num_buckets * 0.79402)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment