Skip to content

Instantly share code, notes, and snippets.

@lukashavrlant
Last active August 29, 2015 14:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lukashavrlant/96f1126cebfe6991fa09 to your computer and use it in GitHub Desktop.
Save lukashavrlant/96f1126cebfe6991fa09 to your computer and use it in GitHub Desktop.
from uuid import uuid4
from math import log
def alpha(num_buckets):
return (0.7213 / (1 + 1.079 / num_buckets))
def trailing_zeroes(number):
if number == 0:
return 0
zeroes = 0
while (number & 1) == 0:
zeroes += 1
number = number >> 1
return zeroes
def count_maxims(values, k):
num_buckets = 2 ** k
max_zeroes = [0] * num_buckets
for value in values:
h = hash(value)
bucket_index = h & (num_buckets - 1)
bucket_hash = h >> k
num_zeros = trailing_zeroes(bucket_hash) + 1
max_zeroes[bucket_index] = max(max_zeroes[bucket_index], num_zeros)
return max_zeroes
def linear_counting(number_of_ones, num_buckets):
number_of_zeros = num_buckets - number_of_ones
Z = float(number_of_zeros) / num_buckets
p = (-num_buckets) * log(Z)
return p
def estimate_cardinality(values, k):
num_buckets = 2 ** k
max_zeroes = count_maxims(values, k)
estimate = float(alpha(num_buckets) * (num_buckets**2)) / sum(2**(-r) for r in max_zeroes)
number_of_ones = sum(1 if x > 0 else 0 for x in max_zeroes)
if (estimate <= 2.5 * num_buckets) and (number_of_ones < num_buckets):
return linear_counting(number_of_ones, num_buckets)
else:
return estimate
for x in range(1, 7):
num_values = 10**x
values = (str(uuid4()) for _ in xrange(num_values))
cardinality = estimate_cardinality(values, 14)
print "Cardinality: %s, relative error: %s" % (cardinality, num_values / cardinality)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment