Skip to content

Instantly share code, notes, and snippets.

@parasyte
Created June 20, 2013 19:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save parasyte/5825697 to your computer and use it in GitHub Desktop.
Save parasyte/5825697 to your computer and use it in GitHub Desktop.
Memory and CPU optimized HyperLogLog (Python) patch Original: https://github.com/svpcom/hyperloglog
diff --git a/hyperloglog/hll.py b/hyperloglog/hll.py
index cb6b6bc..a16b730 100755
--- a/hyperloglog/hll.py
+++ b/hyperloglog/hll.py
@@ -1,5 +1,7 @@
"""
-This module implements probabilistic data structure which is able to calculate the cardinality of large multisets in a single pass using little auxiliary memory
+This module implements a probabilistic data structure which is able to calculate
+the cardinality of large multi-sets in a single pass using little auxiliary
+memory.
"""
import math
@@ -15,11 +17,15 @@ def get_treshold(p):
def estimate_bias(E, p):
bias_vector = biasData[p - 4]
nearest_neighbors = get_nearest_neighbors(E, rawEstimateData[p - 4])
- return sum(float(bias_vector[i]) for i in nearest_neighbors) / len(nearest_neighbors)
+ return sum(
+ float(bias_vector[i]) for i in nearest_neighbors
+ ) / len(nearest_neighbors)
def get_nearest_neighbors(E, estimate_vector):
- distance_map = sorted(((E - float(val)) ** 2, idx) for idx, val in enumerate(estimate_vector))
+ distance_map = sorted(
+ ((E - float(val)) ** 2, idx) for idx, val in enumerate(estimate_vector)
+ )
return list(idx for dist, idx in distance_map)[:6]
@@ -53,11 +59,11 @@ class HyperLogLog(object):
HyperLogLog cardinality counter
"""
- __slots__ = ('alpha', 'p', 'm', 'M')
+ __slots__ = ('p', 'm', 'M', 'max_width')
- def __init__(self, error_rate):
+ def __init__(self, error_rate, items=(), state=""):
"""
- Implementes a HyperLogLog
+ Implements a HyperLogLog
error_rate = abs_err / cardinality
"""
@@ -65,32 +71,27 @@ class HyperLogLog(object):
if not (0 < error_rate < 1):
raise ValueError("Error_Rate must be between 0 and 1.")
- # error_rate = 1.04 / sqrt(m)
- # m = 2 ** p
- # M(1)... M(m) = 0
-
p = int(math.ceil(math.log((1.04 / error_rate) ** 2, 2)))
+ m = (1 << p)
- self.alpha = get_alpha(p)
self.p = p
- self.m = 1 << p
- self.M = [ 0 for i in range(self.m) ]
+ self.max_width = 64 - p
+ self.m = m - 1
+ self.M = bytearray(state or m)
+
+ for item in items:
+ self.add(item)
def add(self, value):
"""
Adds the item to the HyperLogLog
"""
- # h: D -> {0,1} ** 64
- # x = h(v)
- # j = <x_0x_1..x_{p-1}>
- # w = <x_{p}x_{p+1}..>
- # M[j] = max(M[j], rho(w))
x = long(sha1(value).hexdigest()[:16], 16)
- j = x & (self.m - 1)
+ j = x & self.m
w = x >> self.p
- self.M[j] = max(self.M[j], get_rho(w, 64 - self.p))
+ self.M[j] = max(self.M[j], self.max_width - w.bit_length() + 1)
def update(self, *others):
"""
@@ -101,30 +102,40 @@ class HyperLogLog(object):
if self.m != item.m:
raise ValueError('Counters precisions should be equal')
- self.M = list(max(*items) for items in zip(*([ item.M for item in others ] + [ self.M ])))
+ self.M = bytearray([
+ max(*items) for items in zip(
+ *([ item.M for item in others ] + [ self.M ])
+ )
+ ])
def __eq__(self, other):
if self.m != other.m:
raise ValueError('Counters precisions should be equal')
- return self.M == other.M
+ return [ i for i in self.M ] == [ i for i in other.M ]
def __ne__(self, other):
return not self.__eq__(other)
def __len__(self):
- return round(self.card())
+ return int(round(self.card()))
def card(self):
"""
Returns the estimate of the cardinality
"""
- E = self.alpha * float(self.m ** 2) / sum(math.pow(2.0, -x) for x in self.M)
- Ep = (E - estimate_bias(E, self.p)) if E <= 5 * self.m else E
+ m = self.m + 1
+ E = get_alpha(self.p) * float(m ** 2) / sum(
+ math.pow(2.0, -x) for x in self.M
+ )
+ Ep = (E - estimate_bias(E, self.p)) if E <= 5 * m else E
#count number or registers equal to 0
- V = self.M.count(0)
- H = self.m * math.log(self.m / float(V)) if V > 0 else Ep
+ V = [ i for i in self.M ].count(0)
+ H = m * math.log(m / float(V)) if V > 0 else Ep
return H if H <= get_treshold(self.p) else Ep
+if __name__ == "__main__":
+ a = HyperLogLog(0.01, items=[ str(i) for i in xrange(1000000) ])
+ print len(a)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment