Created
June 20, 2013 19:11
-
-
Save parasyte/5825697 to your computer and use it in GitHub Desktop.
Memory and CPU optimized HyperLogLog (Python) patch
Original: https://github.com/svpcom/hyperloglog
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/hyperloglog/hll.py b/hyperloglog/hll.py | |
index cb6b6bc..a16b730 100755 | |
--- a/hyperloglog/hll.py | |
+++ b/hyperloglog/hll.py | |
@@ -1,5 +1,7 @@ | |
""" | |
-This module implements probabilistic data structure which is able to calculate the cardinality of large multisets in a single pass using little auxiliary memory | |
+This module implements a probabilistic data structure which is able to calculate | |
+the cardinality of large multi-sets in a single pass using little auxiliary | |
+memory. | |
""" | |
import math | |
@@ -15,11 +17,15 @@ def get_treshold(p): | |
def estimate_bias(E, p): | |
bias_vector = biasData[p - 4] | |
nearest_neighbors = get_nearest_neighbors(E, rawEstimateData[p - 4]) | |
- return sum(float(bias_vector[i]) for i in nearest_neighbors) / len(nearest_neighbors) | |
+ return sum( | |
+ float(bias_vector[i]) for i in nearest_neighbors | |
+ ) / len(nearest_neighbors) | |
def get_nearest_neighbors(E, estimate_vector): | |
- distance_map = sorted(((E - float(val)) ** 2, idx) for idx, val in enumerate(estimate_vector)) | |
+ distance_map = sorted( | |
+ ((E - float(val)) ** 2, idx) for idx, val in enumerate(estimate_vector) | |
+ ) | |
return list(idx for dist, idx in distance_map)[:6] | |
@@ -53,11 +59,11 @@ class HyperLogLog(object): | |
HyperLogLog cardinality counter | |
""" | |
- __slots__ = ('alpha', 'p', 'm', 'M') | |
+ __slots__ = ('p', 'm', 'M', 'max_width') | |
- def __init__(self, error_rate): | |
+ def __init__(self, error_rate, items=(), state=""): | |
""" | |
- Implementes a HyperLogLog | |
+ Implements a HyperLogLog | |
error_rate = abs_err / cardinality | |
""" | |
@@ -65,32 +71,27 @@ class HyperLogLog(object): | |
if not (0 < error_rate < 1): | |
raise ValueError("Error_Rate must be between 0 and 1.") | |
- # error_rate = 1.04 / sqrt(m) | |
- # m = 2 ** p | |
- # M(1)... M(m) = 0 | |
- | |
p = int(math.ceil(math.log((1.04 / error_rate) ** 2, 2))) | |
+ m = (1 << p) | |
- self.alpha = get_alpha(p) | |
self.p = p | |
- self.m = 1 << p | |
- self.M = [ 0 for i in range(self.m) ] | |
+ self.max_width = 64 - p | |
+ self.m = m - 1 | |
+ self.M = bytearray(state or m) | |
+ | |
+ for item in items: | |
+ self.add(item) | |
def add(self, value): | |
""" | |
Adds the item to the HyperLogLog | |
""" | |
- # h: D -> {0,1} ** 64 | |
- # x = h(v) | |
- # j = <x_0x_1..x_{p-1}> | |
- # w = <x_{p}x_{p+1}..> | |
- # M[j] = max(M[j], rho(w)) | |
x = long(sha1(value).hexdigest()[:16], 16) | |
- j = x & (self.m - 1) | |
+ j = x & self.m | |
w = x >> self.p | |
- self.M[j] = max(self.M[j], get_rho(w, 64 - self.p)) | |
+ self.M[j] = max(self.M[j], self.max_width - w.bit_length() + 1) | |
def update(self, *others): | |
""" | |
@@ -101,30 +102,40 @@ class HyperLogLog(object): | |
if self.m != item.m: | |
raise ValueError('Counters precisions should be equal') | |
- self.M = list(max(*items) for items in zip(*([ item.M for item in others ] + [ self.M ]))) | |
+ self.M = bytearray([ | |
+ max(*items) for items in zip( | |
+ *([ item.M for item in others ] + [ self.M ]) | |
+ ) | |
+ ]) | |
def __eq__(self, other): | |
if self.m != other.m: | |
raise ValueError('Counters precisions should be equal') | |
- return self.M == other.M | |
+ return [ i for i in self.M ] == [ i for i in other.M ] | |
def __ne__(self, other): | |
return not self.__eq__(other) | |
def __len__(self): | |
- return round(self.card()) | |
+ return int(round(self.card())) | |
def card(self): | |
""" | |
Returns the estimate of the cardinality | |
""" | |
- E = self.alpha * float(self.m ** 2) / sum(math.pow(2.0, -x) for x in self.M) | |
- Ep = (E - estimate_bias(E, self.p)) if E <= 5 * self.m else E | |
+ m = self.m + 1 | |
+ E = get_alpha(self.p) * float(m ** 2) / sum( | |
+ math.pow(2.0, -x) for x in self.M | |
+ ) | |
+ Ep = (E - estimate_bias(E, self.p)) if E <= 5 * m else E | |
#count number or registers equal to 0 | |
- V = self.M.count(0) | |
- H = self.m * math.log(self.m / float(V)) if V > 0 else Ep | |
+ V = [ i for i in self.M ].count(0) | |
+ H = m * math.log(m / float(V)) if V > 0 else Ep | |
return H if H <= get_treshold(self.p) else Ep | |
+if __name__ == "__main__": | |
+ a = HyperLogLog(0.01, items=[ str(i) for i in xrange(1000000) ]) | |
+ print len(a) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment