cjauvin/hyperloglog.py

## hyperloglog.py
import random
from hashlib import md5, sha1
from collections import defaultdict
from scipy.stats import hmean
import numpy as np
import redis
import sys

def get_n_zeros(v, n_bits=128, from_right=True):
    """Get number of trailing or leading 0s in the binary
       representation of v.
    """
    for n, i in enumerate(range(n_bits)[::(1 if from_right else -1)]):
        if v & 2 ** i:
            break
    else:
        n += 1
    return n

N = 500000
stream = np.random.randint(0, sys.maxsize, size=N)
b = 14 # number of bits for registers
m = 2 ** b # number of registers
M = defaultdict(int) # registers
from_right = True # most/least significant bits
h_func = sha1
# used if left == True
h_func_n_bits = 128 if h_func is md5 else 160
h_func_n_bits -= b
if m == 16: # b = 4
    alpha = 0.673
elif m == 32: # b = 5
    alpha = 0.697
elif m == 64: # b = 6
    alpha = 0.709
else:
    alpha = 0.7213 / (1 + 1.079 / m)

for v in stream:
    x = int(h_func(str(v).encode()).hexdigest(), 16) # between 0 and 2^128 - 1
    if not from_right:
        mask = (m - 1) << h_func_n_bits
        j = x & mask >> h_func_n_bits
        assert j >= 0 and j < m
        x = x & (mask ^ (m - 1)) # value part
        x = x >> b
        #x_bin = ('{:0%db}' % h_func_n_bits).format(x)
        #nz = len(x_bin) - len(x_bin.lstrip('0'))
    else:
        j = x & (m - 1) # register part
        assert j >= 0 and j < m
        x = x >> b # value part
        #x_bin = bin(x)
        #nz = len(x_bin) - len(x_bin.rstrip('0'))
    nz = get_n_zeros(x, n_bits=h_func_n_bits, from_right=from_right)
    M[j] = max(M[j], nz + 1)

M = np.asarray(list(M.values()))

E = alpha * (m ** 2) / np.sum(2. ** -M)
#print(E)

if E <= (5 / 2 * m):
    #V = len(M[M == 0]) # number of zero registers
    V = m - len(M)
    if V:
        E = m * np.log(m / V)

elif E > (1 / 30. * (2 ** 32)):
    E = -2 ** 32 * np.log(1 - E / 2 ** 32)

print(E)
	import random
	from hashlib import md5, sha1
	from collections import defaultdict
	from scipy.stats import hmean
	import numpy as np
	import redis
	import sys

	def get_n_zeros(v, n_bits=128, from_right=True):
	"""Get number of trailing or leading 0s in the binary
	representation of v.
	"""
	for n, i in enumerate(range(n_bits)[::(1 if from_right else -1)]):
	if v & 2 ** i:
	break
	else:
	n += 1
	return n

	N = 500000
	stream = np.random.randint(0, sys.maxsize, size=N)
	b = 14 # number of bits for registers
	m = 2 ** b # number of registers
	M = defaultdict(int) # registers
	from_right = True # most/least significant bits
	h_func = sha1
	# used if left == True
	h_func_n_bits = 128 if h_func is md5 else 160
	h_func_n_bits -= b
	if m == 16: # b = 4
	alpha = 0.673
	elif m == 32: # b = 5
	alpha = 0.697
	elif m == 64: # b = 6
	alpha = 0.709
	else:
	alpha = 0.7213 / (1 + 1.079 / m)

	for v in stream:
	x = int(h_func(str(v).encode()).hexdigest(), 16) # between 0 and 2^128 - 1
	if not from_right:
	mask = (m - 1) << h_func_n_bits
	j = x & mask >> h_func_n_bits
	assert j >= 0 and j < m
	x = x & (mask ^ (m - 1)) # value part
	x = x >> b
	#x_bin = ('{:0%db}' % h_func_n_bits).format(x)
	#nz = len(x_bin) - len(x_bin.lstrip('0'))
	else:
	j = x & (m - 1) # register part
	assert j >= 0 and j < m
	x = x >> b # value part
	#x_bin = bin(x)
	#nz = len(x_bin) - len(x_bin.rstrip('0'))
	nz = get_n_zeros(x, n_bits=h_func_n_bits, from_right=from_right)
	M[j] = max(M[j], nz + 1)

	M = np.asarray(list(M.values()))

	E = alpha * (m 2) / np.sum(2. -M)
	#print(E)

	if E <= (5 / 2 * m):
	#V = len(M[M == 0]) # number of zero registers
	V = m - len(M)
	if V:
	E = m * np.log(m / V)

	elif E > (1 / 30. * (2 ** 32)):
	E = -2 ** 32 * np.log(1 - E / 2 ** 32)

	print(E)