Skip to content

Instantly share code, notes, and snippets.

@mudge
Last active October 21, 2018 11:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mudge/0c7ea37ac37d0fedb84804a3b6df9f76 to your computer and use it in GitHub Desktop.
Save mudge/0c7ea37ac37d0fedb84804a3b6df9f76 to your computer and use it in GitHub Desktop.
require 'digest/murmurhash'
class HyperLogLog
HASH_LENGTH = 32
ALPHA = {
16 => 0.673,
32 => 0.697,
64 => 0.709
}.freeze
attr_reader :b, :m, :registers
def initialize(b = 4)
raise ArgumentError, 'b must be 4..16' unless (4..16).cover?(b)
@b = b
@m = 2 ** b
@registers = Array.new(m, 0)
end
def insert(v)
x = h(v)
suffix_length = HASH_LENGTH - b
j = x >> suffix_length # first b bits of the 32 bit hash
w = x & ((1 << suffix_length) - 1) # the remaining bits of the hash
registers[j] = [registers[j], rho(w)].max
end
def count
e = alpha * (m ** 2) * (mean ** -1)
if e <= ((5.0 / 2.0) * m)
v = registers.count(0)
if v.positive?
e_star = m * Math.log10(m / v)
else
e_star = e
end
elsif e <= ((1.0 / 30.0) * (2 ** 32))
e_star = e
elsif e > ((1.0 / 30.0) * (2 ** 32))
e_star = (-2 ** 32) * Math.log10(1 - (e / (2 ** 32)))
end
e_star.round
end
def alpha
ALPHA.fetch(m) {
raise ArgumentError, 'unsupported value of m' unless m >= 128
0.7213 / (1 + 1.079 / m)
}
end
def mean
registers.map { |register| 2.0 ** -register }.reduce(:+)
end
def h(d)
Digest::MurmurHash3_x86_32.rawdigest(d)
end
def rho(s)
(HASH_LENGTH - b - s.bit_length) + 1
end
def error
1.04 / Math.sqrt(m)
end
end
RSpec.describe HyperLogLog do
describe '.new' do
it 'initialises all registers to zero' do
hyperloglog = HyperLogLog.new(4)
expect(hyperloglog.registers).to eq([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
end
end
describe '#insert' do
it 'populates registers' do
hyperloglog = HyperLogLog.new(4)
hyperloglog.insert('foo')
expect(hyperloglog.registers).to eq([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2])
end
end
describe '#alpha' do
it 'returns 0.673 for 4 registers' do
hyperloglog = HyperLogLog.new(4)
expect(hyperloglog.alpha).to eq(0.673)
end
it 'returns 0.697 for 5 registers' do
hyperloglog = HyperLogLog.new(5)
expect(hyperloglog.alpha).to eq(0.697)
end
it 'returns 0.709 for 6 registers' do
hyperloglog = HyperLogLog.new(6)
expect(hyperloglog.alpha).to eq(0.709)
end
it 'calculates a value for over 6 registers' do
hyperloglog = HyperLogLog.new(7)
expect(hyperloglog.alpha).to eq(0.7152704932638152)
end
end
describe '#h' do
it 'converts input into 32 bit binary numbers' do
hyperloglog = HyperLogLog.new
expect(hyperloglog.h('foo').bit_length).to eq(32)
end
it 'converts similar input into very different binary numbers' do
hyperloglog = HyperLogLog.new
expect(hyperloglog.h('foo')).not_to eq(hyperloglog.h('fop'))
end
end
describe '#rho' do
it 'returns the position of the leftmost 1-bit of s' do
hyperloglog = HyperLogLog.new
expect(hyperloglog.rho(0b1000000000000000000000000000)).to eq(1)
end
it 'returns the leftmost 1-bit of s with leading zeroes' do
hyperloglog = HyperLogLog.new
expect(hyperloglog.rho(0b0001000000000000000000000000)).to eq(4)
end
it 'returns the leftmost 1-bit of s with leading zeroes' do
hyperloglog = HyperLogLog.new
expect(hyperloglog.rho(0b0000000000000000000000000001)).to eq(28)
end
it 'returns K+1 for K zeroes' do
hyperloglog = HyperLogLog.new
expect(hyperloglog.rho(0b0000000000000000000000000000)).to eq(29)
end
end
describe '#count' do
it 'guesses correctly' do
hyperloglog = HyperLogLog.new
1000.times do |i|
hyperloglog.insert("#{i}")
end
expect(hyperloglog.count).to be_within(260).of(1000)
end
end
describe 'bit shifting' do
it 'gets the first 4 bits of a 32 bit integer' do
expect(0b10100000000000000000000000000000 >> (32 - 4)).to eq(0b1010)
end
it 'gets the remaining bits of a 32 bit integer' do
expect(0b10101000000000000000000000000000 & ((1 << (32 - 4)) - 1)).to eq(0b1000000000000000000000000000)
end
it 'gets the remaining bits of a 32 bit integer with leading zeroes' do
expect(0b10100100000000000000000000000000 & ((1 << (32 - 4)) - 1)).to eq(0b0100000000000000000000000000)
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment