-
-
Save mudge/0c7ea37ac37d0fedb84804a3b6df9f76 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'digest/murmurhash' | |
class HyperLogLog | |
HASH_LENGTH = 32 | |
ALPHA = { | |
16 => 0.673, | |
32 => 0.697, | |
64 => 0.709 | |
}.freeze | |
attr_reader :b, :m, :registers | |
def initialize(b = 4) | |
raise ArgumentError, 'b must be 4..16' unless (4..16).cover?(b) | |
@b = b | |
@m = 2 ** b | |
@registers = Array.new(m, 0) | |
end | |
def insert(v) | |
x = h(v) | |
suffix_length = HASH_LENGTH - b | |
j = x >> suffix_length # first b bits of the 32 bit hash | |
w = x & ((1 << suffix_length) - 1) # the remaining bits of the hash | |
registers[j] = [registers[j], rho(w)].max | |
end | |
def count | |
e = alpha * (m ** 2) * (mean ** -1) | |
if e <= ((5.0 / 2.0) * m) | |
v = registers.count(0) | |
if v.positive? | |
e_star = m * Math.log10(m / v) | |
else | |
e_star = e | |
end | |
elsif e <= ((1.0 / 30.0) * (2 ** 32)) | |
e_star = e | |
elsif e > ((1.0 / 30.0) * (2 ** 32)) | |
e_star = (-2 ** 32) * Math.log10(1 - (e / (2 ** 32))) | |
end | |
e_star.round | |
end | |
def alpha | |
ALPHA.fetch(m) { | |
raise ArgumentError, 'unsupported value of m' unless m >= 128 | |
0.7213 / (1 + 1.079 / m) | |
} | |
end | |
def mean | |
registers.map { |register| 2.0 ** -register }.reduce(:+) | |
end | |
def h(d) | |
Digest::MurmurHash3_x86_32.rawdigest(d) | |
end | |
def rho(s) | |
(HASH_LENGTH - b - s.bit_length) + 1 | |
end | |
def error | |
1.04 / Math.sqrt(m) | |
end | |
end | |
RSpec.describe HyperLogLog do | |
describe '.new' do | |
it 'initialises all registers to zero' do | |
hyperloglog = HyperLogLog.new(4) | |
expect(hyperloglog.registers).to eq([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) | |
end | |
end | |
describe '#insert' do | |
it 'populates registers' do | |
hyperloglog = HyperLogLog.new(4) | |
hyperloglog.insert('foo') | |
expect(hyperloglog.registers).to eq([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]) | |
end | |
end | |
describe '#alpha' do | |
it 'returns 0.673 for 4 registers' do | |
hyperloglog = HyperLogLog.new(4) | |
expect(hyperloglog.alpha).to eq(0.673) | |
end | |
it 'returns 0.697 for 5 registers' do | |
hyperloglog = HyperLogLog.new(5) | |
expect(hyperloglog.alpha).to eq(0.697) | |
end | |
it 'returns 0.709 for 6 registers' do | |
hyperloglog = HyperLogLog.new(6) | |
expect(hyperloglog.alpha).to eq(0.709) | |
end | |
it 'calculates a value for over 6 registers' do | |
hyperloglog = HyperLogLog.new(7) | |
expect(hyperloglog.alpha).to eq(0.7152704932638152) | |
end | |
end | |
describe '#h' do | |
it 'converts input into 32 bit binary numbers' do | |
hyperloglog = HyperLogLog.new | |
expect(hyperloglog.h('foo').bit_length).to eq(32) | |
end | |
it 'converts similar input into very different binary numbers' do | |
hyperloglog = HyperLogLog.new | |
expect(hyperloglog.h('foo')).not_to eq(hyperloglog.h('fop')) | |
end | |
end | |
describe '#rho' do | |
it 'returns the position of the leftmost 1-bit of s' do | |
hyperloglog = HyperLogLog.new | |
expect(hyperloglog.rho(0b1000000000000000000000000000)).to eq(1) | |
end | |
it 'returns the leftmost 1-bit of s with leading zeroes' do | |
hyperloglog = HyperLogLog.new | |
expect(hyperloglog.rho(0b0001000000000000000000000000)).to eq(4) | |
end | |
it 'returns the leftmost 1-bit of s with leading zeroes' do | |
hyperloglog = HyperLogLog.new | |
expect(hyperloglog.rho(0b0000000000000000000000000001)).to eq(28) | |
end | |
it 'returns K+1 for K zeroes' do | |
hyperloglog = HyperLogLog.new | |
expect(hyperloglog.rho(0b0000000000000000000000000000)).to eq(29) | |
end | |
end | |
describe '#count' do | |
it 'guesses correctly' do | |
hyperloglog = HyperLogLog.new | |
1000.times do |i| | |
hyperloglog.insert("#{i}") | |
end | |
expect(hyperloglog.count).to be_within(260).of(1000) | |
end | |
end | |
describe 'bit shifting' do | |
it 'gets the first 4 bits of a 32 bit integer' do | |
expect(0b10100000000000000000000000000000 >> (32 - 4)).to eq(0b1010) | |
end | |
it 'gets the remaining bits of a 32 bit integer' do | |
expect(0b10101000000000000000000000000000 & ((1 << (32 - 4)) - 1)).to eq(0b1000000000000000000000000000) | |
end | |
it 'gets the remaining bits of a 32 bit integer with leading zeroes' do | |
expect(0b10100100000000000000000000000000 & ((1 << (32 - 4)) - 1)).to eq(0b0100000000000000000000000000) | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment