Skip to content

Instantly share code, notes, and snippets.

@adambard
Created July 25, 2018 19:20
Show Gist options
  • Save adambard/243da9241946da8db0d904dbddd3fe70 to your computer and use it in GitHub Desktop.
Save adambard/243da9241946da8db0d904dbddd3fe70 to your computer and use it in GitHub Desktop.
A Django HLLField (HyperLogLog)
# Presented without warranty, but seems to work ok
from django.db import models
# From https://github.com/ascv/HyperLogLog
from HLL import HyperLogLog
def init_hll(m, seed, bytes=None):
    """Create a HyperLogLog, optionally restoring previously saved registers.

    Args:
        m: First constructor argument of ``HyperLogLog``; the field below
            passes its ``register_exponent`` here.
        seed: Second constructor argument of ``HyperLogLog`` (hash seed).
        bytes: Optional register state handed to ``set_registers``. When
            ``None`` (the default) the counter is returned empty.

    Returns:
        The newly constructed ``HyperLogLog`` instance.
    """
    counter = HyperLogLog(m, seed)
    if bytes is None:
        # Nothing to restore: hand back a fresh, empty counter.
        return counter
    counter.set_registers(bytes)
    return counter
class HLLField(models.BinaryField):
    """
    Use a HyperLogLog for efficient counting, storing its state in a binary field.

    On the Python side the attribute holds a ``HyperLogLog`` instance; in the
    database only the value returned by its ``registers()`` method is stored.
    ``register_exponent`` and ``seed`` are not persisted with the data, so a
    stored value is always rebuilt with the field's own settings.
    """

    def __init__(self, register_exponent=5, seed=314, *args, **kwargs):
        """Remember the HLL parameters and defer the rest to ``BinaryField``.

        Args:
            register_exponent: Passed as the first argument to ``init_hll``
                whenever a stored value is rebuilt.
            seed: Passed as the second argument to ``init_hll``.
            *args, **kwargs: Forwarded unchanged to ``BinaryField``.
        """
        self.register_exponent = register_exponent
        self.seed = seed
        super(HLLField, self).__init__(*args, **kwargs)

    def deconstruct(self):
        """Return the migration description, adding non-default HLL settings."""
        name, path, args, kwargs = super(HLLField, self).deconstruct()
        # Only emit values that differ from the __init__ defaults so that
        # generated migrations stay minimal.
        if self.seed != 314:
            kwargs['seed'] = self.seed
        if self.register_exponent != 5:
            kwargs['register_exponent'] = self.register_exponent
        return name, path, args, kwargs

    def _build_hll(self, raw):
        """Rebuild a HyperLogLog from raw register data using this field's settings."""
        # The original code always converted to ``bytearray`` before calling
        # ``set_registers``; keep doing so for bytes/memoryview inputs.
        if not isinstance(raw, bytearray):
            raw = bytearray(raw)
        return init_hll(self.register_exponent, self.seed, raw)

    def from_db_value(self, value, expression, connection, context=None):
        """Convert a database value into a ``HyperLogLog`` (or ``None``).

        ``context`` is unused. It defaults to ``None`` because Django 3.0+
        calls this method with three arguments only; older versions that
        still pass ``context`` keep working.
        """
        if value is None:
            return value
        return self._build_hll(value)

    def to_python(self, value):
        """Coerce ``value`` into a ``HyperLogLog``.

        ``None`` and existing ``HyperLogLog`` instances are returned as-is.
        Anything else is first passed through ``BinaryField.to_python`` (which
        decodes base64 text into binary) and then loaded as register data.
        """
        if value is None or isinstance(value, HyperLogLog):
            return value
        # Let BinaryField normalise the input *before* the bytearray
        # conversion: bytearray(<str>) raises TypeError, so doing it the other
        # way round made text input unusable.
        return self._build_hll(super(HLLField, self).to_python(value))

    def get_prep_value(self, value):
        """Serialize a ``HyperLogLog`` to its registers for storage.

        ``None`` is passed through untouched so nullable fields can be saved.
        """
        if value is not None:
            value = value.registers()
        return super(HLLField, self).get_prep_value(value)

    def value_to_string(self, value):
        """Return a human-readable summary of the counter.

        Django's serializers call this with the *model instance*, so anything
        that is not already a ``HyperLogLog`` is resolved to the field's value
        via ``value_from_object``. Passing a ``HyperLogLog`` directly is still
        accepted.

        NOTE(review): the returned text is a summary, not the register data,
        so it cannot be loaded back through ``to_python``.
        """
        if not isinstance(value, HyperLogLog):
            value = self.value_from_object(value)
        return "HLL(cardinality={})".format(value.cardinality())
# Usage
class MyModel(models.Model):
    """Example model holding a single HyperLogLog counter."""

    # register_exponent ranges from 2 to 16; the HLL is created with
    # 2**register_exponent registers (up to 2**16).
    # More registers = slower, but more accurate.
    # seed is just a fixed number for the Murmur hash: pick something and
    # don't change it afterwards.
    hll = HLLField(
        register_exponent=8,
        seed=123,
    )
# Build a counter with the same exponent/seed the field was declared with,
# so the saved registers can be reloaded consistently.
m = MyModel(hll=init_hll(8, 123))
m.hll.add("Something to count")
m.hll.add("Something else to count")
m.hll.add("I guess we're counting sentences now?")
m.hll.add("Duplicates won't be double-counted")
m.hll.add("Duplicates won't be double-counted")
m.save()  # HLL is serialized and saved

# Fetch the row by primary key: a bare ``objects.get()`` raises
# MultipleObjectsReturned as soon as the table contains more than one row.
m2 = MyModel.objects.get(pk=m.pk)
m2.hll.cardinality()  # 4.0xxx or so

m3 = MyModel(hll=init_hll(8, 123))
m3.hll.add("a")
m3.hll.add("b")
m3.hll.add("Duplicates won't be double-counted")
m3.hll.add("Duplicates won't be double-counted")

# Merging happens in memory only; nothing here writes m2 back to the database.
m2.hll.merge(m3.hll)
m2.hll.cardinality()  # 6.0xx or so
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment