Skip to content

Instantly share code, notes, and snippets.

@vwood
Created August 16, 2012 05:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vwood/3367209 to your computer and use it in GitHub Desktop.
Save vwood/3367209 to your computer and use it in GitHub Desktop.
count min
#!/usr/bin/env python2
from math import log
from random import randint
def log2(n):
return log(n) / log(2)
#
# Simple count min
#
def universal_hash(n, a, b):
"""Provides a universal hash, where:
n is the number of buckets (a power of two)
a and b are parameters."""
p = 323772423883463
def fn(x):
return (a * x + b) % p % n
return fn
class Simple_Count_Min():
def __init__(self, N):
# Ensure we have a power of 2
self.N = N
self.hash = universal_hash(self.N, randint(0, 2**32), randint(0,2**32))
self.contents = [0] * self.N
def set(self, x, n):
hash = self.hash(x)
self.contents[hash] = max(n, self.contents[hash])
def get(self, x):
hash = self.hash(x)
return self.contents[hash]
class Count_Min():
def __init__(self, N, hash_count):
# Ensure we have a power of 2
self.N = N
self.hashes = [universal_hash(self.N, randint(0, 2**32), randint(0,2**32)) for _ in range(hash_count)]
self.contents = [0] * self.N
def set(self, x, n):
for hash_fn in self.hashes:
key = hash_fn(x)
self.contents[key] = max(n, self.contents[key])
def get(self, x):
return min(self.contents[hash_fn(x)] for hash_fn in self.hashes)
def test(n, hash_count, test_count = 1000):
stored = []
sketch = Count_Min(n, hash_count)
failures = 0
failure_size = 0
for _ in range(test_count):
k, v = randint(0, 2**32), randint(0, 2**32)
sketch.set(k, v)
stored.append((k, v))
for k, v in stored:
if sketch.get(k) != v:
failures += 1
failure_size += sketch.get(k) - v
return failures / float(test_count), 0 if failures == 0 else log(failure_size / float(failures))
if __name__ == '__main__':
print " " * 8,
for hash_count in range(1, 12):
print "%5d" % (hash_count,),
print
for n in range(4, 24):
print "%8d" % (2 ** n,),
for hash_count in range(1, 12):
print "%5.3f" % test(2**n, hash_count)[0],
print
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment