Skip to content

Instantly share code, notes, and snippets.

@devdazed
Created October 11, 2012 16:14
Show Gist options
  • Star 11 You must be signed in to star a gist
  • Fork 8 You must be signed in to fork a gist
  • Save devdazed/3873524 to your computer and use it in GitHub Desktop.
Save devdazed/3873524 to your computer and use it in GitHub Desktop.
Simple Linear Probabilistic Counters
"""
Simple Linear Probabilistic Counters
Credit for idea goes to:
http://highscalability.com/blog/2012/4/5/big-data-counting-how-to-count-a-billion-distinct-objects-us.html
http://highlyscalable.wordpress.com/2012/05/01/probabilistic-structures-web-analytics-data-mining/
Installation:
pip install smhasher
pip install bitarray
"""
import smhasher
import math
import itertools
from bitarray import bitarray
class LPCounter(object):
"""
Real Time Linear Probabilistic Counter of Unique Items
"""
def __init__(self, max_space):
"""
Initializes the counter, max_space is the maximum amount of memory
(in KB) you want to use to maintain your counter, more memory=more
accurate
"""
self.bit_map = bitarray(8 * 1024 * max_space)
self.bit_map.setall(False)
def current_count(self):
"""
Gets the current value of the bitmap, to do that we follow the formula:
-size * ln(unset_bits/size)
"""
ratio = float(self.bit_map.count(False)) / float(self.bit_map.length())
if ratio == 0.0:
return self.bit_map.length()
else:
return -self.bit_map.length() * math.log(ratio)
def increment(self, item):
"""
Counts an item
"""
mm_hash = smhasher.murmur3_x64_128(str(item))
offset = mm_hash % self.bit_map.length()
self.bit_map[offset] = True
if __name__ == '__main__':
def run_counts(tries, size):
"""Run some counts"""
lpc = LPCounter(size)
print "%d Kilobyte Bitmap" % size
print " Incrementing %d items" % tries
for i in itertools.count(0):
if i > tries:
break
lpc.increment(i)
count = lpc.current_count()
print " Count: %d" % count
print " Actual: %d" % tries
print " Error: %f%%" % abs((1 - (float(count) / float(tries))) * 100)
run_counts(1000000000, 16384)
@devdazed
Copy link
Author

32768 Kilobyte Bitmap
  Incrementing 1000000000 items
  Counting
  Count:  999991219
  Actual: 1000000000
  Error:  0.000878%

@devdazed
Copy link
Author

16384 Kilobyte Bitmap
  Incrementing 1000000000 items
  Count:  1000288512
  Actual: 1000000000
  Error:  0.028851%

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment