Skip to content

Instantly share code, notes, and snippets.

@giwa
Created March 13, 2015 13:52
Show Gist options
  • Save giwa/bce63f3e2bd493167d92 to your computer and use it in GitHub Desktop.
Save giwa/bce63f3e2bd493167d92 to your computer and use it in GitHub Desktop.
An implementation of lossy counting
"Lossy Counting"
from collections import defaultdict
class LossyCounting(object):
'Implemendation of Lossy Counting'
def __init__(self, epsilon):
self._n = 0
self._count = defaultdict(int)
self._bucket_id = {}
self._epsilon = epsilon
self._current_bucket_id = 1
def get_count(self, item):
'Return the number of the item'
return self._count[item]
def get_bucket_id(self, item):
'Return the bucket id corresponding to the item'
return self._bucket_id[item]
def add_count(self, item):
'Add item for counting'
self._n += 1
if item not in self._count:
self._bucket_id[item] = self._current_bucket_id - 1
self._count[item] += 1
if self._n % int(1 / self._epsilon) == 0:
self._trim()
self._current_bucket_id += 1
def get_iter_with_threshold_rate(self, threshold_rate):
return self.get_iter(threshold_rate * self._n)
def _trim(self):
'trim data which does not fit the criteria'
for item, total in self._count.items():
if total <= self._current_bucket_id - self._bucket_id[item]:
del self._count[item]
del self._bucket_id[item]
def get_iter(self, threshold_count):
assert threshold_count > self._epsilon * self._n, "too small threshold"
self._trim()
for item, total in self._count.iteritems():
if total >= threshold_count - self._epsilon * self._n:
yield (item, total)
else:
raise StopIteration
if __name__ == "__main__":
import random
counter = LossyCounting(5e-3)
stream = ''
for i, c in enumerate('asdfghjkl', 1):
stream += c * 2 ** i
stream = list(stream)
print len(stream)
random.shuffle(stream)
for c in stream:
counter.add_count(c)
for item, count in sorted(counter.get_iter(10),
key=lambda x: x[1], reverse=True):
print item, count, counter.get_bucket_id(item)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment