robertobarreda/count_min_sketch.py

## count_min_sketch.py
from __future__ import division

from xxhash import xxh32
import numpy as np

DTYPE = np.int64


class CountMinSketch(object):

    def __init__(self, _w=None, _d=None, _delta=None, _epsilon=None):
        """
        CountMinSketch is an implementation of the count min sketch
        algorithm that probabilistically counts string frequencies.

        You must either supply w and d directly, or let them be calculated form error,
        delta, and epsilon. If You choose the latter, then w = ceil(error/epsilon) and
        d = ceil(ln(1.0/delta)) where the error in answering a query is within a factor
        of epsilon with probability delta.

        Parameters
        ----------
        w : the number of columns in the count matrix
        d : the number of rows in the count matrix
        delta : (not applicable if w and d are supplied) the probability of query error
        epsilon : (not applicable if w and d are supplied) the query error factor

        bits : The size of the hash output space

        For the full paper on the algorithm, see the paper
        "An improved data stream summary: the count-min sketch and its -
        applications" by Cormode and Muthukrishnan, 2003.
        """

        if _w is not None and _d is not None:
            self.w = _w
            self.d = _d
        elif _delta is not None and _epsilon is not None:
            self.w = np.ceil(np.e / _epsilon)
            self.d = np.ceil(np.log(1./_delta))
        else:
            raise Exception(
                "You must either supply both w and d or delta and epsilon.")

        self.count = np.zeros((self.d, self.w), dtype=DTYPE)
        self.rows = np.arange(self.d)
        self.shift_by = np.ceil(np.log(self.w) / np.log(2))

    def get_columns(self, a):
        a_string = str(a)
        hashes = np.zeros(self.d, dtype=DTYPE)
        h = xxh32(a_string).intdigest()

        for i in xrange(self.d):
            hashes[i] = h % self.w
            h >>= self.shift_by
            if h < self.w and i < self.d:
                cur_string = str(i) + a_string
                h = xxh32(cur_string).intdigest()

        return hashes

    def update(self, a, val=1):
        h = self.get_columns(a)
        self.count[self.rows, h] += val

    def query(self, a):
        h = self.get_columns(a)
        return self.count[self.rows, h].min()

    def __getitem__(self, a):
        return self.query(a)

    def __setitem__(self, a, val):
        cur = self.query(a)
        self.update(a, val - cur)
	from __future__ import division

	from xxhash import xxh32
	import numpy as np

	DTYPE = np.int64


	class CountMinSketch(object):

	def __init__(self, _w=None, _d=None, _delta=None, _epsilon=None):
	"""
	CountMinSketch is an implementation of the count min sketch
	algorithm that probabilistically counts string frequencies.

	You must either supply w and d directly, or let them be calculated form error,
	delta, and epsilon. If You choose the latter, then w = ceil(error/epsilon) and
	d = ceil(ln(1.0/delta)) where the error in answering a query is within a factor
	of epsilon with probability delta.

	Parameters
	----------
	w : the number of columns in the count matrix
	d : the number of rows in the count matrix
	delta : (not applicable if w and d are supplied) the probability of query error
	epsilon : (not applicable if w and d are supplied) the query error factor

	bits : The size of the hash output space

	For the full paper on the algorithm, see the paper
	"An improved data stream summary: the count-min sketch and its -
	applications" by Cormode and Muthukrishnan, 2003.
	"""

	if _w is not None and _d is not None:
	self.w = _w
	self.d = _d
	elif _delta is not None and _epsilon is not None:
	self.w = np.ceil(np.e / _epsilon)
	self.d = np.ceil(np.log(1./_delta))
	else:
	raise Exception(
	"You must either supply both w and d or delta and epsilon.")

	self.count = np.zeros((self.d, self.w), dtype=DTYPE)
	self.rows = np.arange(self.d)
	self.shift_by = np.ceil(np.log(self.w) / np.log(2))

	def get_columns(self, a):
	a_string = str(a)
	hashes = np.zeros(self.d, dtype=DTYPE)
	h = xxh32(a_string).intdigest()

	for i in xrange(self.d):
	hashes[i] = h % self.w
	h >>= self.shift_by
	if h < self.w and i < self.d:
	cur_string = str(i) + a_string
	h = xxh32(cur_string).intdigest()

	return hashes

	def update(self, a, val=1):
	h = self.get_columns(a)
	self.count[self.rows, h] += val

	def query(self, a):
	h = self.get_columns(a)
	return self.count[self.rows, h].min()

	def __getitem__(self, a):
	return self.query(a)

	def __setitem__(self, a, val):
	cur = self.query(a)
	self.update(a, val - cur)