Skip to content

Instantly share code, notes, and snippets.

@mynameisfiber
Last active December 14, 2015 01:19
Show Gist options
  • Save mynameisfiber/5005535 to your computer and use it in GitHub Desktop.
Save mynameisfiber/5005535 to your computer and use it in GitHub Desktop.
This is a quick kmaxhash implemintation to test out the usefulness of this datastructure for calculating jaccard metrics. It is quick because I wrote it quickly :)
#!/usr/bin/env python2.7
#
# This is a quick kmaxhash implemintation to test out the usefulness of this
# datastructure for calculating jaccard metrics. It is quick because I wrote
# it quickly :)
# micha gorelick, micha@bit.ly
#
import mmh3
import heapq
from itertools import (chain, ifilterfalse)
MAX_32BIT_INT = 2**31 - 1
def unique_everseen(iterable, key=None):
"List unique elements, preserving order. Remember all elements ever seen."
# unique_everseen('AAAABBBCCDAABBB') --> A B C D
# unique_everseen('ABBCcAD', str.lower) --> A B C D
seen = set()
seen_add = seen.add
if key is None:
for element in ifilterfalse(seen.__contains__, iterable):
seen_add(element)
yield element
else:
for element in iterable:
k = key(element)
if k not in seen:
seen_add(k)
yield element
class QuickKMaxHash:
def __init__(self, items=[], k=20, hasher=mmh3.hash):
self.kmin = []
self.k = k
self.hasher = hasher
self.update(items)
def _idx(self, key):
return self.hasher(str(key))
def add(self, key):
idx = self._idx(key)
if len(self.kmin) < self.k:
heapq.heappush(self.kmin, idx)
else:
if idx > self.kmin[0]:
heapq.heappushpop(self.kmin, idx)
def update(self, keys):
for key in keys:
self.add(key)
def _aux_union(self, other):
X = set(chain(self.kmin, other.kmin))
n = 0
small, large_list = sorted((self.kmin, other.kmin), key=len)
large_set = set(large_list)
for item in small:
if item in X and item in large_set:
n += 1
return n, X
def union(self, other, newk=0):
newk = newk or min(self.k, other.k)
self.kmin = heapq.nlargest(newk, unique_everseen(chain(self.kmin, other.kmin)))
heapq.heapify(self.kmin)
def jaccard(self, other, k=0):
k = min(self.k, other.k)
n, _ = self._aux_union(other)
return n / (1.0 * k)
def cardinality_intersection(self, other, k=0):
k = min(self.k, other.k)
n, X = self._aux_union(other)
cardX = self._cardhelp(min(X), len(X))
return n / (1.0 * k) * cardX
def cardinality_union_(self, other, k=0):
_, X = self._aux_union(other)
cardX = self._cardhelp(min(X), len(X))
return cardX
def _cardhelp(self, kmin, k):
return 2.0 * (k - 1.0) * MAX_32BIT_INT / (MAX_32BIT_INT - kmin)
def cardinality(self):
if len(self.kmin) < self.k:
return len(self.kmin)
return self._cardhelp(self.kmin[0], self.k)
def __len__(self):
return int(self.cardinality())
def __add__(self, other):
assert other.k == self.k
nt = QuickKMaxHash(k = self.k)
nt.kmin = self.kmin
nt.union(other)
return nt
def test_constructor():
t1 = QuickKMaxHash(range(50))
t2 = QuickKMaxHash(range(50), k=50)
t1 = QuickKMaxHash(k=10)
t1 = QuickKMaxHash()
def test_add():
t1 = QuickKMaxHash(k=1)
t1.add("TEST1")
assert t1.kmin != []
assert len(t1.kmin) == 1
t1.add("TEST2")
assert t1.kmin != []
assert len(t1.kmin) == 1
def test_update():
t1 = QuickKMaxHash(k=15)
t1.update(range(10))
assert len(t1.kmin) == 10
def test_jaccard():
# k == 400 => relative error of ~0.05
t1 = QuickKMaxHash(range(500), k=400)
t2 = QuickKMaxHash(range(100, 500), k=400)
j_kmin = t1.jaccard(t2)
j_real = 4. / 5.
error = 1 / (t1.k)**0.5
assert abs(1 - j_kmin / j_real) <= error
def test_union():
t1 = QuickKMaxHash(range(10), k=5)
t2 = QuickKMaxHash(range(10), k=5)
t3 = QuickKMaxHash(range(20), k=5)
t1.union(t2)
assert set(t1.kmin) == set(t2.kmin)
t1.union(t3)
assert set(t1.kmin) == set(t3.kmin)
t4 = QuickKMaxHash(range(40,50), k=5)
t5 = t4 + t1
assert set(t1.kmin) != set(t5.kmin)
assert set(t4.kmin) != set(t5.kmin)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment