Skip to content

Instantly share code, notes, and snippets.

@jltqst27

jltqst27/kmv.py Secret

Created April 20, 2019 01:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jltqst27/075f7ae63df3fd4120be5286b71af323 to your computer and use it in GitHub Desktop.
Save jltqst27/075f7ae63df3fd4120be5286b71af323 to your computer and use it in GitHub Desktop.
KMV with interseciton
import mmh3
import random
from sets import Set
K_LIMIT = 4096
class KMV:
def __init__(self, k, kmv_array = None):
self.k = k;
self.kmv_array = []
if not kmv_array is None:
self.kmv_array = kmv_array
def update(self, element):
hashed_element = mmh3.hash(element, signed=False)
if len(self.kmv_array) < self.k:
self.kmv_array.append(hashed_element)
self.kmv_array.sort()
elif hashed_element < self.kmv_array[len(self.kmv_array) - 1]:
self.kmv_array.append(hashed_element)
self.kmv_array.sort()
self.kmv_array.pop()
def estimate(self):
if len(self.kmv_array) == 0:
return 0
count = 0
total = 0
first = None
for x in self.kmv_array:
if first is None:
first = x
else:
count += 1
total += (x - first)
first = x
return float(1) / ((float(total) / count) / float(4294967295))
return 0
def intersection(self, kmv_instance):
own_set = Set(self.kmv_array)
count = 0
new_kmv_array = []
for element in kmv_instance.kmv_array:
if element in own_set:
count += 1
new_kmv_array.append(element)
print "new k is " + str(count)
return KMV(K_LIMIT, new_kmv_array)
if __name__ == "__main__":
hll1 = KMV(K_LIMIT)
hll2 = KMV(K_LIMIT)
hll3 = KMV(K_LIMIT)
set1 = Set()
set2 = Set()
set3 = Set()
for x in range(2000000):
rand1 = random.randint(1, 20000000)
rand2 = random.randint(1, 20000000)
rand3 = random.randint(1, 20000000)
hll1.update(str(rand1))
hll2.update(str(rand2))
hll3.update(str(rand3))
set1.add(rand1)
set2.add(rand2)
set3.add(rand3)
print "estimate hll1: " + str(hll1.estimate())
print "estimate hll2: " + str(hll2.estimate())
print "estimate hll3: " + str(hll3.estimate())
print "kmv intersection 1_2 count: " + str(hll1.intersection(hll2).estimate())
print "kmv intersection 1_2_3 count: " + str(hll1.intersection(hll2).intersection(hll3).estimate())
print "brute intersection 1_2 : " + str(len(set1.intersection(set2)))
print "brute intersection 1_2_3 : " + str(len(set1.intersection(set2).intersection(set3)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment