Skip to content

Instantly share code, notes, and snippets.

@lujjjh
Created May 24, 2021 04:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lujjjh/fecf55f6827cce011bb71682e9b8594a to your computer and use it in GitHub Desktop.
Save lujjjh/fecf55f6827cce011bb71682e9b8594a to your computer and use it in GitHub Desktop.
Binary search
#!/usr/bin/env python3
import string
import random
from hashlib import md5
import struct
import sys
from bisect import bisect_left
import time
# generate kvpairs
num_of_pairs = 1000000
random.seed(0)
kvpairs = []
for id in range(num_of_pairs):
s = "".join(random.choices(string.ascii_letters, k=64))
if id + 1 == 10086:
print(s)
key = md5(s.encode("utf-8")).digest()
kvpairs.append((key, id + 1, s))
# build a compact structure
kvpairs.sort(key=lambda pair: pair[0])
keys = b"".join([pair[0] for pair in kvpairs]) # 15 MB
values = b"".join([struct.pack("!i", pair[1]) for pair in kvpairs]) # 4 MB
kvpairs = None
# define View of keys for bisection
class KeysView:
key_size = 16
def __init__(self, keys):
self.keys = keys
def __getitem__(self, key):
return self.keys[key*self.key_size: (key+1)*self.key_size]
def __len__(self):
return len(self.keys) // self.key_size
keys_view = KeysView(keys)
# print memory usage
memory_usage_in_bytes = sys.getsizeof(keys) + sys.getsizeof(values)
print(f"memory usage: {memory_usage_in_bytes/1024/1024:.2f} MB") # 19.07 MB
# binary search
s = "awnLspJynPnOvWqdZnsMiqvcKOPjHuAGvBYUVLIoHaDXNgdkVtCapPCBlSkLKCpp"
start_time = time.time()
key = md5(s.encode()).digest()
index = bisect_left(keys_view, key)
if index != len(keys_view) and keys_view[index] == key:
(id,) = struct.unpack("!i", values[index * 4: (index + 1) * 4])
print(id) # 10086
else:
print('not found')
print(f"{(time.time()-start_time) * 1000:.3f} ms")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment