Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@mnot
Created November 25, 2015 01:55
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mnot/1f0ca31c8cc991f5798c to your computer and use it in GitHub Desktop.
Save mnot/1f0ca31c8cc991f5798c to your computer and use it in GitHub Desktop.
#!/usr/bin/python
from __future__ import division
from bitarray import bitarray
from hashlib import sha256
from math import log
import struct
def gcs_hash(w, (N,P)):
h = sha256(w).hexdigest()
h = long(h,16)
return h % (N*P)
def gcs_encode(V, P, out):
Q = V // P
R = V % P
logp = int(log(P,2))
# write Q as unary
out.extend([True for i in range(Q)] + [False])
# write log2(P) most significant bits of R
out.extend([R & (1 << i) and True or False for i in range(logp-1,-1,-1)])
def main(urls, P, filename):
N = len(urls)
hash_values = [ gcs_hash(i, (N, P)) for i in urls ]
hash_values.append(0)
hash_values.sort()
out = bitarray(endian="big") # network
for i in range(len(hash_values) - 1):
delta = hash_values[i+1] - hash_values[i]
if delta == 0:
continue
gcs_encode(delta, P, out)
result = out.tobytes()
f = open(filename, 'wb')
# write N and P as unsigned long integers
f.write(struct.pack("!LL", N, P))
f.write(result)
print "* size:", f.tell()
f.close()
if __name__ == "__main__":
urls = [str(n) for n in xrange(1000)]
P = 2**8
main(urls, P, 'table.gcs')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment