Skip to content

Instantly share code, notes, and snippets.

@axiak
Created April 3, 2014 16:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save axiak/9957599 to your computer and use it in GitHub Desktop.
Save axiak/9957599 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import sys
import mmh3
import math
import random
import struct
try:
import numpy as np
from matplotlib import pyplot
except ImportError:
np = pyplot = None
CUSTOMER_CHURN_RATE = 1.1
AVERAGE_CONTACT_COUNT = 1000
SHARD_SIZE = 16
def main():
with_hash, without_hash = [], []
for customer, contact_id in contact_id_customer_pair(100):
with_hash.append(key_with_hash(customer, contact_id))
without_hash.append(simple_key(customer, contact_id))
with_hash.sort()
without_hash.sort()
if len(sys.argv) > 1:
splits = int(sys.argv[1])
else:
splits = 10
show_distribution('with hash', with_hash, splits)
show_distribution('without hash', without_hash, splits)
inputs = {
#'with hash': with_hash,
'simple key': without_hash
}
graph_distribution(inputs, max_split=100)
def show_distribution(name, input, splits=10):
print name
for i in range(0, len(input), len(input) / splits):
print repr(input[i])
print repr(input[-1])
print ''
def graph_distribution(inputs, splits=(10, 100, 250), max_split=2 ** 63 - 1):
if np is None or pyplot is None:
print "Not graphing because matplotlib/numpy isn't installed. :-("
return
keys = sorted(inputs.keys())
pyplot.xlim(0, max_split)
pyplot.figure(1)
for i, key in enumerate(keys):
pyplot.subplot(len(keys), 1, i + 1)
key_result = []
for value in inputs[key]:
key_result.append(struct.unpack(">L", value[:4])[0])
print sorted(key_result)
key_result = np.array(sorted(key_result))
for j, split in enumerate(splits):
bins = np.linspace(0, max_split, split)
pyplot.hist(key_result, bins, alpha=0.5, normed=True, histtype='step', label="{0} bins".format(split))
if not j:
pyplot.hold(True)
pyplot.grid(True)
pyplot.legend()
pyplot.title(key)
pyplot.hold(False)
pyplot.show()
def key_with_hash(customer, contact_id):
customercontact_id = struct.pack(">lq", customer, contact_id)
shard = abs(mmh3.hash(customercontact_id) % SHARD_SIZE)
prefix = mmh3.hash(chr(shard) + struct.pack(">l", customer))
return struct.pack('>l', prefix) + customercontact_id
def key_no_hash(customer, contact_id):
customercontact_id = struct.pack(">lq", customer, contact_id)
shard = abs(mmh3.hash(customercontact_id) % SHARD_SIZE)
return chr(shard) + customercontact_id
def simple_key(customer, contact_id):
return struct.pack('>lq', customer, contact_id)
def contact_id_customer_pair(num_customers):
sample = list(range(1, int(num_customers / CUSTOMER_CHURN_RATE)))
random.shuffle(sample)
customers = sorted(sample[:num_customers])
for customer in customers:
for contact_id in contact_ids_for_customer():
yield customer, contact_id
def contact_ids_for_customer():
return list(range(1, int(power_law(1, AVERAGE_CONTACT_COUNT, 10))))
def power_law(lower, upper, n):
y = random.random()
return upper - math.pow((upper ** (n + 1) - lower ** (n + 1)) * y + lower ** (n + 1), 1 / float(n + 1))
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment