Skip to content

Instantly share code, notes, and snippets.

@kdubovikov
Last active October 8, 2017 12:58
Show Gist options
  • Save kdubovikov/7d7598ae918b92a784b6e0470f6702e9 to your computer and use it in GitHub Desktop.
Save kdubovikov/7d7598ae918b92a784b6e0470f6702e9 to your computer and use it in GitHub Desktop.
Compare sampling algorithms
def collect_collisions(arr, sample_size, n_samples,
fast=False):
"""Collect total number of collisions made for each sample of arr.
Parameters
----------
arr: np.array
array to sample from.
sample_size: int
sample size.
n_samples: int
number of samples to take.
fast: bool
use sampling optimized for fast consecutive samples
from the same array.
Returns
-------
collisions: np.array
collision number for each sample taken
"""
samples = collect_samples(arr,
sample_size,
n_samples,
fast=fast).flatten()
# count collisions for *all* numbers we have sampled
counts = Counter(samples)
collision_sum = sum([count for k, count in counts.items() if count > 1])
return collision_sum
n = 10000
arr = np.array([i for i in range(n)])
# copy arrays so all experiments will be isolated in terms of array shuffling
arr_fast = arr.copy()
choice_num = []
fast_num = []
for i in tqdm(range(0, 500), desc="Collecting sample statistics"):
collisions = collect_collisions(arr, 1000, 10, fast=False)
collisions_fast = collect_collisions(arr_fast, 1000, 10, fast=True)
choice_num.append(collisions)
fast_num.append(collisions_fast)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment