Last active
April 13, 2018 01:37
-
-
Save CMCDragonkai/51ad0c6d42bf070d68e7997f8e2f95f3 to your computer and use it in GitHub Desktop.
Reservoir Sampling - Algorithm R #python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
def sample_reservoir(size, samples): | |
reservoir = [None] * size | |
k = size - 1 | |
for i in range(size): | |
reservoir[i] = samples[i] | |
for i in range(k + 1, len(samples)): | |
j = random.randint(0, i) | |
if j <= k: | |
reservoir[j] = samples[i] | |
return reservoir | |
# you can use this to deal with bigger-than-memory reservoir sets using a memory mapped map data structure | |
def sample_reservoir_lazy(size, samples): | |
for i in range(size): | |
# initial fill | |
yield (True, (i, next(samples))) | |
for (i, sample) in enumerate(samples): | |
j = random.randint(0, i + size) # this is inclusive | |
if j <= (size - 1): | |
# replacement | |
yield (True, (j, sample)) | |
else: | |
# skip | |
yield (False, sample) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment