Created
August 17, 2020 09:55
-
-
Save simonhayward/2c4245321c94502c3a27015aca90b48f to your computer and use it in GitHub Desktop.
reservoir sampling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
https://florian.github.io/reservoir-sampling/ | |
- Store the first k elements as the reservoir elements | |
- For the i-th element, add it to the reservoir with a probability of k/i. | |
This is done by replacing a randomly selected element in the reservoir | |
""" | |
import random | |
import sys | |
def decision(probability): | |
r = random.random() | |
result = r < probability | |
if result is True: | |
print(f"chosen: {r} | {probability}") | |
return result | |
def sample(stream, k): | |
s = [] | |
for i, element in enumerate(stream, 1): | |
if i < (k+1): | |
s.append(element) | |
else: | |
if i == (k+1): | |
print(f"sample: {s}\n") | |
print(f"element: {element} | probability={k}/{i}") | |
if decision(k/i) is True: | |
index = random.randint(0, k-1) | |
print(f"replacing: {s[index]} with: {element}") | |
s[index] = element | |
print(f"sample: {s}\n") | |
return s | |
if __name__ == "__main__": | |
try: | |
k = int(sys.argv[1]) | |
except (IndexError, ValueError): | |
k = 3 | |
stream = random.sample(range(1, 1000), 10) | |
print(f"size: {k}") | |
print(f"stream: {stream}") | |
print(f"sample result: {sample(stream, k)}") |
Author
simonhayward
commented
Aug 17, 2020
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment