Created
February 9, 2021 05:48
-
-
Save LiutongZhou/dc453d207ce9d9f602021281c09a4d98 to your computer and use it in GitHub Desktop.
Time-and-Space-Efficient Sampling Methods
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Time-and-Space-Efficient Sampling Methods""" | |
from typing import Iterable | |
from random import random, randint, seed | |
from math import exp, log, floor | |
from itertools import islice | |
__author__ = "Liutong Zhou" | |
def take(iterable, n: int) -> list: | |
"Return first n items of the iterable as a list" | |
return list(islice(iterable, n)) | |
def nth(iterable, n: int, *, default=None): | |
"Returns the nth item or a default value" | |
return next(islice(iterable, n, None), default) | |
def fast_array_sampling(): | |
"""Generate random samples of size k with given probability distribution | |
`Method 3 of this article <https://medium.com/ibm-watson/incredibly-fast-random-sampling-in-python-baf154bd836a>`_ | |
""" | |
raise NotImplementedError | |
def reservoir_sampling( | |
stream: Iterable, /, k: int = 1, *, random_seed=0, n=None | |
) -> list: | |
raise NotImplementedError | |
def reservoir_sampling_k(stream: Iterable, /, k: int = 1, *, random_seed=None) -> list: | |
"""Randomly sample k elements from a stream | |
Each element has equal probability :math:`k/n` to be sampled | |
Parameters | |
-------- | |
stream : Iterable | |
a stream of items to sample | |
k : int | |
sample size | |
random_seed : optional | |
Notes | |
-------- | |
This is an :math:`O(n)` asymptotic time complexity and :math:`O(k)` space complexity implementation | |
see `pseudo code <https://en.wikipedia.org/wiki/Reservoir_sampling>`_. | |
References | |
-------- | |
.. [1] Review this article for an `explaination of reservoir sampling <https://florian.github.io/reservoir-sampling/>`_ | |
""" | |
seed(random_seed) | |
reservoir = [] | |
for i, element in enumerate(stream): | |
if i < k: | |
reservoir.append(element) | |
else: | |
if (j := randint(0, i)) < k: | |
reservoir[j] = element | |
return reservoir | |
def reservoir_sampling_k_from_n( | |
stream: Iterable, n: int, /, k: int = 1, *, random_seed=None | |
) -> list: | |
"""Randomly sample k elements from a stream of n observations | |
Parameters | |
-------- | |
stream : Iterable | |
a stream of items to sample from | |
n : int | |
sample from the first n observations of the stream | |
k : int | |
sample size | |
random_seed : optional | |
Notes | |
-------- | |
This is an :math:`O(ln n)` asymptotic time complexity | |
and :math:`O(k)` space complexity implementation. | |
see `pseudo code <https://en.wikipedia.org/wiki/Reservoir_sampling>`_. | |
References | |
-------- | |
.. [1] Review this article For an `explaination of reservoir sampling <https://florian.github.io/reservoir-sampling/>`_ | |
""" | |
seed(random_seed) | |
stream = iter(stream) | |
# fill reservoir with the first k elements of the stream | |
reservoir: list = take(stream, k) | |
w = exp(log(random()) / k) | |
j = i = k | |
while i <= n: | |
i += 1 + floor(log(random()) / log(1 - w)) | |
if i <= n: | |
while j < i: | |
element_ith = next(stream) | |
j += 1 | |
reservoir[randint(0, k - 1)] = element_ith | |
w *= exp(log(random()) / k) | |
return reservoir |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment