Skip to content

Instantly share code, notes, and snippets.

@LiutongZhou
Created February 9, 2021 05:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save LiutongZhou/dc453d207ce9d9f602021281c09a4d98 to your computer and use it in GitHub Desktop.
Save LiutongZhou/dc453d207ce9d9f602021281c09a4d98 to your computer and use it in GitHub Desktop.
Time-and-Space-Efficient Sampling Methods
""" Time-and-Space-Efficient Sampling Methods"""
from typing import Iterable
from random import random, randint, seed
from math import exp, log, floor
from itertools import islice
__author__ = "Liutong Zhou"
def take(iterable, n: int) -> list:
"Return first n items of the iterable as a list"
return list(islice(iterable, n))
def nth(iterable, n: int, *, default=None):
"Returns the nth item or a default value"
return next(islice(iterable, n, None), default)
def fast_array_sampling():
"""Generate random samples of size k with given probability distribution
`Method 3 of this article <https://medium.com/ibm-watson/incredibly-fast-random-sampling-in-python-baf154bd836a>`_
"""
raise NotImplementedError
def reservoir_sampling(
stream: Iterable, /, k: int = 1, *, random_seed=0, n=None
) -> list:
raise NotImplementedError
def reservoir_sampling_k(stream: Iterable, /, k: int = 1, *, random_seed=None) -> list:
"""Randomly sample k elements from a stream
Each element has equal probability :math:`k/n` to be sampled
Parameters
--------
stream : Iterable
a stream of items to sample
k : int
sample size
random_seed : optional
Notes
--------
This is an :math:`O(n)` asymptotic time complexity and :math:`O(k)` space complexity implementation
see `pseudo code <https://en.wikipedia.org/wiki/Reservoir_sampling>`_.
References
--------
.. [1] Review this article for an `explaination of reservoir sampling <https://florian.github.io/reservoir-sampling/>`_
"""
seed(random_seed)
reservoir = []
for i, element in enumerate(stream):
if i < k:
reservoir.append(element)
else:
if (j := randint(0, i)) < k:
reservoir[j] = element
return reservoir
def reservoir_sampling_k_from_n(
stream: Iterable, n: int, /, k: int = 1, *, random_seed=None
) -> list:
"""Randomly sample k elements from a stream of n observations
Parameters
--------
stream : Iterable
a stream of items to sample from
n : int
sample from the first n observations of the stream
k : int
sample size
random_seed : optional
Notes
--------
This is an :math:`O(ln n)` asymptotic time complexity
and :math:`O(k)` space complexity implementation.
see `pseudo code <https://en.wikipedia.org/wiki/Reservoir_sampling>`_.
References
--------
.. [1] Review this article For an `explaination of reservoir sampling <https://florian.github.io/reservoir-sampling/>`_
"""
seed(random_seed)
stream = iter(stream)
# fill reservoir with the first k elements of the stream
reservoir: list = take(stream, k)
w = exp(log(random()) / k)
j = i = k
while i <= n:
i += 1 + floor(log(random()) / log(1 - w))
if i <= n:
while j < i:
element_ith = next(stream)
j += 1
reservoir[randint(0, k - 1)] = element_ith
w *= exp(log(random()) / k)
return reservoir
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment