Skip to content

Instantly share code, notes, and snippets.

@timgianitsos
Last active December 6, 2022 06:44
Show Gist options
  • Save timgianitsos/28d5f1487323a13a30b2ff65a3cc89ac to your computer and use it in GitHub Desktop.
Save timgianitsos/28d5f1487323a13a30b2ff65a3cc89ac to your computer and use it in GitHub Desktop.
Random Process Modes
from math import comb
import numpy as np
import matplotlib.pyplot as plt
from tqdm import trange
'''
There are `N` items. We select `n` (NOT necessarily unique) items per round with replacement. Assume `1 <= n`.
Consider the Random Process X_0, X_1, X_2, ....
Each `X_i` represents the number of unique items selected through round `i`.
For example:
Before the first selection, we would expect no unique items chosen. Thus, `P(X_0 = 0) = 1.0`
If `N` is much larger than `n`, we would expect all selections in the first round to be unique. Thus, `P(X_1 = n) ≈ 1.0`
What is the lowest value of `i` such that the Mode of `X_i` is `N`?
In other words, how many trials must occur in order for the most likely outcome to be the selection of
all the elements with none left over?
The following code simulates this with Monte Carlo by producing and displaying each R.V. in the random process.
It also displays the theoretical distribution of each R.V. inspired by the following PMF.
`https://stats.stackexchange.com/a/506373/251950`
It is worth noting that there seems to be a discrepancy (just by looking at a glance) between the theoretical
distribution and empirical distribution for later timesteps (but no discrepancy for earlier timesteps).
It is highly unlikely to me that this is a bug because very few of the simulations go to late timesteps
so those distributions should be much much noisier.
'''
def main():
N = 100
n = 50
assert 1 <= n
# We use this formula as a guess for the upper bound to reasonably expect a game
# to last during simulation.
# The formula is somewhat arbitrary - there is theoretically no upper
# bound on how long a game can last before all items are selected as long as
# selection is with replacement. However, large numbers are not encountered
# in practice.
num_Xs = (N // n + 1) * 30
res = np.zeros((num_Xs, N + 1), dtype=np.int32)
num_simulations = 50000
res[0,0] += num_simulations # X_0 always has all probability on 0
for _ in trange(num_simulations):
trial = np.zeros(N, dtype=bool)
trial_sum = 0
i = 1
while trial_sum < N:
sample = np.random.randint(low=0, high=N, size=n, dtype=np.uint32)
trial[sample] = True
trial_sum = trial.sum()
res[i, trial_sum] += 1
i += 1
# Find the first `i` to which none of the simulated games went
norm = res.sum(axis=1).reshape(-1, 1)
last_Xi = np.argmin(norm.flatten())
res = res[:last_Xi]
norm = norm[:last_Xi]
assert not any(np.isclose(norm, 0))
prob = res / norm # normalize as probabilities
print(f'{N = }')
print(f'{n = }')
print(f'{num_simulations = }\n')
print(f'Modes of each `X_i`: {list(np.argmax(res, axis=1))}\n')
print(f'Means of each `X_i`: {list(float(f"{v:.5f}") for v in prob @ np.arange(N + 1))}\n')
print(f'Percentage of games traversed through each `X_i`: {list(float(f"{v:.5f}") for v in norm.flatten() / num_simulations)}\n')
print(f'Min number of trials for game to finish: {np.argmax(res[:,-1] != 0)}\n')
print(f'For Mode to first achieve {N=}, number of trials required was: {np.argmax(np.argmax(res, axis=1))}\n')
print(f'Max number of trials for game to finish: {len(norm)}\n')
possible_Xis = np.where(~np.isclose(norm.flatten(), 0))[0]
for ind in possible_Xis:
# Zoom in on non-zero elements in the bar chart
non_zero_x = ~np.isclose(res[ind], 0)
dist_start = np.argmax(non_zero_x)
dist_end = N - np.argmax(non_zero_x[::-1]) + 1
r = np.arange(dist_start, dist_end)
# Compute the theoretical distribution.
# Inspired by `https://stats.stackexchange.com/a/506373/251950`
#
# It is necessary to use `int()` to convert from numpy
# fixed-byte-length integers to Python's built-in `int` type
# because the latter cannot overflow, and many of these
# intermediate calculations produce very large numbers
t = np.array([
comb(N, int(x)) * sum(
(-1) ** i * comb(int(x), i) * (int(x) - i) ** (n * int(ind))
for i in range(int(x) + 1)
) / N ** (n * int(ind))
for x in r
]) * norm[ind]
# Plot
fig, ax = plt.subplots()
ax.set_ylabel("Probability")
ax.set_xticks(r)
ax2 = ax.twinx()
ax2.set_ylabel("Simulation Occurrences")
plt.title(f'Distribution of X_{ind}')
plt.bar(r - 0.2, res[ind, dist_start:dist_end], width=0.4, label='Empirical', color='red')
plt.bar(r + 0.2, t, width=0.4, label='Theoretical', color='blue')
plt.show()
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment