Last active
December 6, 2022 06:44
-
-
Save timgianitsos/28d5f1487323a13a30b2ff65a3cc89ac to your computer and use it in GitHub Desktop.
Random Process Modes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from math import comb | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from tqdm import trange | |
''' | |
There are `N` items. We select `n` (NOT necessarily unique) items per round with replacement. Assume `1 <= n`. | |
Consider the Random Process X_0, X_1, X_2, .... | |
Each `X_i` represents the number of unique items selected through round `i`. | |
For example: | |
Before the first selection, we would expect no unique items chosen. Thus, `P(X_0 = 0) = 1.0` | |
If `N` is much larger than `n`, we would expect all selections in the first round to be unique. Thus, `P(X_1 = n) ≈ 1.0` | |
What is the lowest value of `i` such that the Mode of `X_i` is `N`? | |
In other words, how many trials must occur in order for the most likely outcome to be the selection of | |
all the elements with none left over? | |
The following code simulates this with Monte Carlo by producing and displaying each R.V. in the random process. | |
It also displays the theoretical distribution of each R.V. inspired by the following PMF. | |
`https://stats.stackexchange.com/a/506373/251950` | |
It is worth noting that there seems to be a discrepancy (just by looking at a glance) between the theoretical | |
distribution and empirical distribution for later timesteps (but no discrepancy for earlier timesteps). | |
It is highly unlikely to me that this is a bug because very few of the simulations go to late timesteps | |
so those distributions should be much much noisier. | |
''' | |
def main(): | |
N = 100 | |
n = 50 | |
assert 1 <= n | |
# We use this formula as a guess for the upper bound to reasonably expect a game | |
# to last during simulation. | |
# The formula is somewhat arbitrary - there is theoretically no upper | |
# bound on how long a game can last before all items are selected as long as | |
# selection is with replacement. However, large numbers are not encountered | |
# in practice. | |
num_Xs = (N // n + 1) * 30 | |
res = np.zeros((num_Xs, N + 1), dtype=np.int32) | |
num_simulations = 50000 | |
res[0,0] += num_simulations # X_0 always has all probability on 0 | |
for _ in trange(num_simulations): | |
trial = np.zeros(N, dtype=bool) | |
trial_sum = 0 | |
i = 1 | |
while trial_sum < N: | |
sample = np.random.randint(low=0, high=N, size=n, dtype=np.uint32) | |
trial[sample] = True | |
trial_sum = trial.sum() | |
res[i, trial_sum] += 1 | |
i += 1 | |
# Find the first `i` to which none of the simulated games went | |
norm = res.sum(axis=1).reshape(-1, 1) | |
last_Xi = np.argmin(norm.flatten()) | |
res = res[:last_Xi] | |
norm = norm[:last_Xi] | |
assert not any(np.isclose(norm, 0)) | |
prob = res / norm # normalize as probabilities | |
print(f'{N = }') | |
print(f'{n = }') | |
print(f'{num_simulations = }\n') | |
print(f'Modes of each `X_i`: {list(np.argmax(res, axis=1))}\n') | |
print(f'Means of each `X_i`: {list(float(f"{v:.5f}") for v in prob @ np.arange(N + 1))}\n') | |
print(f'Percentage of games traversed through each `X_i`: {list(float(f"{v:.5f}") for v in norm.flatten() / num_simulations)}\n') | |
print(f'Min number of trials for game to finish: {np.argmax(res[:,-1] != 0)}\n') | |
print(f'For Mode to first achieve {N=}, number of trials required was: {np.argmax(np.argmax(res, axis=1))}\n') | |
print(f'Max number of trials for game to finish: {len(norm)}\n') | |
possible_Xis = np.where(~np.isclose(norm.flatten(), 0))[0] | |
for ind in possible_Xis: | |
# Zoom in on non-zero elements in the bar chart | |
non_zero_x = ~np.isclose(res[ind], 0) | |
dist_start = np.argmax(non_zero_x) | |
dist_end = N - np.argmax(non_zero_x[::-1]) + 1 | |
r = np.arange(dist_start, dist_end) | |
# Compute the theoretical distribution. | |
# Inspired by `https://stats.stackexchange.com/a/506373/251950` | |
# | |
# It is necessary to use `int()` to convert from numpy | |
# fixed-byte-length integers to Python's built-in `int` type | |
# because the latter cannot overflow, and many of these | |
# intermediate calculations produce very large numbers | |
t = np.array([ | |
comb(N, int(x)) * sum( | |
(-1) ** i * comb(int(x), i) * (int(x) - i) ** (n * int(ind)) | |
for i in range(int(x) + 1) | |
) / N ** (n * int(ind)) | |
for x in r | |
]) * norm[ind] | |
# Plot | |
fig, ax = plt.subplots() | |
ax.set_ylabel("Probability") | |
ax.set_xticks(r) | |
ax2 = ax.twinx() | |
ax2.set_ylabel("Simulation Occurrences") | |
plt.title(f'Distribution of X_{ind}') | |
plt.bar(r - 0.2, res[ind, dist_start:dist_end], width=0.4, label='Empirical', color='red') | |
plt.bar(r + 0.2, t, width=0.4, label='Theoretical', color='blue') | |
plt.show() | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment