timgianitsos/random_process_modes.py

## random_process_modes.py
from math import comb

import numpy as np
import matplotlib.pyplot as plt
from tqdm import trange

'''
There are `N` items. We select `n` (NOT necessarily unique) items per round with replacement. Assume `1 <= n`.
Consider the Random Process X_0, X_1, X_2, ....
Each `X_i` represents the number of unique items selected through round `i`.

For example:
Before the first selection, we would expect no unique items chosen. Thus, `P(X_0 = 0) = 1.0`
If `N` is much larger than `n`, we would expect all selections in the first round to be unique. Thus, `P(X_1 = n) ≈ 1.0`

What is the lowest value of `i` such that the Mode of `X_i` is `N`?
In other words, how many trials must occur in order for the most likely outcome to be the selection of
all the elements with none left over?

The following code simulates this with Monte Carlo by producing and displaying each R.V. in the random process.
It also displays the theoretical distribution of each R.V. inspired by the following PMF.
`https://stats.stackexchange.com/a/506373/251950`

It is worth noting that there seems to be a discrepancy (just by looking at a glance) between the theoretical
distribution and empirical distribution for later timesteps (but no discrepancy for earlier timesteps).
It is highly unlikely to me that this is a bug because very few of the simulations go to late timesteps
so those distributions should be much much noisier.
'''

def main():
    N = 100
    n = 50
    assert 1 <= n

    # We use this formula as a guess for the upper bound to reasonably expect a game
    # to last during simulation.
    # The formula is somewhat arbitrary - there is theoretically no upper
    # bound on how long a game can last before all items are selected as long as
    # selection is with replacement. However, large numbers are not encountered
    # in practice.
    num_Xs = (N // n + 1) * 30

    res = np.zeros((num_Xs, N + 1), dtype=np.int32)
    num_simulations = 50000
    res[0,0] += num_simulations # X_0 always has all probability on 0
    for _ in trange(num_simulations):
        trial = np.zeros(N, dtype=bool)
        trial_sum = 0
        i = 1
        while trial_sum < N:
            sample = np.random.randint(low=0, high=N, size=n, dtype=np.uint32)
            trial[sample] = True
            trial_sum = trial.sum()
            res[i, trial_sum] += 1
            i += 1

    # Find the first `i` to which none of the simulated games went
    norm = res.sum(axis=1).reshape(-1, 1)
    last_Xi = np.argmin(norm.flatten())
    res = res[:last_Xi]
    norm = norm[:last_Xi]

    assert not any(np.isclose(norm, 0))
    prob = res / norm # normalize as probabilities

    print(f'{N = }')
    print(f'{n = }')
    print(f'{num_simulations = }\n')
    print(f'Modes of each `X_i`: {list(np.argmax(res, axis=1))}\n')
    print(f'Means of each `X_i`: {list(float(f"{v:.5f}") for v in prob @ np.arange(N + 1))}\n')
    print(f'Percentage of games traversed through each `X_i`: {list(float(f"{v:.5f}") for v in norm.flatten() / num_simulations)}\n')
    print(f'Min number of trials for game to finish: {np.argmax(res[:,-1] != 0)}\n')
    print(f'For Mode to first achieve {N=}, number of trials required was: {np.argmax(np.argmax(res, axis=1))}\n')
    print(f'Max number of trials for game to finish: {len(norm)}\n')
    possible_Xis = np.where(~np.isclose(norm.flatten(), 0))[0]
    for ind in possible_Xis:

        # Zoom in on non-zero elements in the bar chart
        non_zero_x = ~np.isclose(res[ind], 0)
        dist_start = np.argmax(non_zero_x)
        dist_end = N - np.argmax(non_zero_x[::-1]) + 1
        r = np.arange(dist_start, dist_end)

        # Compute the theoretical distribution.
        # Inspired by `https://stats.stackexchange.com/a/506373/251950`
        #
        # It is necessary to use `int()` to convert from numpy
        # fixed-byte-length integers to Python's built-in `int` type
        # because the latter cannot overflow, and many of these
        # intermediate calculations produce very large numbers
        t = np.array([
            comb(N, int(x)) * sum(
                (-1) ** i * comb(int(x), i) * (int(x) - i) ** (n * int(ind))
                for i in range(int(x) + 1)
            ) / N ** (n * int(ind))
            for x in r
        ]) * norm[ind]

        # Plot
        fig, ax = plt.subplots()
        ax.set_ylabel("Probability")
        ax.set_xticks(r)
        ax2 = ax.twinx()
        ax2.set_ylabel("Simulation Occurrences")
        plt.title(f'Distribution of X_{ind}')
        plt.bar(r - 0.2, res[ind, dist_start:dist_end], width=0.4, label='Empirical', color='red')
        plt.bar(r + 0.2, t, width=0.4, label='Theoretical', color='blue')
        plt.show()

if __name__ == '__main__':
    main()
	from math import comb

	import numpy as np
	import matplotlib.pyplot as plt
	from tqdm import trange

	'''
	There are `N` items. We select `n` (NOT necessarily unique) items per round with replacement. Assume `1 <= n`.
	Consider the Random Process X_0, X_1, X_2, ....
	Each `X_i` represents the number of unique items selected through round `i`.

	For example:
	Before the first selection, we would expect no unique items chosen. Thus, `P(X_0 = 0) = 1.0`
	If `N` is much larger than `n`, we would expect all selections in the first round to be unique. Thus, `P(X_1 = n) ≈ 1.0`

	What is the lowest value of `i` such that the Mode of `X_i` is `N`?
	In other words, how many trials must occur in order for the most likely outcome to be the selection of
	all the elements with none left over?

	The following code simulates this with Monte Carlo by producing and displaying each R.V. in the random process.
	It also displays the theoretical distribution of each R.V. inspired by the following PMF.
	`https://stats.stackexchange.com/a/506373/251950`

	It is worth noting that there seems to be a discrepancy (just by looking at a glance) between the theoretical
	distribution and empirical distribution for later timesteps (but no discrepancy for earlier timesteps).
	It is highly unlikely to me that this is a bug because very few of the simulations go to late timesteps
	so those distributions should be much much noisier.
	'''

	def main():
	N = 100
	n = 50
	assert 1 <= n

	# We use this formula as a guess for the upper bound to reasonably expect a game
	# to last during simulation.
	# The formula is somewhat arbitrary - there is theoretically no upper
	# bound on how long a game can last before all items are selected as long as
	# selection is with replacement. However, large numbers are not encountered
	# in practice.
	num_Xs = (N // n + 1) * 30

	res = np.zeros((num_Xs, N + 1), dtype=np.int32)
	num_simulations = 50000
	res[0,0] += num_simulations # X_0 always has all probability on 0
	for _ in trange(num_simulations):
	trial = np.zeros(N, dtype=bool)
	trial_sum = 0
	i = 1
	while trial_sum < N:
	sample = np.random.randint(low=0, high=N, size=n, dtype=np.uint32)
	trial[sample] = True
	trial_sum = trial.sum()
	res[i, trial_sum] += 1
	i += 1

	# Find the first `i` to which none of the simulated games went
	norm = res.sum(axis=1).reshape(-1, 1)
	last_Xi = np.argmin(norm.flatten())
	res = res[:last_Xi]
	norm = norm[:last_Xi]

	assert not any(np.isclose(norm, 0))
	prob = res / norm # normalize as probabilities

	print(f'{N = }')
	print(f'{n = }')
	print(f'{num_simulations = }\n')
	print(f'Modes of each `X_i`: {list(np.argmax(res, axis=1))}\n')
	print(f'Means of each `X_i`: {list(float(f"{v:.5f}") for v in prob @ np.arange(N + 1))}\n')
	print(f'Percentage of games traversed through each `X_i`: {list(float(f"{v:.5f}") for v in norm.flatten() / num_simulations)}\n')
	print(f'Min number of trials for game to finish: {np.argmax(res[:,-1] != 0)}\n')
	print(f'For Mode to first achieve {N=}, number of trials required was: {np.argmax(np.argmax(res, axis=1))}\n')
	print(f'Max number of trials for game to finish: {len(norm)}\n')
	possible_Xis = np.where(~np.isclose(norm.flatten(), 0))[0]
	for ind in possible_Xis:

	# Zoom in on non-zero elements in the bar chart
	non_zero_x = ~np.isclose(res[ind], 0)
	dist_start = np.argmax(non_zero_x)
	dist_end = N - np.argmax(non_zero_x[::-1]) + 1
	r = np.arange(dist_start, dist_end)

	# Compute the theoretical distribution.
	# Inspired by `https://stats.stackexchange.com/a/506373/251950`
	#
	# It is necessary to use `int()` to convert from numpy
	# fixed-byte-length integers to Python's built-in `int` type
	# because the latter cannot overflow, and many of these
	# intermediate calculations produce very large numbers
	t = np.array([
	comb(N, int(x)) * sum(
	(-1) ** i * comb(int(x), i) * (int(x) - i) ** (n * int(ind))
	for i in range(int(x) + 1)
	) / N ** (n * int(ind))
	for x in r
	]) * norm[ind]

	# Plot
	fig, ax = plt.subplots()
	ax.set_ylabel("Probability")
	ax.set_xticks(r)
	ax2 = ax.twinx()
	ax2.set_ylabel("Simulation Occurrences")
	plt.title(f'Distribution of X_{ind}')
	plt.bar(r - 0.2, res[ind, dist_start:dist_end], width=0.4, label='Empirical', color='red')
	plt.bar(r + 0.2, t, width=0.4, label='Theoretical', color='blue')
	plt.show()

	if __name__ == '__main__':
	main()