@eigenfoo
Last active August 31, 2018 14:42
Solving a finitely-supported multi-armed bandit with Thompson sampling
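Each arm pays out a reward in (0, 1) (here, logit-normal). Because the rewards are bounded, each one can be treated as the success probability of a single Bernoulli trial: flip a coin that lands heads with probability equal to the reward, and update a Beta posterior on the binary outcome. Thompson sampling then draws one sample from each arm's posterior and pulls the arm with the largest draw.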
import numpy as np
from scipy.special import expit


def make_bandits(params):
    def pull(arm, size=None):
        while True:
            # Logit-normal distributed rewards (or any distribution with
            # finite support). `expit` is the inverse of `logit`, so rewards
            # always lie in (0, 1).
            reward = expit(np.random.normal(loc=params[arm], scale=1, size=size))
            yield reward

    return pull, len(params)


def bayesian_strategy(pull, num_bandits):
    num_rewards = np.zeros(num_bandits)
    num_trials = np.zeros(num_bandits)

    while True:
        # Sample from each bandit's Beta posterior (Beta(2, 2) prior,
        # updated with successes and failures so far), and choose the
        # arm with the largest draw.
        choice = np.argmax(np.random.beta(2 + num_rewards,
                                          2 + num_trials - num_rewards))

        # Pull the chosen bandit's arm.
        reward = next(pull(choice))

        # Sample a Bernoulli with probability of success = reward.
        # Remember, the reward is normalized to lie in [0, 1], so it can
        # be interpreted as a success probability.
        outcome = np.random.binomial(n=1, p=reward)

        # Update the posterior counts.
        num_rewards[choice] += outcome
        num_trials[choice] += 1

        yield choice, reward, num_rewards, num_trials


if __name__ == '__main__':
    pull, num_bandits = make_bandits([0.2, 1.8, 2])
    play = bayesian_strategy(pull, num_bandits)

    for _ in range(100):
        choice, reward, num_rewards, num_trials = next(play)
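As a quick sanity check (not part of the original gist), one might track how often each arm is pulled; the arm with the largest entry in `params` should collect the bulk of the pulls as the posteriors sharpen. A minimal sketch, assuming the two functions above are available in the same session:

import numpy as np

pull, num_bandits = make_bandits([0.2, 1.8, 2])
play = bayesian_strategy(pull, num_bandits)

for _ in range(1000):
    choice, reward, num_rewards, num_trials = next(play)

# Arm 2 (params[2] = 2) has the highest mean reward, so it should
# dominate the pull counts; np.maximum guards against a never-pulled arm.
print("pulls per arm:          ", num_trials)
print("empirical success rates:", num_rewards / np.maximum(num_trials, 1))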