@eigenfoo
Last active August 31, 2018 14:42
Solving a finitely-supported multi-armed bandit with Thompson sampling
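Each arm pays out a reward in (0, 1) (here, logit-normal). Because the rewards are bounded, each one can be treated as the success probability of a single Bernoulli trial: flip a coin that lands heads with probability equal to the reward, and update a Beta posterior on the binary outcome. Thompson sampling then draws one sample from each arm's posterior and pulls the arm with the largest draw.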
import numpy as np
from scipy.special import expit


def make_bandits(params):
    def pull(arm, size=None):
        while True:
            # Logit-normal distributed rewards (or any distribution with
            # finite support). `expit` is the inverse of `logit`, so rewards
            # always lie in (0, 1).
            reward = expit(np.random.normal(loc=params[arm], scale=1, size=size))
            yield reward

    return pull, len(params)


def bayesian_strategy(pull, num_bandits):
    num_rewards = np.zeros(num_bandits)
    num_trials = np.zeros(num_bandits)

    while True:
        # Sample from each bandit's Beta posterior (Beta(2, 2) prior,
        # updated with successes and failures so far), and choose the
        # arm with the largest draw.
        choice = np.argmax(np.random.beta(2 + num_rewards,
                                          2 + num_trials - num_rewards))

        # Pull the chosen bandit's arm.
        reward = next(pull(choice))

        # Sample a Bernoulli with probability of success = reward.
        # Remember, the reward is normalized to lie in [0, 1], so it can
        # be interpreted as a success probability.
        outcome = np.random.binomial(n=1, p=reward)

        # Update the posterior counts.
        num_rewards[choice] += outcome
        num_trials[choice] += 1

        yield choice, reward, num_rewards, num_trials


if __name__ == '__main__':
    pull, num_bandits = make_bandits([0.2, 1.8, 2])
    play = bayesian_strategy(pull, num_bandits)

    for _ in range(100):
        choice, reward, num_rewards, num_trials = next(play)
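As a quick sanity check (not part of the original gist), one might track how often each arm is pulled; the arm with the largest entry in `params` should collect the bulk of the pulls as the posteriors sharpen. A minimal sketch, assuming the two functions above are available in the same session:

import numpy as np

pull, num_bandits = make_bandits([0.2, 1.8, 2])
play = bayesian_strategy(pull, num_bandits)

for _ in range(1000):
    choice, reward, num_rewards, num_trials = next(play)

# Arm 2 (params[2] = 2) has the highest mean reward, so it should
# dominate the pull counts; np.maximum guards against a never-pulled arm.
print("pulls per arm:          ", num_trials)
print("empirical success rates:", num_rewards / np.maximum(num_trials, 1))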