@fabrizioc1 · Created June 3, 2019 06:34
Reinforcement Learning: N-Armed Bandit
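An epsilon-greedy agent on the classic N-armed bandit testbed. For each epsilon value, the script runs 2000 independent 10-armed bandit problems, each played for 1000 steps using sample-average reward estimates and Gaussian rewards, then plots the percentage of plays on which the optimal arm was chosen and the average reward per play.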
from __future__ import division
import random
import numpy as np
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
# epsilon values
EPSILON_VALUES = [0.0, 0.01, 0.1]
# number of plays per sample
N_PLAYS = 1000
# number of samples per epsilon
N_SAMPLES = 2000
# number of options (arms)
N_ARMS = 10
# the mean of the distribution from which the biases are drawn
BIAS_MEAN = 0.0
# the standard deviation of the distribution from which the biases are drawn
BIAS_SIGMA = 1.0
# the standard deviation of the random reward from the arms
ARM_SIGMA = 1.0
# initial reward estimate; an optimistically high value would encourage extra exploration
INIT_REWARD = 0.0
total_rewards = np.zeros((N_PLAYS, len(EPSILON_VALUES)))
optimal_action = np.zeros((N_PLAYS, len(EPSILON_VALUES)))
# each model is a different epsilon value
for model in range(len(EPSILON_VALUES)):
    epsilon = EPSILON_VALUES[model]
    for sample in range(N_SAMPLES):
        # random bias (true value) per arm
        bias = np.random.normal(BIAS_MEAN, BIAS_SIGMA, N_ARMS)
        # index of the arm with the highest true value
        best_index = np.argmax(bias)
        # estimated reward for each arm
        rewards = np.zeros(N_ARMS) + INIT_REWARD
        # number of times each arm was played
        played = np.zeros(N_ARMS)
        for play in range(N_PLAYS):
            # epsilon-greedy: explore with probability epsilon, otherwise exploit
            if random.uniform(0, 1) > epsilon:
                # greedy case:
                # select the action with the highest estimated reward
                choice = np.argmax(rewards)
            else:
                # exploratory case:
                # select an action uniformly at random
                choice = random.randrange(N_ARMS)
            played[choice] += 1
            # reward is the arm's true value plus Gaussian noise with std dev ARM_SIGMA
            current_reward = bias[choice] + random.gauss(0.0, ARM_SIGMA)
            total_rewards[play, model] += current_reward
            # is this the optimal choice?
            if choice == best_index:
                optimal_action[play, model] += 1
            # update the estimated reward (running sample average)
            if played[choice] >= 2:
                rewards[choice] = (rewards[choice] * (played[choice] - 1) + current_reward) / played[choice]
            else:
                rewards[choice] = current_reward
# percentage optimal action was chosen per play
percent_optimal_action = optimal_action / N_SAMPLES
# average reward per play
average_reward = total_rewards / N_SAMPLES
# plot results
x = np.arange(0, N_PLAYS)
fig = plt.figure(figsize=(15.0, 15.0))
fig.suptitle("%d-Armed Bandit" % N_ARMS)
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)
for model in range(0, len(EPSILON_VALUES)):
    epsilon = EPSILON_VALUES[model]
    y1 = percent_optimal_action[:, model]
    ax1.plot(x, y1, label='epsilon=%.3f' % epsilon)
    y2 = average_reward[:, model]
    ax2.plot(x, y2, label='epsilon=%.3f' % epsilon)
ax1.set_xlabel('Plays')
ax1.set_ylabel('% Optimal Action')
ax1.legend()
ax2.set_xlabel('Plays')
ax2.set_ylabel('Average Reward')
ax2.legend()
plt.show()
[Attached image: N_Armed_Bandit (output plot of % optimal action and average reward)]

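As an aside, the update rewards[choice] = (rewards[choice] * (played[choice] - 1) + current_reward) / played[choice] is just the incremental form of the sample mean. A minimal standalone sketch (not part of the original gist; the reward values are made up) checking that equivalence:

import numpy as np

# hypothetical rewards observed from a single arm
observed = [1.2, -0.3, 0.8, 2.1]

estimate, n = 0.0, 0
for r in observed:
    n += 1
    # incremental sample mean: Q_n = Q_{n-1} + (r - Q_{n-1}) / n
    estimate += (r - estimate) / n

# the incremental estimate matches the plain average of all rewards
assert np.isclose(estimate, np.mean(observed))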