Skip to content

Instantly share code, notes, and snippets.

@mirqwa
Last active January 9, 2024 00:03
Show Gist options
  • Save mirqwa/bda6f72e04b6283a4948884f2a253cc9 to your computer and use it in GitHub Desktop.
Save mirqwa/bda6f72e04b6283a4948884f2a253cc9 to your computer and use it in GitHub Desktop.
10-armed bandit stationary
import numpy as np
import matplotlib.pyplot as plt
# Reward-noise variances to sweep; the script runs the full comparison once
# per value.
VARIANCES = [1, 0, 5, 10]

# Exploration settings to compare, keyed by the (initial) epsilon.
# Each entry holds the properties used when running and plotting that setting:
#   "color" - matplotlib format string for the curve,
#   "decay" - optional; when true, epsilon is decayed during the run,
#   "label" - optional; legend text used instead of the default "ε=<epsilon>".
EPSILONS = {
    0: {"color": "-g"},
    0.1: {"color": "-b"},
    0.01: {"color": "-r"},
    0.5: {"color": "-y"},
    0.2: {
        "color": "-k",
        "decay": True,
        "label": "0.2 with decay",
    },
}
def epsilon_greedy_action(action_values: np.ndarray, epsilon: float) -> int:
    """Choose an action index with the epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action is returned
    (exploration); otherwise the index of the largest value estimate is
    returned (exploitation). ``np.argmax`` resolves ties in favour of the
    lowest index.

    Args:
        action_values: Current value estimate for every action.
        epsilon: Exploration probability, expected to lie in ``[0, 1]``.

    Returns:
        The index of the selected action as a plain ``int``.
    """
    # Strict "<": np.random.random() samples the half-open interval [0, 1)
    # and can return exactly 0.0, so "<=" would let epsilon == 0 explore.
    if np.random.random() < epsilon:
        # Sample over however many actions were passed in, not a fixed 10.
        return int(np.random.choice(len(action_values)))
    return int(np.argmax(action_values))
def get_epsilons(epsilon: float, decay: bool, steps: int) -> list:
    """Build the per-step epsilon schedule for one run.

    Args:
        epsilon: Initial exploration rate.
        decay: When truthy, epsilon shrinks linearly to zero over ``steps``
            steps; when falsy no schedule is produced.
        steps: Number of time steps in the run.

    Returns:
        An empty list when ``decay`` is falsy (or ``steps`` is not positive).
        Otherwise a list of ``steps`` values: the first is already one
        decrement below ``epsilon`` and the last is exactly ``0.0``.
    """
    if not decay:
        return []
    # Closed form of "subtract epsilon / steps once per step". Computing each
    # entry directly avoids the rounding drift of repeated subtraction, which
    # could leave the final value slightly above or below zero, and it never
    # divides when steps == 0 because the range is empty.
    return [epsilon * (steps - step - 1) / steps for step in range(steps)]
def run_single_experiment(epsilon: float, decay=False, variance=1, steps=1000):
    """Simulate one epsilon-greedy run on a freshly drawn 10-armed bandit.

    The true mean of every arm is drawn from a standard normal distribution.
    At each step an arm is selected, a reward is sampled from a normal
    distribution centred on that arm's true mean, and the arm's estimate is
    updated as the running average of the rewards it has produced.

    Args:
        epsilon: Exploration rate (the starting rate when ``decay`` is set).
        decay: When truthy, use the per-step schedule from ``get_epsilons``
            instead of a constant epsilon.
        variance: Variance of the reward noise around each arm's true mean.
        steps: Number of time steps to simulate. Defaults to 1000.

    Returns:
        A tuple ``(rewards, optimal_action_percentage)`` of two lists with one
        entry per step: the reward received at that step, and the percentage
        of steps so far on which the arm with the highest true mean was
        selected.
    """
    n_arms = 10
    std = np.sqrt(variance)
    mu = np.random.normal(0, 1, n_arms)
    optimal_action = mu.argmax(axis=0)
    action_values = np.zeros(n_arms)
    action_counts = np.zeros(n_arms)
    epsilons = get_epsilons(epsilon, decay, steps)
    rewards = []
    optimal_action_count = 0
    optimal_action_percentage = []
    for i in range(steps):
        # An empty schedule means "no decay": keep the constant epsilon.
        if epsilons:
            epsilon = epsilons[i]
        if i < 50 and std == 0 and epsilon == 0:
            # Noise-free, purely greedy case: act uniformly at random for the
            # first 50 steps. Presumably this keeps the all-zero initial
            # estimates from locking the agent onto an arbitrary arm.
            selected_action = np.random.choice(n_arms)
        else:
            selected_action = epsilon_greedy_action(action_values, epsilon)
        if selected_action == optimal_action:
            optimal_action_count += 1
        actual_reward = np.random.normal(mu[selected_action], std)
        action_counts[selected_action] += 1
        # Incremental sample-average update: Q += (R - Q) / N.
        action_values[selected_action] += (
            actual_reward - action_values[selected_action]
        ) / action_counts[selected_action]
        optimal_action_percentage.append(optimal_action_count / (i + 1) * 100)
        # Record the reward actually received. The estimate of the chosen arm
        # is not the reward, and under greedy selection it is biased upwards,
        # which would overstate the "Average reward" curve.
        rewards.append(actual_reward)
    return rewards, optimal_action_percentage
def run_experiments(epsilon: float, decay=False, variance=1, runs=2000):
    """Average the outcome of many independent bandit runs.

    Args:
        epsilon: Exploration rate passed to every run.
        decay: Whether each run decays epsilon over time.
        variance: Reward-noise variance passed to every run.
        runs: Number of independent runs to average. Defaults to 2000.

    Returns:
        A tuple ``(mean_rewards, mean_optimal_percentages)`` of 1-D arrays
        holding, for every step, the mean over all runs of the two series
        returned by ``run_single_experiment``.

    Raises:
        ValueError: If ``runs`` is smaller than 1 (there would be nothing to
            average).
    """
    if runs < 1:
        raise ValueError("runs must be a positive integer")
    all_rewards = []
    optimal_percentages = []
    for _ in range(runs):
        rewards, optimal_action_percentage = run_single_experiment(
            epsilon, decay=decay, variance=variance
        )
        all_rewards.append(rewards)
        optimal_percentages.append(optimal_action_percentage)
    # Rows are runs and columns are steps, so axis 0 averages across runs.
    return (
        np.mean(all_rewards, axis=0),
        np.mean(optimal_percentages, axis=0),
    )
def plot_results(all_values, values_key, title, ylabel, filename):
    """Plot one curve per entry, save the figure to disk and display it.

    Args:
        all_values: Iterable of dicts. Each dict holds the series under
            ``values_key``, a ``"props"`` dict (``"color"`` format string and
            an optional ``"label"``) and the ``"epsilon"`` used for the
            default legend text.
        values_key: Key of the series to plot in every entry.
        title: Figure title.
        ylabel: Label of the y axis; the x axis is always "Steps".
        filename: Destination passed to ``plt.savefig``.
    """
    import os

    # Draw on a dedicated figure. With a backend where plt.show() does not
    # close the figure, reusing pyplot's implicit current figure would stack
    # the curves of earlier calls onto this plot.
    figure = plt.figure()
    for values in all_values:
        values_to_plot = values[values_key]
        props = values["props"]
        # Steps are numbered from 1 on the x axis.
        x = np.arange(1, len(values_to_plot) + 1)
        plt.plot(
            x,
            values_to_plot,
            props["color"],
            label=props.get("label", f"ε={values['epsilon']}"),
        )
    plt.title(title)
    plt.xlabel("Steps")
    plt.ylabel(ylabel)
    plt.legend()
    # savefig does not create missing directories, so make sure the target
    # directory exists (only meaningful for path-like destinations).
    if isinstance(filename, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(filename))
        if parent:
            os.makedirs(parent, exist_ok=True)
    plt.savefig(filename, dpi=199)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(figure)
def run_and_plot_results(variance=1):
    """Run every configured epsilon setting at one variance and plot the results.

    For each entry of ``EPSILONS`` the averaged experiment is run, then two
    figures are produced via ``plot_results``: the reward curves and the
    optimal-action percentage curves.

    Args:
        variance: Reward-noise variance used for all runs; it also appears in
            the plot titles and output file names.
    """
    reward_curves = []
    optimal_curves = []
    for epsilon, props in EPSILONS.items():
        # Only entries flagged with a truthy "decay" use a decaying epsilon.
        use_decay = bool(props.get("decay"))
        rewards, optimal_percentages = run_experiments(
            epsilon, decay=use_decay, variance=variance
        )
        reward_curves.append(dict(rewards=rewards, props=props, epsilon=epsilon))
        optimal_curves.append(
            dict(
                optimal_percentages=optimal_percentages,
                props=props,
                epsilon=epsilon,
            )
        )

    directory = "plots/bandits/stationary"
    plot_results(
        reward_curves,
        "rewards",
        f"Rewards with Variance={variance}",
        "Average reward",
        f"{directory}/rewards_var_{variance}.png",
    )
    plot_results(
        optimal_curves,
        "optimal_percentages",
        f"Optimal Percentages with Variance={variance}",
        "% Optimal action",
        f"{directory}/optimal_percentages_var_{variance}.png",
    )
if __name__ == "__main__":
    # Script entry point: run and plot the full comparison once for each
    # configured reward variance.
    for variance in VARIANCES:
        run_and_plot_results(variance=variance)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment