Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Ten Armed Bandit
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import random
class TenArmedBandit(object):
    """Ten-armed bandit environment with noisy rewards.

    The true value of every arm is sampled once, from a standard normal
    distribution, when the environment is created and is never changed
    afterwards.

    Attributes:
        action_space: Number of arms (always 10).
        q_true: Array holding the true mean reward of each arm.
        t: Number of times ``step`` has been called.
    """

    def __init__(self):
        self.action_space = 10
        # One true action value per arm.
        self.q_true = np.random.randn(self.action_space)
        self.t = 0

    def step(self, action):
        """Pull arm ``action`` and return its noisy reward.

        The reward is the arm's true value plus one standard-normal noise
        sample. The step counter is advanced by one.
        """
        # Draw the noise before anything else so the random stream is
        # consumed in the same order on every call.
        sample = np.random.randn(1)
        self.t = self.t + 1
        return self.q_true[action] + sample[0]
class Agent(object):
    """Sample-average action-value learner with optional epsilon-greedy play.

    Attributes:
        Q: Current reward estimate for each action (starts at zero).
        K: Number of times each action has been observed.
        eps: Exploration probability, or ``None`` for purely greedy play.
        t: Total number of observations recorded so far.
    """

    def __init__(self, action_space, eps=None):
        """Create an agent for ``action_space`` actions.

        Args:
            action_space: Number of available actions.
            eps: Probability of picking a uniformly random action in
                ``act``; ``None`` disables exploration.
        """
        self.Q = np.zeros(action_space)
        self.K = np.zeros(action_space)
        self.eps = eps
        self.t = 0

    def act(self):
        """Choose the next action.

        With probability ``eps`` a uniformly random action is returned;
        otherwise the action with the highest current estimate is returned
        (``np.argmax`` resolves ties in favour of the lowest index).
        """
        if self.eps is not None and self.eps > random.random():
            # Sample over this agent's own action count rather than a fixed
            # 10, so exploration stays in range for any action_space.
            return np.random.randint(0, len(self.Q))
        return np.argmax(self.Q)

    def observe(self, action, reward):
        """Fold ``reward`` into the running mean estimate for ``action``."""
        self.K[action] += 1
        Ka = self.K[action]
        Qa = self.Q[action]
        # Rebuild the previous sum of rewards (Qa * (Ka - 1)), add the new
        # reward, and divide by the updated count to get the new mean.
        self.Q[action] = (Qa * (Ka - 1) + reward) / Ka
        self.t += 1
def learn(steps, bandits, eps=None):
    """Run independent bandit tasks and return the average reward per step.

    For each of ``bandits`` runs a fresh ``TenArmedBandit`` and a fresh
    ``Agent`` are created and played against each other for ``steps`` steps.

    Args:
        steps: Number of actions taken in every run.
        bandits: Number of independent runs to average over; must be at
            least 1 for the average to be defined.
        eps: Exploration probability passed to ``Agent`` (``None`` means
            purely greedy).

    Returns:
        Array of length ``steps`` whose entry ``k`` is the reward obtained
        at step ``k`` averaged over all runs.
    """
    rewards = np.zeros(steps)
    for _ in range(bandits):
        game = TenArmedBandit()
        # Size the agent from the environment instead of repeating the
        # literal arm count.
        agent = Agent(game.action_space, eps=eps)
        # Distinct index name: the run loop and the step loop no longer
        # share one variable.
        for step in range(steps):
            action = agent.act()
            reward = game.step(action)
            rewards[step] += reward
            agent.observe(action, reward)
    return rewards / bandits
# Average-reward curves over 2000 runs of 1000 steps each for a greedy agent
# and for epsilon-greedy agents with eps = 0.01 and eps = 0.1.
eps0 = learn(1000, 2000, eps=None)
eps001 = learn(1000, 2000, eps=0.01)
eps01 = learn(1000, 2000, eps=0.1)
# The x axis is derived from a returned curve: `rewards` only exists inside
# learn(), so it cannot be referenced at module level.
steps = np.arange(0, eps0.size)
# Red: greedy, blue: eps = 0.01, green: eps = 0.1.
plt.plot(steps, eps0, 'r', steps, eps001, 'b', steps, eps01, 'g')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.