Instantly share code, notes, and snippets.

Embed
What would you like to do?
def generate_problem(k):
    """Create a random k-armed bandit problem.

    Args:
        k: Number of actions (arms).

    Returns:
        A 1-D NumPy array of length ``k`` whose entries are drawn
        independently from a normal distribution with mean 0 and standard
        deviation 1. ``generate_reward`` uses entry ``i`` as the mean of the
        reward distribution for action ``i``.
    """
    # Size must follow k; a fixed size of 10 breaks any problem with k != 10
    # (too few entries for k > 10, unused entries for k < 10).
    return np.random.normal(loc=0.0, scale=1, size=k)
def generate_reward(problem, action):
    """Sample a noisy reward for taking ``action`` in ``problem``.

    Args:
        problem: Indexable collection of per-action mean rewards
            (e.g. the array returned by ``generate_problem``).
        action: Index of the chosen action within ``problem``.

    Returns:
        A single sample from a normal distribution centred on
        ``problem[action]`` with standard deviation 1.
    """
    mean_reward = problem[action]
    return np.random.normal(loc=mean_reward, scale=1)
def k_bandit(problem, k, steps, exploration_rate):
    """Run an epsilon-greedy agent on a k-armed bandit.

    At every step the agent picks a uniformly random action with probability
    ``exploration_rate``; otherwise it picks the action with the highest
    current value estimate (the lowest index wins ties, because ``max``
    returns the first maximal key). Estimates are maintained as incremental
    sample averages of the rewards observed for each action.

    Args:
        problem: Per-action mean rewards, passed through to
            ``generate_reward``.
        k: Number of actions; actions are the integers ``0 .. k - 1``.
        steps: Number of action selections to simulate.
        exploration_rate: Probability of exploring on a given step.

    Returns:
        A tuple ``(Q, N)`` of dicts keyed by action: ``Q[a]`` is the
        estimated mean reward of action ``a`` and ``N[a]`` is how many times
        it was selected.
    """
    Q = {action: 0 for action in range(k)}  # value estimate per action
    N = {action: 0 for action in range(k)}  # selection count per action

    for _ in range(steps):
        explore = random.uniform(0, 1) < exploration_rate
        if explore:
            # random.randint is inclusive on both ends, hence k - 1.
            action = random.randint(0, k - 1)
        else:
            # Greedy choice: action with the highest estimated reward.
            action = max(Q, key=Q.get)

        reward = generate_reward(problem, action)

        N[action] += 1
        # Incremental mean: Q_new = Q_old + (reward - Q_old) / n
        Q[action] += (reward - Q[action]) / N[action]

    # Hand the learned estimates and counts back to the caller; without this
    # the results of the simulation are discarded.
    return Q, N
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment