Skip to content

Instantly share code, notes, and snippets.

@jknthn
Created October 25, 2018 08:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jknthn/74cfe9e94bf59134f3a4958979591a99 to your computer and use it in GitHub Desktop.
Save jknthn/74cfe9e94bf59134f3a4958979591a99 to your computer and use it in GitHub Desktop.
def generate_problem(k):
    """Create a random k-armed bandit problem.

    Args:
        k: Number of arms (actions) in the bandit.

    Returns:
        A 1-D NumPy array of length ``k``. Each entry is drawn from a
        normal distribution with mean 0 and standard deviation 1 and is
        used as the mean reward of the corresponding arm.
    """
    # The size must follow k; a fixed size of 10 breaks any caller that
    # indexes arms in range(k) with k != 10.
    return np.random.normal(loc=0.0, scale=1, size=k)
def generate_reward(problem, action):
    """Sample a noisy reward for taking ``action`` on ``problem``.

    Args:
        problem: Indexable collection of per-arm mean rewards.
        action: Index of the arm being pulled.

    Returns:
        One draw from a normal distribution whose mean is
        ``problem[action]`` and whose standard deviation is 1.
    """
    expected_reward = problem[action]
    return np.random.normal(expected_reward, 1)
def k_bandit(problem, k, steps, exploration_rate):
    """Run an epsilon-greedy agent on a k-armed bandit.

    Args:
        problem: Indexable collection of per-arm mean rewards, passed
            through to ``generate_reward``.
        k: Number of arms; actions are the integers ``0 .. k - 1``.
        steps: Number of arm pulls to simulate.
        exploration_rate: Threshold compared against a uniform draw from
            [0, 1]; when the draw is below it, a random arm is pulled
            instead of the greedy one.

    Returns:
        A ``(Q, N)`` tuple of dicts keyed by action index: ``Q`` holds the
        estimated mean reward of each arm and ``N`` the number of times
        each arm was pulled.
    """
    Q = {arm: 0 for arm in range(k)}  # 1. Value function
    N = {arm: 0 for arm in range(k)}  # 2. Number of actions, for update rule
    for _ in range(steps):  # 3. Main loop
        # 4. Decide whether this step explores or exploits
        explore = random.uniform(0, 1) < exploration_rate
        if explore:
            # 5. Exploration: choose a random action (upper bound k - 1 is
            # inclusive, assuming the standard-library ``random`` module)
            action = random.randint(0, k - 1)
        else:
            # 6. Exploitation: action with the maximum estimated reward;
            # ties resolve to the first such key in dict order
            action = max(Q, key=Q.get)
        reward = generate_reward(problem, action)  # 7. Get reward for current action
        N[action] += 1  # 8. Update action number
        # 9. Incremental sample-average update of the value estimate
        Q[action] += (1 / N[action]) * (reward - Q[action])
    # Hand the learned estimates and pull counts back to the caller;
    # without this the whole simulation result is discarded.
    return Q, N
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment