import numpy as np

import utils  # create_state_action_dictionary and greedy_policy are assumed
              # to be defined alongside this snippet in the accompanying code


def double_Q_learning(env, episodes=100, step_size=0.01, exploration_rate=0.01, discount_factor=0.99):
    policy = utils.create_random_policy(env)  # Random policy, only used to build the Q dictionaries
    # 1. Initialize the two value dictionaries, formatted: { S1: { A1: 0.0, A2: 0.0, ... }, ... }
    Q_1 = create_state_action_dictionary(env, policy)
    Q_2 = create_state_action_dictionary(env, policy)
    # 2. Loop through the number of episodes
    for episode in range(episodes):
        env.reset()  # Gym environment reset
        S = env.env.s  # 3. Get the initial state
        finished = False
        # 4. Loop until the end of the episode
        while not finished:
            # 5. Sum the two tables to get the behaviour values
            Q = {s: {a: av + Q_2[s][a] for a, av in sv.items()} for s, sv in Q_1.items()}
            # 6. Choose the action: epsilon-greedy with respect to Q_1 + Q_2
            if np.random.uniform() < exploration_rate:
                A = np.random.choice(list(Q[S].keys()))
            else:
                A = greedy_policy(Q)[S]
            S_prime, reward, finished, _ = env.step(A)  # 7. Take the step
            # 8. With 50% probability update Q_1, otherwise Q_2
            if np.random.uniform() < 0.5:
                # 9. Update rule: Q_1 selects the next action, Q_2 evaluates it
                A_star = max(Q_1[S_prime], key=Q_1[S_prime].get)
                Q_1[S][A] += step_size * (reward + discount_factor * Q_2[S_prime][A_star] - Q_1[S][A])
            else:
                # 9. Update rule: Q_2 selects the next action, Q_1 evaluates it
                A_star = max(Q_2[S_prime], key=Q_2[S_prime].get)
                Q_2[S][A] += step_size * (reward + discount_factor * Q_1[S_prime][A_star] - Q_2[S][A])
            # 10. Update the state for the next step
            S = S_prime
    Q = {s: {a: av + Q_2[s][a] for a, av in sv.items()} for s, sv in Q_1.items()}
    return greedy_policy(Q), Q
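

# A minimal usage sketch, assuming a discrete-state Gym environment such as
# FrozenLake-v0 (which matches the old 4-tuple env.step API used above);
# the hyperparameter values below are illustrative only.
if __name__ == "__main__":
    import gym

    env = gym.make("FrozenLake-v0")
    policy, Q = double_Q_learning(env, episodes=1000, step_size=0.1, exploration_rate=0.1)
    print(policy)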