import numpy as np

import utils  # create_state_action_dictionary and greedy_policy are assumed
              # to be defined alongside this snippet in the accompanying code


def double_Q_learning(env, episodes=100, step_size=0.01, exploration_rate=0.01, discount_factor=0.99):
    policy = utils.create_random_policy(env)  # Random policy, only used to build the Q dictionaries
    # 1. Initialize the two value dictionaries, formatted: { S1: { A1: 0.0, A2: 0.0, ... }, ... }
    Q_1 = create_state_action_dictionary(env, policy)
    Q_2 = create_state_action_dictionary(env, policy)
    # 2. Loop through the number of episodes
    for episode in range(episodes):
        env.reset()  # Gym environment reset
        S = env.env.s  # 3. Get the initial state
        finished = False
        # 4. Loop until the end of the episode
        while not finished:
            # 5. Sum the two tables to get the behaviour values
            Q = {s: {a: av + Q_2[s][a] for a, av in sv.items()} for s, sv in Q_1.items()}
            # 6. Choose the action: epsilon-greedy with respect to Q_1 + Q_2
            if np.random.uniform() < exploration_rate:
                A = np.random.choice(list(Q[S].keys()))
            else:
                A = greedy_policy(Q)[S]
            S_prime, reward, finished, _ = env.step(A)  # 7. Take the step
            # 8. With 50% probability update Q_1, otherwise Q_2
            if np.random.uniform() < 0.5:
                # 9. Update rule: Q_1 selects the next action, Q_2 evaluates it
                A_star = max(Q_1[S_prime], key=Q_1[S_prime].get)
                Q_1[S][A] += step_size * (reward + discount_factor * Q_2[S_prime][A_star] - Q_1[S][A])
            else:
                # 9. Update rule: Q_2 selects the next action, Q_1 evaluates it
                A_star = max(Q_2[S_prime], key=Q_2[S_prime].get)
                Q_2[S][A] += step_size * (reward + discount_factor * Q_1[S_prime][A_star] - Q_2[S][A])
            # 10. Update the state for the next step
            S = S_prime
    Q = {s: {a: av + Q_2[s][a] for a, av in sv.items()} for s, sv in Q_1.items()}
    return greedy_policy(Q), Q
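

# A minimal usage sketch, assuming a discrete-state Gym environment such as
# FrozenLake-v0 (which matches the old 4-tuple env.step API used above);
# the hyperparameter values below are illustrative only.
if __name__ == "__main__":
    import gym

    env = gym.make("FrozenLake-v0")
    policy, Q = double_Q_learning(env, episodes=1000, step_size=0.1, exploration_rate=0.1)
    print(policy)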