@jknthn
Created June 6, 2018 19:53
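
Tabular Q-learning for a discrete Gym environment (e.g. FrozenLake): a Q table maps each state to per-action value estimates, and every step moves Q[S][A] toward the bootstrapped target reward + discount_factor * max(Q[S_prime].values()).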
import random

import utils  # helper module from the accompanying article

def Q_learning(env, episodes=100, step_size=0.01, discount_factor=0.99, exploration_rate=0.01):
    policy = utils.create_random_policy(env)  # Random policy, only needed by the util that builds the Q table
    Q = create_state_action_dictionary(env, policy)  # 1. Initialize Q dictionary, formatted: { S1: { A1: 0.0, A2: 0.0, ... }, ... }
    # 2. Loop through the number of episodes
    for episode in range(episodes):
        env.reset()  # Gym environment reset
        S = env.env.s  # 3. Get the initial state
        finished = False
        # 4. Loop to the end of the episode
        while not finished:
            # 5. Decide on the action: explore with probability exploration_rate, otherwise act greedily w.r.t. Q
            if random.random() < exploration_rate:
                A = random.choice(list(Q[S].keys()))
            else:
                A = greedy_policy(Q)[S]
            S_prime, reward, finished, _ = env.step(A)  # 6. Take the next step
            # 7. Update rule: move Q[S][A] toward reward + discount_factor * max_a Q[S_prime][a]
            Q[S][A] = Q[S][A] + step_size * (reward + discount_factor * max(Q[S_prime].values()) - Q[S][A])
            S = S_prime  # 8. Update state for the next step
    return greedy_policy(Q), Q
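
The helpers (utils.create_random_policy, create_state_action_dictionary, greedy_policy) are defined elsewhere in the article this gist accompanies. Below is a minimal sketch of what they could look like, assuming a discrete Gym environment and dictionaries shaped like the comments above; these implementations are assumptions, not the gist's own code:

def create_random_policy(env):
    # Uniform random policy: {state: {action: probability, ...}, ...}
    # (the gist imports this from its utils module)
    n_actions = env.action_space.n
    return {s: {a: 1.0 / n_actions for a in range(n_actions)}
            for s in range(env.observation_space.n)}

def create_state_action_dictionary(env, policy):
    # Q table with every (state, action) pair from the policy initialized to 0.0
    return {s: {a: 0.0 for a in actions} for s, actions in policy.items()}

def greedy_policy(Q):
    # Map each state to the action with the highest current Q value
    return {s: max(actions, key=actions.get) for s, actions in Q.items()}

A quick way to exercise the function, assuming the classic pre-0.26 Gym API the gist uses (env.step returns a 4-tuple, env.env.s exposes the current state) and the FrozenLake-v0 environment:

import gym

env = gym.make('FrozenLake-v0')
policy, Q = Q_learning(env, episodes=10000, step_size=0.1, discount_factor=0.99, exploration_rate=0.1)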