Last active
July 6, 2022 13:31
-
-
Save mick001/f53dcd51ff59817783a4b505a4835453 to your computer and use it in GitHub Desktop.
Q-learning in Python. Simple example. Full article at: https://firsttimeprogrammer.blogspot.com/2016/09/getting-ai-smarter-with-q-learning.html#more
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np

# Reward matrix: R[s, a] is the immediate reward for taking action a (moving
# to state a) from state s.  Negative entries mark moves that are not
# available, 0 is an available move with no reward, and 100 rewards a move
# into state 5.
R = np.matrix([
    [-1, -1, -1, -1,  0,  -1],
    [-1, -1, -1,  0, -1, 100],
    [-1, -1, -1,  0, -1,  -1],
    [-1,  0,  0, -1,  0,  -1],
    [-1,  0,  0, -1, -1, 100],
    [-1,  0, -1, -1,  0, 100],
])

# Q matrix: learned action values, same shape as R, initialised to zero.
Q = np.matrix(np.zeros(R.shape))

# Gamma: discount factor applied to the best value reachable from the next
# state in the Q-learning update.
gamma = 0.8

# Initial state. (Usually to be chosen at random)
initial_state = 1
def available_actions(state, rewards=None):
    """Return the indices of the actions that can be taken from ``state``.

    An action is available when its entry in the reward-table row for
    ``state`` is non-negative.

    Args:
        state: Row index of the current state.
        rewards: Optional 2-D reward table (``np.matrix``, ``ndarray`` or
            nested sequence).  Defaults to the module-level ``R``.

    Returns:
        A 1-D integer array of available column indices, in ascending order.
    """
    if rewards is None:
        rewards = R
    # np.asarray yields a plain 1-D row for np.matrix and ndarray input alike,
    # so the lookup does not depend on np.matrix's always-2-D row semantics.
    row = np.asarray(rewards)[state]
    return np.flatnonzero(row >= 0)
# Get available actions in the current state
available_act = available_actions(initial_state)
def sample_next_action(available_actions_range):
    """Pick one action uniformly at random from ``available_actions_range``.

    Args:
        available_actions_range: Non-empty 1-D sequence or array of action
            indices to choose from.

    Returns:
        The chosen action as a Python ``int``.

    Raises:
        ValueError: If ``available_actions_range`` is empty (raised by
            ``np.random.choice``).
    """
    # Sample from the argument rather than from the module-level
    # ``available_act``, so the result always reflects what the caller passed.
    # Omitting ``size`` makes choice return a scalar, so int() is not applied
    # to a one-element array (deprecated since NumPy 1.25).
    return int(np.random.choice(available_actions_range))
# Sample next action to be performed
action = sample_next_action(available_act)
def update(current_state, action, gamma, q_table=None, rewards=None):
    """Apply one Q-learning update for taking ``action`` in ``current_state``.

    Writes, in place::

        Q[current_state, action] = R[current_state, action]
                                   + gamma * max(Q[action, :])

    The action index is also used as the index of the next state, which is
    why the maximum is taken over row ``action`` of the Q table.

    Args:
        current_state: Row index of the state the action is taken from.
        action: Column index of the action taken.
        gamma: Discount factor applied to the best next-state value.
        q_table: Optional 2-D Q table to update in place.  Defaults to the
            module-level ``Q``.
        rewards: Optional 2-D reward table.  Defaults to the module-level
            ``R``.

    Returns:
        None.
    """
    if q_table is None:
        q_table = Q
    if rewards is None:
        rewards = R
    # All tied arg-max entries hold the same value, so the row maximum can be
    # read directly; picking one of the tied indices at random is unnecessary.
    max_value = np.max(q_table[action])
    # Q learning formula
    q_table[current_state, action] = (
        rewards[current_state, action] + gamma * max_value
    )
# Update Q matrix
update(initial_state, action, gamma)
#-------------------------------------------------------------------------------
# Training

# Train over 10 000 iterations. (Re-iterate the process above).
# Each iteration starts from a uniformly random state, takes one random
# available action and applies a single Q update.
for _ in range(10000):
    current_state = np.random.randint(0, Q.shape[0])
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

# Normalize the "trained" Q matrix so that its largest entry is 100
print("Trained Q matrix:")
print(Q / np.max(Q) * 100)
#-------------------------------------------------------------------------------
# Testing

# Goal state = 5
# Best sequence path starting from 2 -> 2, 3, 1, 5
goal_state = 5
# Safety cap: the greedy walk below has no other bound, so a Q matrix that has
# not converged could otherwise cycle forever.
max_steps = 1000

current_state = 2
steps = [current_state]

while current_state != goal_state:
    if len(steps) > max_steps:
        raise RuntimeError(
            "Greedy walk did not reach the goal state within %d steps" % max_steps
        )
    # Plain 1-D view of the current row, independent of np.matrix semantics.
    q_row = np.asarray(Q)[current_state]
    best_actions = np.flatnonzero(q_row == q_row.max())
    # Break ties between equally good actions at random.  Without ``size``,
    # choice returns a scalar, so int() is not applied to a one-element array
    # (deprecated since NumPy 1.25).
    next_step_index = int(np.random.choice(best_actions))
    steps.append(next_step_index)
    current_state = next_step_index

# Print selected sequence of steps
print("Selected path:")
print(steps)
#-------------------------------------------------------------------------------
# OUTPUT
#-------------------------------------------------------------------------------
#
# Trained Q matrix:
#[[   0.     0.     0.     0.    80.     0. ]
# [   0.     0.     0.    64.     0.   100. ]
# [   0.     0.     0.    64.     0.     0. ]
# [   0.    80.    51.2    0.    80.     0. ]
# [   0.    80.    51.2    0.     0.   100. ]
# [   0.    80.     0.     0.    80.   100. ]]
#
# Selected path:
# [2, 3, 1, 5]
#
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment