@B-R-P
Created November 24, 2023 19:07
Q-Star Reinforcement Learning for Grid Navigation
import numpy as np
# Define constants for the grid world
GRID_SIZE = 5
NUM_ACTIONS = 4
NUM_EPISODES = 500
MAX_STEPS_PER_EPISODE = 50
# Define states (representing positions on the grid)
STATES = [(i, j) for i in range(GRID_SIZE) for j in range(GRID_SIZE)]
# Define actions (representing possible movements: up, down, left, right)
ACTIONS = [0, 1, 2, 3] # 0: up, 1: down, 2: left, 3: right
# Initialize Q-values for each state-action pair
initial_q_values = np.zeros((GRID_SIZE, GRID_SIZE, NUM_ACTIONS))
# Function to get the reward for a given state-action pair
def get_reward(state, action):
    next_state = get_next_state(state, action)
    # Positive reward if the agent reaches the goal, negative for obstacles, zero otherwise
    if next_state == (GRID_SIZE - 1, GRID_SIZE - 1):  # Goal state
        return 10
    elif next_state in [(1, 1), (2, 2), (3, 3)]:  # Obstacle states
        return -5
    else:
        return 0
# Function to get the next state for a given state-action pair
def get_next_state(state, action):
    i, j = state
    if action == 0:  # up
        return max(i - 1, 0), j
    elif action == 1:  # down
        return min(i + 1, GRID_SIZE - 1), j
    elif action == 2:  # left
        return i, max(j - 1, 0)
    elif action == 3:  # right
        return i, min(j + 1, GRID_SIZE - 1)
# Function to check if a state is a terminal state
def is_terminal_state(state):
    return state == (GRID_SIZE - 1, GRID_SIZE - 1)  # Goal state
# Function to check if the maximum number of steps is reached
def max_steps_reached():
    # Increment the shared step counter and report whether the episode limit is reached
    global step_count
    step_count += 1
    return step_count >= MAX_STEPS_PER_EPISODE
# Q-star Reinforcement Learning
def q_star_reinforcement_learning(S, A, initial_q_values, alpha, gamma, epsilon, max_episodes):
    Q = np.copy(initial_q_values)
    for episode in range(1, max_episodes + 1):
        global step_count
        step_count = 0  # Reset step count for each episode
        current_state = (0, 0)  # Start from the top-left corner
        while True:
            # Epsilon-greedy action selection, with exploration noise that decays over episodes
            if np.random.rand() < epsilon:
                current_action = np.random.choice(A)
            else:
                current_action = np.argmax(
                    Q[current_state[0], current_state[1], :]
                    + np.random.randn(NUM_ACTIONS) * (1.0 / (episode + 1))
                )
            reward = get_reward(current_state, current_action)
            next_state = get_next_state(current_state, current_action)
            # Temporal-difference update toward the bootstrapped target
            Q[current_state[0], current_state[1], current_action] += alpha * (
                reward
                + gamma * np.max(Q[next_state[0], next_state[1], :])
                - Q[current_state[0], current_state[1], current_action]
            )
            current_state = next_state
            if is_terminal_state(current_state) or max_steps_reached():
                break
    return Q  # Return the trained Q-values
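# For reference (comment added for illustration, not part of the original gist): the update
# applied inside the loop above is the standard tabular Q-learning rule,
#     Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a)),
# where s' = get_next_state(s, a) and the reward comes from get_reward(s, a).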
# Main program
step_count = 0  # Initialize the global step counter used by max_steps_reached
trained_q_values = q_star_reinforcement_learning(STATES, ACTIONS, initial_q_values, alpha=0.1, gamma=0.9, epsilon=0.2, max_episodes=NUM_EPISODES)
# Use the trained Q-values to navigate through the grid world
current_state = (0, 0)
while not is_terminal_state(current_state):
    action = np.argmax(trained_q_values[current_state[0], current_state[1], :])
    print(f"Move {action}: {current_state} -> {get_next_state(current_state, action)}")
    current_state = get_next_state(current_state, action)
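# A minimal sketch (added for illustration, not part of the original gist): print the greedy
# policy implied by trained_q_values as a grid of arrows, assuming the np, GRID_SIZE, and
# trained_q_values defined above. The arrow symbols are arbitrary.
ARROWS = {0: "^", 1: "v", 2: "<", 3: ">"}  # action index -> movement symbol
for i in range(GRID_SIZE):
    row = []
    for j in range(GRID_SIZE):
        if (i, j) == (GRID_SIZE - 1, GRID_SIZE - 1):
            row.append("G")  # goal cell has no outgoing action
        else:
            row.append(ARROWS[int(np.argmax(trained_q_values[i, j, :]))])
    print(" ".join(row))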

B-R-P commented Nov 26, 2023

Speculation
