B-R-P/q-star.py

## q-star.py
import numpy as np

# Define constants for the grid world
GRID_SIZE = 5
NUM_ACTIONS = 4
NUM_EPISODES = 500
MAX_STEPS_PER_EPISODE = 50

# Define states (representing positions on the grid)
STATES = [(i, j) for i in range(GRID_SIZE) for j in range(GRID_SIZE)]

# Define actions (representing possible movements: up, down, left, right)
ACTIONS = [0, 1, 2, 3]  # 0: up, 1: down, 2: left, 3: right

# Initialize Q-values for each state-action pair
initial_q_values = np.zeros((GRID_SIZE, GRID_SIZE, NUM_ACTIONS))

# Function to get the reward for a given state-action pair
def get_reward(state, action):
    next_state = get_next_state(state, action)
    # Define a positive reward if the agent reaches the goal, negative for obstacles, and zero otherwise
    if next_state == (GRID_SIZE - 1, GRID_SIZE - 1):  # Goal state
        return 10
    elif next_state in [(1, 1), (2, 2), (3, 3)]:  # Obstacle states
        return -5
    else:
        return 0

# Function to get the next state for a given state-action pair
def get_next_state(state, action):
    i, j = state
    if action == 0:  # up
        return max(i - 1, 0), j
    elif action == 1:  # down
        return min(i + 1, GRID_SIZE - 1), j
    elif action == 2:  # left
        return i, max(j - 1, 0)
    elif action == 3:  # right
        return i, min(j + 1, GRID_SIZE - 1)

# Function to check if a state is a terminal state
def is_terminal_state(state):
    return state == (GRID_SIZE - 1, GRID_SIZE - 1)  # Goal state

# Function to check if the maximum number of steps is reached
def max_steps_reached():
    global step_count
    step_count += 1
    return step_count >= MAX_STEPS_PER_EPISODE

# Q-star Reinforcement Learning
def q_star_reinforcement_learning(S, A, initial_q_values, alpha, gamma, epsilon, max_episodes):
    Q = np.copy(initial_q_values)
    for episode in range(1, max_episodes + 1):
        global step_count
        step_count = 0  # Reset step count for each episode
        current_state = (0, 0)  # Start from the top-left corner
        while True:
            if np.random.rand() < epsilon:
                current_action = np.random.choice(A)
            else:
                current_action = np.argmax(Q[current_state[0], current_state[1], :] + np.random.randn(1, NUM_ACTIONS) * (1.0 / (episode + 1)))
            reward = get_reward(current_state, current_action)
            next_state = get_next_state(current_state, current_action)
            Q[current_state[0], current_state[1], current_action] += alpha * (
                reward + gamma * np.max(Q[next_state[0], next_state[1], :]) - Q[current_state[0], current_state[1], current_action]
            )
            current_state = next_state
            if is_terminal_state(current_state) or max_steps_reached():
                break

    return Q  # Return the trained Q-values

# Main program
step_count = 0  # Reset step count for each episode
trained_q_values = q_star_reinforcement_learning(STATES, ACTIONS, initial_q_values, alpha=0.1, gamma=0.9, epsilon=0.2, max_episodes=NUM_EPISODES)

# Use the trained Q-values to navigate through the grid world
current_state = (0, 0)
while not is_terminal_state(current_state):
    action = np.argmax(trained_q_values[current_state[0], current_state[1], :])
    print(f"Move {action}: {current_state} -> {get_next_state(current_state, action)}")
    current_state = get_next_state(current_state, action)
	import numpy as np

	# Define constants for the grid world
	GRID_SIZE = 5
	NUM_ACTIONS = 4
	NUM_EPISODES = 500
	MAX_STEPS_PER_EPISODE = 50

	# Define states (representing positions on the grid)
	STATES = [(i, j) for i in range(GRID_SIZE) for j in range(GRID_SIZE)]

	# Define actions (representing possible movements: up, down, left, right)
	ACTIONS = [0, 1, 2, 3] # 0: up, 1: down, 2: left, 3: right

	# Initialize Q-values for each state-action pair
	initial_q_values = np.zeros((GRID_SIZE, GRID_SIZE, NUM_ACTIONS))

	# Function to get the reward for a given state-action pair
	def get_reward(state, action):
	next_state = get_next_state(state, action)
	# Define a positive reward if the agent reaches the goal, negative for obstacles, and zero otherwise
	if next_state == (GRID_SIZE - 1, GRID_SIZE - 1): # Goal state
	return 10
	elif next_state in [(1, 1), (2, 2), (3, 3)]: # Obstacle states
	return -5
	else:
	return 0

	# Function to get the next state for a given state-action pair
	def get_next_state(state, action):
	i, j = state
	if action == 0: # up
	return max(i - 1, 0), j
	elif action == 1: # down
	return min(i + 1, GRID_SIZE - 1), j
	elif action == 2: # left
	return i, max(j - 1, 0)
	elif action == 3: # right
	return i, min(j + 1, GRID_SIZE - 1)

	# Function to check if a state is a terminal state
	def is_terminal_state(state):
	return state == (GRID_SIZE - 1, GRID_SIZE - 1) # Goal state

	# Function to check if the maximum number of steps is reached
	def max_steps_reached():
	global step_count
	step_count += 1
	return step_count >= MAX_STEPS_PER_EPISODE

	# Q-star Reinforcement Learning
	def q_star_reinforcement_learning(S, A, initial_q_values, alpha, gamma, epsilon, max_episodes):
	Q = np.copy(initial_q_values)
	for episode in range(1, max_episodes + 1):
	global step_count
	step_count = 0 # Reset step count for each episode
	current_state = (0, 0) # Start from the top-left corner
	while True:
	if np.random.rand() < epsilon:
	current_action = np.random.choice(A)
	else:
	current_action = np.argmax(Q[current_state[0], current_state[1], :] + np.random.randn(1, NUM_ACTIONS) * (1.0 / (episode + 1)))
	reward = get_reward(current_state, current_action)
	next_state = get_next_state(current_state, current_action)
	Q[current_state[0], current_state[1], current_action] += alpha * (
	reward + gamma * np.max(Q[next_state[0], next_state[1], :]) - Q[current_state[0], current_state[1], current_action]
	)
	current_state = next_state
	if is_terminal_state(current_state) or max_steps_reached():
	break

	return Q # Return the trained Q-values

	# Main program
	step_count = 0 # Reset step count for each episode
	trained_q_values = q_star_reinforcement_learning(STATES, ACTIONS, initial_q_values, alpha=0.1, gamma=0.9, epsilon=0.2, max_episodes=NUM_EPISODES)

	# Use the trained Q-values to navigate through the grid world
	current_state = (0, 0)
	while not is_terminal_state(current_state):
	action = np.argmax(trained_q_values[current_state[0], current_state[1], :])
	print(f"Move {action}: {current_state} -> {get_next_state(current_state, action)}")
	current_state = get_next_state(current_state, action)