# ==============================================================================
# Title: Taxi-V3 OpenAI Gym Environment Update
# Author: Taresin
# Date: 26/02/2024
#
# Description: This code updates a previously existing Gist for OpenAI Gym
# environments, fixing its functionality for compatibility with the new
# Gymnasium library. The update is based on the original work found in Aske Plaat's
# book "Deep Reinforcement Learning" and the original code by Geffy available at
# https://gist.github.com/geffy/b2d16d01cbca1ae9e13f11f678fa96fd
#
# Blog Explanation: https://nibblenibblebyte.notion.site/Week-1-Value-Iteration-83f93ea6f75247d4ae459e7582596c90
# This blog post explains the workings of the updated code.
#
# License: MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# ==============================================================================
import gymnasium as gym
import numpy as np

# Parameters
n_rounds = 5
steps_limit = 1000

# Hyperparameters
gamma = 0.9
convergence_threshold = 0.1
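
# gamma is the discount factor: how much future reward counts relative to
# immediate reward. convergence_threshold is the largest per-state change in
# value we still accept as "converged".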

# This is one iteration of the value iteration function
def iterate_value_function(state_values_current, gamma, env):
    ns = env.observation_space.n  # number of states
    na = env.action_space.n  # number of actions
    state_values_new = np.zeros(ns)
    for state_id in range(ns):
        # Remember the value of each action for this state.
        # We will pick the highest value.
        action_values = np.zeros(na)
        for action in range(na):
            # env.unwrapped.P gives us a peek into what the result of the action gives us.
            # For Taxi-v3 this will always be a single next state.
            # Other problems might have several possible next states if there's randomness,
            # so we sum over them, weighted by their probabilities.
            for (prob, dst_state, reward, is_final) in env.unwrapped.P[state_id][action]:
                action_values[action] += prob * (reward + gamma * state_values_current[dst_state] * (not is_final))
        state_values_new[state_id] = max(action_values)
    return state_values_new
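
# For reference, the update above is the Bellman optimality backup:
#     V_{k+1}(s) = max_a sum_{s'} P(s' | s, a) * (r(s, a, s') + gamma * V_k(s'))
# where the value of a terminal successor state is treated as 0 (that is what
# the `(not is_final)` factor does).
# Each entry of env.unwrapped.P[state][action] is a tuple of the form
# (probability, next_state, reward, done).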

# Once the state values are finalised, create the policy based on the highest value action.
# This seems redundant since it can be done in the iteration, but I'm keeping the same
# architecture as the original.
def build_greedy_policy(state_values, gamma, env):
    ns = env.observation_space.n  # number of states
    na = env.action_space.n  # number of actions
    policy = np.zeros(ns)
    for state_id in range(ns):
        # Remember the value of each action for this state
        action_values = np.zeros(na)
        for action in range(na):
            for (prob, dst_state, reward, is_final) in env.unwrapped.P[state_id][action]:
                action_values[action] += prob * (reward + gamma * state_values[dst_state] * (not is_final))
        # argmax returns the index of the maximum value, i.e. the highest-value action
        # https://numpy.org/doc/stable/reference/generated/numpy.argmax.html
        policy[state_id] = np.argmax(action_values)
    # Actions are whole numbers, so we don't want the policy returned as floats
    return policy.astype(int)
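
# The resulting policy maps each state index to an action index. For Taxi-v3
# (per the Gymnasium documentation) the actions are:
#     0 = move south, 1 = move north, 2 = move east, 3 = move west,
#     4 = pick up passenger, 5 = drop off passenger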

# Solve for state values and policy
env = gym.make("Taxi-v3", render_mode='human')
ns = env.observation_space.n
na = env.action_space.n

# List of all state values (the expected discounted return obtainable from each state).
# Initialise with zeroes to start with; we update these on each iteration.
v = np.zeros(ns)
should_iterate = True
while should_iterate:
    # Use a copy to check for when convergence occurs
    v_old = v.copy()
    v = iterate_value_function(v, gamma, env)
    should_iterate = not np.all(abs(v - v_old) < convergence_threshold)
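
# The loop stops once the largest per-state change falls below the threshold,
# i.e. max over s of |V_new(s) - V_old(s)| < convergence_threshold.
# A smaller threshold gives a more accurate value function at the cost of more sweeps.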
print(f'State Values:\n{v}')
policy = build_greedy_policy(v, gamma, env)
print(f'Policy:\n{policy}')

# Run the problem using our policy
cum_reward = 0  # Running total across all rounds, used to report the average reward
for round_count in range(n_rounds):
    state, info = env.reset()
    total_reward = 0
    for step in range(1, steps_limit + 1):
        # Get action from the calculated policy
        action = policy[state]
        # Apply action to the environment
        state, reward, terminated, truncated, info = env.step(action)
        # Calculate total reward
        total_reward += reward
        # Provide feedback on termination
        if terminated:
            print(f'Terminated in {step} steps with reward: {reward}')
            break
        if step == steps_limit:
            print("Did not terminate")
    # Add this round's return to the running total
    cum_reward += total_reward
    # Every 50 rounds, print out the current average reward
    if round_count % 50 == 0 and round_count > 0:
        average_reward = cum_reward * 1.0 / (round_count + 1)
        print(f'Average reward: {average_reward}')

env.close()
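
# Note: render_mode='human' animates every step, which is slow. For a quicker
# evaluation run, an environment created with plain gym.make("Taxi-v3")
# (no render_mode) works with the same loop.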