Q-learning solution for the inverted pendulum (Gymnasium InvertedPendulum-v5)
import gymnasium as gym
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import os

env_train = gym.make('InvertedPendulum-v5')
# Define the dimensions of the discretized state space
dim1 = 5    # cart position bins
dim2 = 60   # pole angle bins
dim3 = 5    # cart velocity bins
dim4 = 5    # pole angular velocity bins
a_dim = 20  # action (force) bins
# Bin edges
position_bins = np.linspace(-1.4, 1.4, dim1)           # 5 bins for position of cart
angle_bins = np.linspace(-3.4, 3.4, dim2)              # 60 bins for angle of pole
velocity_bins = np.linspace(-2.3, 2.3, dim3)           # 5 bins for velocity of cart
angular_velocity_bins = np.linspace(-2.0, 2.0, dim4)   # 5 bins for angular velocity of pole
force_n_bins = np.linspace(-3, 3, a_dim)               # 20 bins for force applied by the motor
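# Observations outside these ranges are clipped into the first/last bin by the
# discretization helpers below, so extreme states still map to a valid index.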
def discretize_action(action):
    # Map a continuous force value to a 0-based bin index, using the same
    # convention as the observation discretization below.
    f = np.asarray(action).reshape(-1)[0]
    f_index = np.digitize(f, force_n_bins) - 1           # np.digitize returns a 1-based index
    f_index = int(np.clip(f_index, 0, len(force_n_bins) - 1))
    return f_index
def discretize_observation(obs):
    # Discretize each observation component
    x, y, z, a = obs   # cart position, pole angle, cart velocity, pole angular velocity
    # Map each value to its corresponding bin index
    # Cart position (x): bin index between 0 and dim1 - 1
    x_index = np.digitize(x, position_bins) - 1   # subtract 1 because np.digitize returns a 1-based index
    x_index = np.clip(x_index, 0, len(position_bins) - 1)   # keep the index within range
    # Pole angle (y): bin index between 0 and dim2 - 1
    y_index = np.digitize(y, angle_bins) - 1
    y_index = np.clip(y_index, 0, len(angle_bins) - 1)
    # Cart velocity (z): bin index between 0 and dim3 - 1
    z_index = np.digitize(z, velocity_bins) - 1
    z_index = np.clip(z_index, 0, len(velocity_bins) - 1)
    # Pole angular velocity (a): bin index between 0 and dim4 - 1
    a_index = np.digitize(a, angular_velocity_bins) - 1
    a_index = np.clip(a_index, 0, len(angular_velocity_bins) - 1)
    # Return the discretized observation as a tuple of bin indices
    return (x_index, y_index, z_index, a_index)
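# Example: an upright pole (angle ~ 0) falls into angle bin index 29, roughly
# the middle of the 60 angle bins; the other components map the same way via
# np.digitize on their respective edges.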
# file_path = 'qtable.npy'
# # Check if the file exists
# if os.path.exists(file_path):
#     # Load a previously saved Q-table
#     Q = np.load(file_path)
#     print("Q-table loaded successfully.")
# else:
#     # initialize the Q-table from scratch
Q = np.zeros((dim1 * dim2 * dim3 * dim4, a_dim))
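# The table has 5 * 60 * 5 * 5 = 7500 rows (one per flattened discrete state)
# and 20 columns (one per discrete force), i.e. 150,000 Q-values.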
# Hyperparameters
lr = 0.1        # learning rate
gamma = 0.95    # discount factor
num_episodes = 20001
# Track rewards per episode
episode_rewards = []
train = True
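# The training loop below uses an epsilon-greedy policy with a linearly
# decaying exploration rate: 0.45 - i / 110000, i.e. ~0.45 at the start and
# ~0.27 after 20,000 episodes.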
if train:
    # Tabular Q-learning
    for i in tqdm(
        np.arange(num_episodes), desc="Run Episodes", leave=False
    ):
        totalrew = 0
        state = env_train.reset()[0]
        done = False
        while not done:
            state_discr = discretize_observation(state)
            # Flatten the 4-D bin indices into a single row index of Q
            state_index = state_discr[0] * (dim2 * dim3 * dim4) + state_discr[1] * (dim3 * dim4) + state_discr[2] * dim4 + state_discr[3]
            # Epsilon-greedy policy
            if np.random.uniform(0, 1) < 0.45 - i / 110000:
                # Explore: sample a continuous force, then discretize it for the Q update
                action_cont = env_train.action_space.sample()
                next_state, reward, terminated, truncated, _ = env_train.step(action_cont)
                action = discretize_action(action_cont)
            else:
                # Exploit: pick the best discrete action and convert it back to a force
                action = np.argmax(Q[int(state_index), :])
                action1 = force_n_bins[action]   # bin index from the Q-table -> continuous force
                next_state, reward, terminated, truncated, _ = env_train.step([action1])
            done = terminated or truncated
            next_state_disc = discretize_observation(next_state)
            next_state_index = next_state_disc[0] * (dim2 * dim3 * dim4) + next_state_disc[1] * (dim3 * dim4) + next_state_disc[2] * dim4 + next_state_disc[3]
            # Q-learning temporal-difference update:
            # Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
            temp_difference = reward + gamma * np.max(Q[int(next_state_index), :]) - Q[int(state_index), action]
            Q[int(state_index), action] = Q[int(state_index), action] + lr * temp_difference
            state = next_state
            # state_discr = next_state_disc
            totalrew += reward
        episode_rewards.append(totalrew)
    # Save the learned Q-table to a file
    np.save('qtable.npy', Q)
    # Plot the reward per episode
    plt.plot(range(num_episodes), episode_rewards, color='blue', linewidth=1)
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.title('Reward per Episode')
    plt.show()
env_train.close()
# Load a previously saved Q-table with NumPy if needed
# Q = np.load('qtable.npy')
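# Evaluation: run the greedy policy (no exploration) for 60 rendered episodes.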
env_test = gym.make('InvertedPendulum-v5', render_mode="human")
episode_rewards = []
for i in range(60):
    state = env_test.reset()[0]
    done = False
    totalreward = 0
    while not done:
        # Flatten the discretized state into a row index of Q
        state_discr = discretize_observation(state)
        state_index = state_discr[0] * (dim2 * dim3 * dim4) + state_discr[1] * (dim3 * dim4) + state_discr[2] * dim4 + state_discr[3]
        action = np.argmax(Q[int(state_index), :])
        action = force_n_bins[action]   # bin index from the Q-table -> continuous force
        next_state, reward, terminated, truncated, _ = env_test.step([action])
        done = terminated or truncated
        state = next_state
        env_test.render()
        totalreward += reward
        # print(totalreward)
    episode_rewards.append(totalreward)
env_test.close()