Tabular Q-learning solution for the inverted pendulum (Gymnasium InvertedPendulum-v5), using discretized observation and action spaces.
import gymnasium as gym
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle
import os

env_train = gym.make('InvertedPendulum-v5')
# Define the dimensions of the discretized state space
dim1 = 5
dim2 = 60
dim3 = 5
dim4 = 5
a_dim = 20  # number of discrete actions

# Bin edges for each observation dimension and for the action (force)
position_bins = np.linspace(-1.4, 1.4, dim1)            # 5 bins for cart position
angle_bins = np.linspace(-3.4, 3.4, dim2)               # 60 bins for pole angle
velocity_bins = np.linspace(-2.3, 2.3, dim3)            # 5 bins for cart velocity
angular_velocity_bins = np.linspace(-2.0, 2.0, dim4)    # 5 bins for pole angular velocity
force_n_bins = np.linspace(-3, 3, a_dim)                # 20 bins for the force applied to the cart
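# Resulting table size: 5 * 60 * 5 * 5 = 7,500 discretized states x 20 discrete
# actions = 150,000 Q-table entries.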
def discretize_action(action):
    # Map a continuous force (the (1,)-shaped array returned by
    # action_space.sample()) to a single bin index. As in
    # discretize_observation, subtract 1 from np.digitize's 1-based index.
    f = np.asarray(action).item()
    f_index = np.digitize(f, force_n_bins) - 1
    return int(np.clip(f_index, 0, len(force_n_bins) - 1))
def discretize_observation(obs):
    # Discretize each observation component
    x, y, z, a = obs
    # Map each value to its corresponding bin index.
    # np.digitize returns a 1-based index, so subtract 1 and clip into range.
    # Position of the cart (x): bin index between 0 and dim1 - 1
    x_index = np.digitize(x, position_bins) - 1
    x_index = np.clip(x_index, 0, len(position_bins) - 1)
    # Vertical angle of the pole (y): bin index between 0 and dim2 - 1
    y_index = np.digitize(y, angle_bins) - 1
    y_index = np.clip(y_index, 0, len(angle_bins) - 1)
    # Linear velocity of the cart (z): bin index between 0 and dim3 - 1
    z_index = np.digitize(z, velocity_bins) - 1
    z_index = np.clip(z_index, 0, len(velocity_bins) - 1)
    # Angular velocity of the pole (a): bin index between 0 and dim4 - 1
    a_index = np.digitize(a, angular_velocity_bins) - 1
    a_index = np.clip(a_index, 0, len(angular_velocity_bins) - 1)
    # Return the discretized observation as a tuple of bin indices
    return (x_index, y_index, z_index, a_index)
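# Illustrative sketch (not used by the script): the manual index arithmetic in the
# loops below is a row-major flattening of the 4-D bin indices, which could
# equivalently be written with NumPy's ravel_multi_index. The helper name
# flatten_state is hypothetical, not part of the original code.
def flatten_state(state_discr):
    # Convert a tuple of bin indices into a single Q-table row index
    return int(np.ravel_multi_index(state_discr, (dim1, dim2, dim3, dim4)))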
# To resume from a previously saved table instead of starting fresh:
# file_path = 'qtable.npy'
# if os.path.exists(file_path):
#     Q = np.load(file_path)
#     print("Q-table loaded successfully.")

# Initialize the Q-table: one row per discretized state, one column per discrete action
Q = np.zeros((dim1 * dim2 * dim3 * dim4, a_dim))
# Hyperparameters
lr = 0.1        # learning rate
gamma = 0.95    # discount factor
num_episodes = 20001

# Track rewards per episode
episode_rewards = []
train = True
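# The training loop below applies the standard tabular Q-learning update
#     Q[s, a] <- Q[s, a] + lr * (r + gamma * max_a' Q[s', a'] - Q[s, a])
# behind an epsilon-greedy behaviour policy whose exploration probability
# decays linearly with the episode index (0.45 - i / 110000).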
if train:
    # Q-learning training loop
    for i in tqdm(np.arange(num_episodes), desc="Run Episodes", leave=False):
        totalrew = 0
        state = env_train.reset()[0]
        done = False
        while not done:
            state_discr = discretize_observation(state)
            # Flatten the 4-D bin indices into a single Q-table row index
            state_index = state_discr[0] * (dim2 * dim3 * dim4) + state_discr[1] * (dim3 * dim4) + state_discr[2] * dim4 + state_discr[3]
            # epsilon-greedy policy with a linearly decaying exploration rate
            if np.random.uniform(0, 1) < 0.45 - i / 110000:
                action = env_train.action_space.sample()
                next_state, reward, terminated, truncated, _ = env_train.step(action)
                action = discretize_action(action)
            else:
                action = np.argmax(Q[int(state_index), :])
                action1 = force_n_bins[action]  # convert the Q-table column index back to a force value
                next_state, reward, terminated, truncated, _ = env_train.step([action1])
            done = terminated or truncated
            next_state_disc = discretize_observation(next_state)
            next_state_index = next_state_disc[0] * (dim2 * dim3 * dim4) + next_state_disc[1] * (dim3 * dim4) + next_state_disc[2] * dim4 + next_state_disc[3]
            # Temporal-difference update
            temp_difference = reward + gamma * np.max(Q[int(next_state_index), :]) - Q[int(state_index), action]
            Q[int(state_index), action] = Q[int(state_index), action] + lr * temp_difference
            state = next_state
            totalrew += reward
        episode_rewards.append(totalrew)
    # Save the Q-table to a file
    np.save('qtable.npy', Q)
    # Plot the reward per episode
    plt.plot(range(num_episodes), episode_rewards, color='blue', linewidth=1)
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.title('Reward per Episode')
    plt.show()
env_train.close()
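# Optional sketch (not part of the original script): the per-episode reward curve
# is noisy, so a moving average could be overlaid inside the `if train:` block,
# right after the reward plot above. The window size of 100 is an arbitrary choice.
# window = 100
# smoothed = np.convolve(episode_rewards, np.ones(window) / window, mode='valid')
# plt.plot(range(window - 1, num_episodes), smoothed, color='red', linewidth=1)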
# Load the saved Q-table with NumPy (uncomment when evaluating without retraining)
# Q = np.load('qtable.npy')
env_test = gym.make('InvertedPendulum-v5', render_mode="human")
episode_rewards = []
for i in range(60):
    state = env_test.reset()[0]
    done = False
    totalreward = 0
    while not done:
        # Flatten the discretized state into a Q-table row index
        state_discr = discretize_observation(state)
        state_index = state_discr[0] * (dim2 * dim3 * dim4) + state_discr[1] * (dim3 * dim4) + state_discr[2] * dim4 + state_discr[3]
        # Greedy action from the learned Q-table
        action = np.argmax(Q[int(state_index), :])
        action = force_n_bins[action]  # convert the Q-table column index back to a force value
        next_state, reward, terminated, truncated, _ = env_test.step([action])
        done = terminated or truncated
        state = next_state
        env_test.render()
        totalreward += reward
        # print(totalreward)
    episode_rewards.append(totalreward)
env_test.close()
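# To evaluate a previously trained agent without retraining, set train = False
# above and uncomment the `Q = np.load('qtable.npy')` line before env_test is created.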