@gkhayes
Created February 22, 2019 08:09
Use Q-learning to solve the OpenAI Gym Mountain Car problem
import numpy as np
import gym
import matplotlib.pyplot as plt

# Import and initialize Mountain Car Environment
env = gym.make('MountainCar-v0')
env.reset()

# Define Q-learning function
def QLearning(env, learning, discount, epsilon, min_eps, episodes):
    # Determine size of discretized state space
    num_states = (env.observation_space.high - env.observation_space.low) * \
                 np.array([10, 100])
    num_states = np.round(num_states, 0).astype(int) + 1

    # Initialize Q table
    Q = np.random.uniform(low=-1, high=1,
                          size=(num_states[0], num_states[1], env.action_space.n))

    # Initialize variables to track rewards
    reward_list = []
    ave_reward_list = []

    # Calculate episodic reduction in epsilon
    reduction = (epsilon - min_eps) / episodes

    # Run Q-learning algorithm
    for i in range(episodes):
        # Initialize parameters
        done = False
        tot_reward, reward = 0, 0
        state = env.reset()

        # Discretize state
        state_adj = (state - env.observation_space.low) * np.array([10, 100])
        state_adj = np.round(state_adj, 0).astype(int)

        while not done:
            # Render environment for the last twenty episodes
            if i >= (episodes - 20):
                env.render()

            # Determine next action - epsilon-greedy strategy
            if np.random.random() < 1 - epsilon:
                action = np.argmax(Q[state_adj[0], state_adj[1]])
            else:
                action = np.random.randint(0, env.action_space.n)

            # Get next state and reward
            state2, reward, done, info = env.step(action)

            # Discretize state2
            state2_adj = (state2 - env.observation_space.low) * np.array([10, 100])
            state2_adj = np.round(state2_adj, 0).astype(int)

            # Allow for terminal states
            if done and state2[0] >= 0.5:
                Q[state_adj[0], state_adj[1], action] = reward
            # Adjust Q value for current state
            else:
                delta = learning * (reward +
                                    discount * np.max(Q[state2_adj[0], state2_adj[1]]) -
                                    Q[state_adj[0], state_adj[1], action])
                Q[state_adj[0], state_adj[1], action] += delta

            # Update variables
            tot_reward += reward
            state_adj = state2_adj

        # Decay epsilon
        if epsilon > min_eps:
            epsilon -= reduction

        # Track rewards
        reward_list.append(tot_reward)

        if (i + 1) % 100 == 0:
            ave_reward = np.mean(reward_list)
            ave_reward_list.append(ave_reward)
            reward_list = []
            print('Episode {} Average Reward: {}'.format(i + 1, ave_reward))

    env.close()

    return ave_reward_list

# Run Q-learning algorithm
rewards = QLearning(env, 0.2, 0.9, 0.8, 0, 5000)

# Plot rewards
plt.plot(100 * (np.arange(len(rewards)) + 1), rewards)
plt.xlabel('Episodes')
plt.ylabel('Average Reward')
plt.title('Average Reward vs Episodes')
plt.savefig('rewards.jpg')
plt.close()
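
As a sanity check on the discretization above: MountainCar-v0's documented observation bounds are position in [-1.2, 0.6] and velocity in [-0.07, 0.07], so the [10, 100] scaling yields a 19 x 15 grid. A standalone sketch of that arithmetic (not part of the original gist):

import numpy as np

low, high = np.array([-1.2, -0.07]), np.array([0.6, 0.07])
num_states = np.round((high - low) * np.array([10, 100]), 0).astype(int) + 1
print(num_states)  # [19 15] -> Q has shape (19, 15, 3): 285 states x 3 actions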
@myccmj commented Dec 1, 2022

TypeError: unsupported operand type(s) for -: 'dict' and 'float'
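
This TypeError usually means the gist is being run on gym >= 0.26 (or gymnasium), where env.reset() returns an (observation, info) tuple instead of a bare observation; the info dict then ends up inside the subtraction state - env.observation_space.low. A minimal, version-tolerant patch for the reset calls (a sketch, assuming one of those newer releases):

reset_result = env.reset()
# gym >= 0.26 / gymnasium return (obs, info); older gym returns obs directly
state = reset_result[0] if isinstance(reset_result, tuple) else reset_result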

@luckyluckycool commented May 7, 2023

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

in line 36:
state_adj = (state - env.observation_space.low)*np.array([10, 100])
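
This ValueError has the same root cause as the TypeError above: on gym >= 0.26 / gymnasium, env.reset() returns an (obs, info) tuple, and newer NumPy refuses to build an array from that inhomogeneous pair. Note that env.step() changed alongside it, returning five values instead of four. A sketch of the two call sites that need updating under the newer API:

state, info = env.reset()
state2, reward, terminated, truncated, info = env.step(action)
done = terminated or truncated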

@SeeenyaOhar commented Apr 11, 2024


# %%
import gymnasium as gym
import numpy as np
import imageio

env = gym.make("MountainCar-v0")
env.reset()

LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 1200


DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)

discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE
q_table = np.random.uniform(low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))


def get_discrete_state(state):
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    return tuple(discrete_state.astype(np.int32))


for episode in range(EPISODES):
    discrete_state = get_discrete_state(env.reset()[0])
    done = False
    while not done:
        action = np.argmax(q_table[discrete_state])
        # gymnasium's step() returns (obs, reward, terminated, truncated, info)
        new_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        new_discrete_state = get_discrete_state(new_state)

        if not done:
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[discrete_state + (action, )]

            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[discrete_state + (action, )] = new_q
        elif new_state[0] >= env.unwrapped.goal_position:
            print(f"Congratulations! We reached the goal! Episode: {episode}")
            q_table[discrete_state + (action, )] = 0

        discrete_state = new_discrete_state

env.close()
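
The update written here in convex-combination form is the same tabular Q-learning rule as the delta form in the gist above; expanding the product shows the two are algebraically identical. A quick check with made-up numbers:

# %%
# (1 - lr)*q + lr*(r + gamma*max_q)  ==  q + lr*(r + gamma*max_q - q)
lr, gamma = 0.1, 0.95
q, r, max_q = -0.5, -1.0, -0.3
assert abs((1 - lr) * q + lr * (r + gamma * max_q)
           - (q + lr * (r + gamma * max_q - q))) < 1e-12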

# %%
env = gym.make("MountainCar-v0", render_mode='human')

state, _ = env.reset()

DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE

done = False
while not done:
    # Follow the greedy policy using the observation returned by step(),
    # rather than reading env.state through the wrapper stack
    discrete_state = get_discrete_state(state)
    action = np.argmax(q_table[discrete_state])
    state, _, terminated, truncated, _ = env.step(action)
    done = terminated or truncated

env.close()
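
Since imageio is imported in the first cell but never used, here is one way the greedy rollout could be recorded to a GIF instead of rendered live. A sketch, reusing the q_table and get_discrete_state defined above and assuming gymnasium's rgb_array render mode:

# %%
env = gym.make("MountainCar-v0", render_mode="rgb_array")
state, _ = env.reset()
frames = []
done = False
while not done:
    frames.append(env.render())  # in rgb_array mode, render() returns the frame
    action = np.argmax(q_table[get_discrete_state(state)])
    state, _, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
env.close()
imageio.mimsave("mountain_car.gif", frames, fps=30)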




