OpenAI Gym Cartpole-v0 LSTM experiment
This experiment is related to: https://gym.openai.com/envs/CartPole-v0
It's based on a Keras LSTM network (with Theano/GPU support) trained with a "semi"-reinforcement approach. In this version, the overall performance is always greater than 500 (with common episodes that can exceed 2000). However, I'm still investigating how to remove some spurious oscillations that can prevent convergence after a few episodes. (A standalone sketch of the labeling rule and a note on the Gym API version used follow the script below.)
@GiuseppeB
http://www.bonaccorso.eu
'''
OpenAI-Gym Cartpole-v0 LSTM experiment
Giuseppe Bonaccorso (http://www.bonaccorso.eu)
'''
import gym
import numpy as np
import time
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM
from keras import backend as K
# Result location
result_location = 'D://ml_data//cartpole-lstm-1'
# Number of episodes
nb_episodes = 100
# Max execution time (in seconds)
max_execution_time = 120
# Set random seed
np.random.seed(1000)
class CartPoleController(object):
    def __init__(self, n_input=4, n_hidden=10, n_output=1, initial_state=0.1, training_threshold=1.5):
        self.n_input = n_input
        self.n_hidden = n_hidden
        self.n_output = n_output
        self.initial_state = initial_state
        self.training_threshold = training_threshold
        self.step_threshold = 0.5

        # Action neural network
        # Input sequence -> (1 x n_input)
        # LSTM -> (n_hidden)
        # Dense output -> (n_output)
        self.action_model = Sequential()
        self.action_model.add(LSTM(self.n_hidden, input_shape=(1, self.n_input)))
        self.action_model.add(Activation('tanh'))
        self.action_model.add(Dense(self.n_output))
        self.action_model.add(Activation('sigmoid'))
        self.action_model.compile(loss='mse', optimizer='adam')
    def action(self, obs, prev_obs=None, prev_action=None):
        x = np.ndarray(shape=(1, 1, self.n_input)).astype(K.floatx())

        if prev_obs is not None:
            prev_norm = np.linalg.norm(prev_obs)

            if prev_norm > self.training_threshold:
                # Compute a training step: replay the previous observation and
                # train towards the opposite of the previous action
                x[0, 0, :] = prev_obs

                if prev_norm < self.step_threshold:
                    y = np.array([prev_action]).astype(K.floatx())
                else:
                    y = np.array([np.abs(prev_action - 1)]).astype(K.floatx())

                self.action_model.train_on_batch(x, y)

        # Predict new value
        x[0, 0, :] = obs
        output = self.action_model.predict(x, batch_size=1)
        return self.step(output)

    def step(self, value):
        if value > self.step_threshold:
            return int(1)
        else:
            return int(0)
if __name__ == '__main__':
    print('OpenAI-Gym CartPole-v0 LSTM experiment')

    env = gym.make('CartPole-v0')
    env.monitor.start(result_location, force=True)

    cart_pole_controller = CartPoleController()

    total_reward = []

    for episode in range(nb_episodes):
        # Reset environment
        observation = env.reset()
        previous_observation = observation

        action = cart_pole_controller.action(observation)
        previous_action = action

        done = False
        t = 0
        partial_reward = 0.0

        start_time = time.time()
        elapsed_time = 0

        while not done and elapsed_time < max_execution_time:
            t += 1
            elapsed_time = time.time() - start_time

            env.render()
            observation, reward, done, info = env.step(action)
            partial_reward += reward

            action = cart_pole_controller.action(observation, previous_observation, previous_action)
            previous_observation = observation
            previous_action = action

        print('Episode %d finished after %d timesteps. Total reward: %1.0f. Elapsed time: %d s' %
              (episode + 1, t + 1, partial_reward, elapsed_time))
        total_reward.append(partial_reward)

    env.monitor.close()

    total_reward = np.array(total_reward)
    print('Average reward: %3.2f' % np.mean(total_reward))
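
The "semi"-reinforcement labeling rule inside CartPoleController.action() can be read on its own: a training step is performed only when the norm of the previous observation exceeds training_threshold, and the target is then the previous action flipped (with the default thresholds, 1.5 and 0.5, the inner prev_norm < step_threshold branch can never fire). A minimal NumPy-only sketch of that rule, with make_target being a name of my own and not part of the script:

import numpy as np

def make_target(prev_obs, prev_action, training_threshold=1.5, step_threshold=0.5):
    """Return (needs_training, target) following the rule in CartPoleController.action()."""
    prev_norm = np.linalg.norm(prev_obs)
    if prev_norm <= training_threshold:
        return False, None                      # state still "safe": no training step
    if prev_norm < step_threshold:
        return True, prev_action                # unreachable with the default thresholds
    return True, int(abs(prev_action - 1))      # otherwise: train towards the opposite action

# Example: a drifting state (norm > 1.5) after action 1 yields target 0
print(make_target(np.array([2.0, 0.3, 0.1, 0.0]), 1))   # (True, 0)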
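
A note on dependencies: the script targets the early OpenAI Gym API, where env.monitor.start()/env.monitor.close() lived on the environment itself, together with Keras on the Theano backend as stated above. In later Gym releases that removed env.monitor but still shipped the Monitor wrapper, the monitoring calls would, as far as I can tell, be replaced along these lines (result_location is the same directory used in the script):

import gym
from gym import wrappers

result_location = 'D://ml_data//cartpole-lstm-1'

env = gym.make('CartPole-v0')
# Wrapping replaces env.monitor.start(result_location, force=True)
env = wrappers.Monitor(env, result_location, force=True)

# ... run the episodes exactly as in the script ...

env.close()  # replaces env.monitor.close()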