Last active December 7, 2016 18:05
OpenAI gym

This is an attempt to create a generic (hence the relatively long code) agent for different OpenAI Gym environments. The model is based on Q-learning with experience replay. The collected Q-values are approximated by a neural network (TensorFlow). The action with the maximum Q-value for the given state is selected. The exploration rate starts at 0.6 and is quickly annealed to the standard value of 0.1. The neural network used for the CartPole environment is quite simple, with one ReLU hidden layer and a linear activation on the output layer. The model is loosely based on the excellent tutorial written by Tambet Matiisen on his blog.

The main challenge I experienced when adapting this agent to the CartPole environment was selecting the proper reward model. The default reward of +1 for every moment the pole was upright was not very successful. Instead, I assigned a reward of 0 for every moment the pole was upright and penalized (-1.0) the final move before the episode was lost. A reward of +1.0 was assigned to the final move if the agent achieved the winning score of 200 in the episode.

import gym
import re
import tensorflow as tf
import numpy as np
import shutil
class ExperienceQModel(object):
    """Q-learning agent with experience replay and a neural-network Q-function.

    The network maps a state vector to one Q-value per action (one ReLU hidden
    layer followed by a linear output layer).  Transitions are stored in a
    bounded replay memory and the network is fitted on random mini-batches of
    that memory after every environment step.

    NOTE(review): the file this class was recovered from had lost its
    indentation and a number of statements.  Lines marked "reconstructed"
    below were rebuilt from the surrounding code and comments; confirm them
    against the original source if it is available.
    """

    # Number of most recent episode scores averaged into ``game_score``.
    _SCORE_WINDOW = 100
    # Training is skipped once the rolling average score reaches this value.
    _SOLVED_SCORE = 195

    def __init__(self, env, log_dir, monitor_file=None, max_memory=10000, discount=.9, n_episodes=300,
                 n_steps=200, batch_size=100, learning_rate=0.01, dropout_keep_prob=1.0,
                 exploration=lambda x: 0.1, stop_training=10):
        """Create the environment, the replay memory and the TensorFlow graph.

        Args:
            env: Gym environment id, passed to ``gym.make``.
            log_dir: directory for TensorBoard summaries (written to
                ``log_dir + '/train'``).  Any existing content is removed.
            monitor_file: optional directory for the Gym monitor; when falsy
                no monitoring is started.
            max_memory: maximum number of transitions kept in replay memory.
            discount: discount factor applied to the next state's best Q-value.
            n_episodes: number of episodes to run in ``tf_train_model``.
            n_steps: maximum steps per episode; reaching the last step counts
                as a win.
            batch_size: maximum number of transitions sampled per update.
            learning_rate: learning rate of the Adam optimizer.
            dropout_keep_prob: keep probability fed to the dropout layer.
            exploration: callable mapping the episode index to epsilon.
            stop_training: number of consecutive wins after which exploration
                is switched off.
        """
        # Memory replay parameters
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount
        # episode scores
        self.game_scores = list()
        self.game_score = 0.
        # exploration
        self.eps = exploration  # epsilon-greedy as function of epoch
        # environment parameters
        self.env = gym.make(env)
        self.monitor_file = monitor_file
        self.n_states = self.env.observation_space.shape[0]
        action_space = self.env.action_space
        if hasattr(action_space, 'n'):
            # Discrete spaces expose their size directly.
            self.n_actions = int(action_space.n)
        else:
            # Fallback kept from the original code: first integer found in the
            # space's string representation.
            self.n_actions = int(re.findall(r'\d+', str(action_space))[0])
        # training parameters
        self.learning_rate = learning_rate
        self.n_episodes = n_episodes
        self.n_steps = n_steps  # must be equal to episode length
        self.batch_size = batch_size
        self.stop_training = stop_training  # stop exploring after this many consecutive wins
        self.consec_wins = 0  # current number of consecutive wins
        self.global_step = 0  # number of training examples seen so far
        # Neural Network Parameters
        self.n_hidden_1 = self.n_states
        # Initialize tensorflow parameters
        self.x = tf.placeholder(tf.float32, [None, self.n_states], name='states')
        self.y = tf.placeholder(tf.float32, [None, self.n_actions], name='qvals')
        self.keep_prob = dropout_keep_prob
        self.dropout = tf.placeholder(tf.float32, name='dropout')
        # Tensorboard directory - try to clean if exists
        self.log_dir = log_dir
        # reconstructed: best-effort removal of a previous run's summaries
        shutil.rmtree(self.log_dir, ignore_errors=True)
        # define graph
        # reconstructed: nothing else in the file builds the graph/session
        self.tf_define_model()

    # update game score
    def update_game_score(self, episode_score):
        """Record ``episode_score`` and refresh the rolling mean ``game_score``.

        Only the most recent ``_SCORE_WINDOW`` scores are kept.
        """
        self.game_scores.append(episode_score)
        if len(self.game_scores) > self._SCORE_WINDOW:
            del self.game_scores[0]
        self.game_score = np.mean(self.game_scores)

    # process reward
    def exp_process_reward(self, ts, reward, endgame):
        """Replace the environment reward with the agent's own reward model.

        Returns -1.0 when the episode ended at step ``ts`` (``endgame`` true),
        +1.0 when the last allowed step was reached without the episode
        ending, and 0.0 for every other step.  The incoming ``reward`` is
        ignored.
        """
        if ts <= self.n_steps - 1 and endgame:
            return -1.
        if ts == self.n_steps - 1 and not endgame:
            return 1.
        return 0.

    # saving to memory
    def exp_save_to_memory(self, states):
        """Append a copy of the transition dict and drop the oldest on overflow.

        A copy is stored because the training loop reuses and mutates the same
        dict on every step.
        """
        self.memory.append(dict(states))
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    # getting batch of the memory
    def exp_get_batch(self):
        """Sample a training batch from replay memory.

        Returns:
            ``(inputs, targets)`` with shapes ``(n, n_states)`` and
            ``(n, n_actions)`` where ``n = min(len(memory), batch_size)``.
            Targets equal the network's current predictions, except for the
            action actually taken, which is set to ``reward`` for terminal
            transitions and ``reward + discount * max_a Q(state_tp1, a)``
            otherwise.

        Raises:
            ValueError: if the replay memory is empty.
        """
        len_memory = len(self.memory)
        if len_memory == 0:
            raise ValueError("cannot sample a batch from an empty replay memory")
        n_examples = min(len_memory, self.batch_size)
        inputs = np.zeros((n_examples, self.n_states))
        next_states = np.zeros((n_examples, self.n_states))
        actions = np.zeros(n_examples, dtype=int)
        rewards = np.zeros(n_examples)
        terminal = np.zeros(n_examples, dtype=bool)
        for i, idx in enumerate(np.random.randint(0, len_memory, size=n_examples)):
            transition = self.memory[idx]
            inputs[i] = np.asarray(transition['state_t'], dtype=np.float32)
            next_states[i] = np.asarray(transition['state_tp1'], dtype=np.float32)
            actions[i] = transition['action']
            rewards[i] = transition['reward']
            terminal[i] = bool(transition['endgame'])
        # targets - not correcting those which are not taken, use prediction.
        # The whole batch is predicted in one call instead of one per sample.
        # NOTE(review): as in the original, the training keep probability is
        # also fed when predicting.
        feed_dict = {self.x: inputs, self.dropout: self.keep_prob}
        targets = np.array(self.session.run(self.predictor, feed_dict=feed_dict), dtype=np.float64)
        feed_dict = {self.x: next_states, self.dropout: self.keep_prob}
        q_next = np.max(np.asarray(self.session.run(self.predictor, feed_dict=feed_dict)), axis=1)
        # terminal transitions get the bare reward, the rest Bellman's equation
        taken = rewards + np.where(terminal, 0., self.discount * q_next)
        targets[np.arange(n_examples), actions] = taken
        return inputs, targets

    # aux to define a weight variable
    def tf_weight_variable(self, shape):
        """Return a variable of ``shape`` drawn from a truncated normal (stddev 0.1)."""
        initial = tf.truncated_normal(shape, stddev=0.1, dtype=tf.float32)
        return tf.Variable(initial)

    # aux to define a bias
    def tf_bias_variable(self, shape):
        """Return a variable of ``shape`` filled with the constant 0.1."""
        initial = tf.constant(.1, shape=shape, dtype=tf.float32)
        return tf.Variable(initial)

    # aux to attach many summaries
    def tf_variable_summaries(self, var, name):
        """Attach mean, stddev, max, min and histogram summaries to ``var``."""
        with tf.name_scope('summaries'):
            mean = tf.reduce_mean(var)
            tf.scalar_summary('mean/' + name, mean)
            with tf.name_scope('stddev'):
                # mean (not sum) of squared deviations gives the standard deviation
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            # tag spelling kept as in the original so existing logs stay comparable
            tf.scalar_summary('sttdev/' + name, stddev)
            tf.scalar_summary('max/' + name, tf.reduce_max(var))
            tf.scalar_summary('min/' + name, tf.reduce_min(var))
            tf.histogram_summary(name, var)

    # Aux function to define layers
    def tf_nn_layer(self, input_tensor, input_dim, output_dim, layer_name, act=None):
        """Build a fully connected layer ``act(input_tensor @ W + b)`` with summaries.

        Args:
            input_tensor: tensor whose last dimension is ``input_dim``.
            input_dim: number of input features.
            output_dim: number of output units.
            layer_name: name scope and summary-tag prefix for the layer.
            act: activation called as ``act(tensor, name)``; ``None`` selects
                ``tf.nn.relu`` (the previous default), resolved at call time.

        Returns:
            The activation tensor of the layer.
        """
        if act is None:
            act = tf.nn.relu
        with tf.name_scope(layer_name):
            with tf.name_scope('inputs'):
                self.tf_variable_summaries(input_tensor, layer_name + '/input')
            with tf.name_scope('weights'):
                weights = self.tf_weight_variable([input_dim, output_dim])
                self.tf_variable_summaries(weights, layer_name + '/weights')
            with tf.name_scope('biases'):
                biases = self.tf_bias_variable([output_dim])
                self.tf_variable_summaries(biases, layer_name + '/biases')
            with tf.name_scope('Wx_plus_b'):
                preactivate = tf.add(tf.matmul(input_tensor, weights), biases)
                tf.histogram_summary(layer_name + '/pre_activations', preactivate)
            activations = act(preactivate, 'activation')
            tf.histogram_summary(layer_name + '/activations', activations)
            return activations

    # construct network
    def tf_network(self):
        """Build the Q-network: ReLU hidden layer, dropout, linear output layer."""
        # The first layer's input width is the state size; passing n_hidden_1
        # here only worked while both happened to be equal.
        hidden1 = self.tf_nn_layer(self.x, self.n_states, self.n_hidden_1, 'layer1', act=tf.nn.relu)
        with tf.name_scope('dropout'):
            tf.scalar_summary('dropout_probability', self.dropout)
            dropped = tf.nn.dropout(hidden1, self.dropout)
        qout = self.tf_nn_layer(dropped, self.n_hidden_1, self.n_actions, 'qvalues', act=tf.identity)
        return qout

    # Construct model
    def tf_define_model(self):
        """Create the session, network, loss, optimizer and summary writer."""
        # Init session
        self.session = tf.Session()
        # Model scope
        with tf.name_scope('Model'):
            self.predictor = self.tf_network()
        # Loss
        with tf.name_scope('Loss'):
            self.loss = tf.reduce_mean(tf.square(self.y - self.predictor))
        # Define optimizer
        with tf.name_scope('SGD'):
            self.train_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
        # Prepare summaries
        tf.scalar_summary('loss', self.loss)
        # Summary writer
        self.merged_summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.log_dir + '/train', graph=tf.get_default_graph())
        # Initializing the session
        # reconstructed: variables must be initialised before the first run;
        # uses the same TensorFlow 0.x API generation as the summary calls.
        self.session.run(tf.initialize_all_variables())

    # Train loop
    def tf_train_model(self):
        """Run ``n_episodes`` episodes, training on replay batches after every step.

        Actions are epsilon-greedy until ``stop_training`` consecutive wins
        have been reached; network updates are skipped once the rolling
        average score reaches ``_SOLVED_SCORE``.
        """
        # start open ai monitor
        if self.monitor_file:
            # reconstructed: Gym monitor API contemporary with this code - TODO confirm
            self.env.monitor.start(self.monitor_file, force=True)
        last_step = self.n_steps - 1
        # Training cycle
        for epoch in range(self.n_episodes):
            # restart episode
            state_tp1 = self.env.reset()
            endgame = False
            sum_avg_loss = 0.
            sum_max_qval = 0.
            n_explorations = 0.
            episode_score = 0.
            states = {}
            for t in range(self.n_steps):
                state_t = np.array(state_tp1)
                # epsilon-greedy exploration
                if self.consec_wins < self.stop_training and np.random.rand() <= self.eps(epoch):
                    n_explorations += 1
                    action = self.env.action_space.sample()
                else:
                    feed_dict = {self.x: state_t.reshape(1, -1), self.dropout: self.keep_prob}
                    qvals = self.session.run(self.predictor, feed_dict=feed_dict)
                    sum_max_qval += np.max(qvals)
                    action = int(np.argmax(qvals))
                # take a next step
                state_tp1, reward, endgame, info = self.env.step(action)
                # process reward
                reward = self.exp_process_reward(t, reward, endgame)
                episode_score = episode_score + 1.0
                # store experience
                states['action'] = action
                states['reward'] = float(reward)
                states['endgame'] = endgame
                states['state_t'] = np.array(state_t)
                states['state_tp1'] = np.array(state_tp1)
                # reconstructed: the transition was built but never stored
                self.exp_save_to_memory(states)
                # Training loop
                if self.game_score < self._SOLVED_SCORE:
                    # get experience replay
                    x_batch, y_batch = self.exp_get_batch()
                    # create feed dictionary
                    feed_dict = {self.x: x_batch, self.y: y_batch, self.dropout: self.keep_prob}
                    # training
                    _, loss, summary = self.session.run(
                        [self.train_op, self.loss, self.merged_summary_op], feed_dict=feed_dict)
                    # add summary to the summary_writer
                    self.summary_writer.add_summary(summary, self.global_step)
                    self.global_step += x_batch.shape[0]
                    # avg loss
                    sum_avg_loss += loss
                # Check if lost or not
                if endgame or t == last_step:
                    steps = t + 1.
                    self.update_game_score(episode_score)
                    # reconstructed: the format arguments were lost; per-step
                    # averages are reported - TODO confirm against the original
                    print("{:4d}: score={:8.1f}, loss={:6.2f}, max qval={:6.2f}, exp={:6.2f}, game score={:6.2f}".
                          format(epoch, episode_score, sum_avg_loss / steps, sum_max_qval / steps,
                                 n_explorations / steps, self.game_score))
                    if t == last_step:
                        self.consec_wins += 1
                    else:
                        self.consec_wins = 0
                    episode_score = 0
                    break
        # close monitor session
        if self.monitor_file:
            self.env.monitor.close()
if __name__ == "__main__":
    # Train the agent on CartPole.
    # NOTE(review): the environment id was missing from the recovered source;
    # 'CartPole-v0' is assumed from the cartpole paths and the 200-step
    # episode length - confirm.
    model = ExperienceQModel(
        env='CartPole-v0',
        monitor_file='results/cartpole',
        log_dir='/tmp/tf/cartpole-256_1e-3_norm',
        learning_rate=1.e-3,
        dropout_keep_prob=1.0,
        # epsilon starts at 0.6, decreases by 0.01 per episode, then is fixed
        # at 0.1 from episode 30 onwards
        exploration=lambda x: (60-x)/100. if x<30 else 0.1,
        stop_training=10,
    )
    # The constructor call alone never started training.
    model.tf_train_model()
