@heerad
Created July 24, 2017 06:43
import numpy as np
import gym
from gym import wrappers
import tensorflow as tf
import json, sys, os
from os import path
#####################################################################################################
## Algorithm
# Actor-Critic with eligibility traces
# A policy gradient algorithm that uses "backward view" TD(\lambda) to update the actor and critic
# via eligibility traces in an online fashion (updates are made per time step rather than per episode).
# Neural networks are used for function approximation.
# A slowly-changing "target" critic network and gradient norm clipping are used to improve
# training stability and encourage convergence.
# Parameter updates are made via RMSProp.
#####################################################################################################
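# For reference, a minimal sketch (illustrative only, not used by this script) of the
# backward-view TD(lambda) update that the TensorFlow graph below implements:
#   e     <- gamma * lambda * e + grad        (eligibility trace, reset to zero at episode start)
#   theta <- theta + lr * delta * e           (delta = R + gamma*V(S') - V(S))
def _td_lambda_step_reference(theta, e, grad, delta, lr, gamma_, lam):
    """Illustrative single backward-view TD(lambda) parameter update (not called below)."""
    e = gamma_ * lam * e + grad      # decay the trace, then accumulate the new gradient
    theta = theta + lr * delta * e   # step parameters along the trace, scaled by the TD error
    return theta, e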
## Setup
env_to_use = 'Acrobot-v1'
# hyperparameters
gamma = 1 # reward discount factor
lambda_actor = 0.9 # TD(\lambda) parameter (0: 1-step TD, 1: MC) for actor
lambda_critic = 0.9 # TD(\lambda) parameter (0: 1-step TD, 1: MC) for critic
lambda_decay = 1 # decay rate for lambda parameters
h1_actor = 8 # hidden layer 1 size for actor
h2_actor = 8 # hidden layer 2 size for actor
h3_actor = 8 # hidden layer 3 size for actor
h1_critic = 8 # hidden layer 1 size for critic
h2_critic = 8 # hidden layer 2 size for critic
h3_critic = 8 # hidden layer 3 size for critic
lr_actor = 5e-5 # learning rate for actor
lr_critic = 5e-5 # learning rate for critic
lr_decay = .9995 # learning rate decay (per episode)
use_rmsprop = False # whether to use RMSprop or SGD
rmsprop_decay = 0.99 # Decay rate for RMSProp optimization procedure
rmsprop_eps = 1e-7 # Epsilon for RMSProp optimization procedure
l2_reg_actor = 1e-6 # L2 regularization factor for actor
l2_reg_critic = 1e-6 # L2 regularization factor for critic
dropout_actor = 0 # dropout rate for actor (0 = no dropout)
dropout_critic = 0 # dropout rate for critic (0 = no dropout)
num_episodes = 50000 # number of episodes
max_steps_ep = 10000 # default max number of steps per episode (unless env has a lower hardcoded limit)
clip_norm = 10 # maximum gradient norm for clipping
slow_critic_burnin = 100 # number of steps where slow critic weights are tied to critic weights
update_slow_critic_every = 100 # number of steps to use slow critic as target before updating it to latest critic
# game parameters
env = gym.make(env_to_use)
state_dim = np.prod(np.array(env.observation_space.shape)) # Get total number of dimensions in state
n_actions = env.action_space.n # Assuming discrete action space
# prepare monitorings
outdir = '/tmp/td_lambda_actor_critic-agent-results'
env = wrappers.Monitor(env, outdir, force=True)
def writefile(fname, s):
    with open(path.join(outdir, fname), 'w') as fh: fh.write(s)
info = {}
info['env_id'] = env.spec.id
info['params'] = dict(
    gamma = gamma,
    lambda_actor = lambda_actor,
    lambda_critic = lambda_critic,
    lambda_decay = lambda_decay,
    h1_actor = h1_actor,
    h2_actor = h2_actor,
    h3_actor = h3_actor,
    h1_critic = h1_critic,
    h2_critic = h2_critic,
    h3_critic = h3_critic,
    lr_actor = lr_actor,
    lr_critic = lr_critic,
    lr_decay = lr_decay,
    use_rmsprop = use_rmsprop,
    rmsprop_decay = rmsprop_decay,
    rmsprop_eps = rmsprop_eps,
    l2_reg_actor = l2_reg_actor,
    l2_reg_critic = l2_reg_critic,
    dropout_actor = dropout_actor,
    dropout_critic = dropout_critic,
    num_episodes = num_episodes,
    clip_norm = clip_norm,
    slow_critic_burnin = slow_critic_burnin,
    update_slow_critic_every = update_slow_critic_every
)
#####################################################################################################
## Tensorflow
tf.reset_default_graph()
# placeholders
state_ph = tf.placeholder(dtype=tf.float32, shape=[1,state_dim]) # Needs to be rank >=2
action_ph = tf.placeholder(dtype=tf.int32, shape=()) # for computing policy gradient for action taken
delta_ph = tf.placeholder(dtype=tf.float32, shape=()) # R + gamma*V(S') - V(S) -- for computing grad steps
training_actor = tf.placeholder(dtype=tf.bool, shape=()) # determines how to handle dropout in the actor
# episode counter
episodes = tf.Variable(0.0, trainable=False, name='episodes')
episode_inc_op = episodes.assign_add(1)
# actor network
with tf.variable_scope('actor', reuse=False):
    actor_hidden = tf.layers.dense(state_ph, h1_actor, activation = tf.nn.relu)
    actor_hidden_drop = tf.layers.dropout(actor_hidden, rate = dropout_actor, training = training_actor)
    actor_hidden_2 = tf.layers.dense(actor_hidden_drop, h2_actor, activation = tf.nn.relu)
    actor_hidden_drop_2 = tf.layers.dropout(actor_hidden_2, rate = dropout_actor, training = training_actor)
    actor_hidden_3 = tf.layers.dense(actor_hidden_drop_2, h3_actor, activation = tf.nn.relu)
    actor_hidden_drop_3 = tf.layers.dropout(actor_hidden_3, rate = dropout_actor, training = training_actor)
    actor_logits = tf.squeeze(tf.layers.dense(actor_hidden_drop_3, n_actions))
    actor_logits -= tf.reduce_max(actor_logits) # for numerical stability
    actor_policy = tf.nn.softmax(actor_logits)
    actor_logprob_action = actor_logits[action_ph] - tf.reduce_logsumexp(actor_logits)
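# Note: actor_logprob_action equals log pi(action|state) via the identity
# log(softmax(x)[a]) = x[a] - logsumexp(x); subtracting the max above leaves this value unchanged.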
# need a function so that we can create the same graph (but with different weights) for normal critic, and
# the slowly-changing critic used as a target
def generate_critic_network(s, trainable):
    critic_hidden = tf.layers.dense(s, h1_critic, activation = tf.nn.relu, trainable = trainable)
    critic_hidden_drop = tf.layers.dropout(critic_hidden, rate = dropout_critic, training = trainable)
    critic_hidden_2 = tf.layers.dense(critic_hidden_drop, h2_critic, activation = tf.nn.relu, trainable = trainable)
    critic_hidden_drop_2 = tf.layers.dropout(critic_hidden_2, rate = dropout_critic, training = trainable)
    critic_hidden_3 = tf.layers.dense(critic_hidden_drop_2, h3_critic, activation = tf.nn.relu, trainable = trainable)
    critic_hidden_drop_3 = tf.layers.dropout(critic_hidden_3, rate = dropout_critic, training = trainable)
    critic_value = tf.squeeze(tf.layers.dense(critic_hidden_drop_3, 1, trainable = trainable))
    return critic_value
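# Note: the dropout layers above are gated by training=trainable, so the online critic applies dropout
# whenever it is evaluated while the slow critic never does; with dropout_critic = 0 this has no effect.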
# critic network
with tf.variable_scope('critic', reuse=False):
    critic_value = generate_critic_network(state_ph, trainable = True)
# slowly-changing critic network (for targets)
with tf.variable_scope('slow_critic', reuse=False):
    critic_value_slow = generate_critic_network(state_ph, trainable = False)
# isolate vars for each network
actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')
slow_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_critic')
# update values for slowly-changing critic network to match current critic network
update_slow_critic_ops = []
for i, slow_critic_var in enumerate(slow_critic_vars):
    update_slow_critic_op = slow_critic_var.assign(critic_vars[i])
    update_slow_critic_ops.append(update_slow_critic_op)
update_slow_critic_op = tf.group(*update_slow_critic_ops, name='update_slow_critic')
# gradients
actor_logprob_action_grads = tf.gradients(actor_logprob_action, actor_vars)
critic_value_grads = tf.gradients(critic_value, critic_vars)
# collect stuff for actor and critic updates
ac_update_inputs = dict(
    actor = dict(
        vars = actor_vars,
        grads = actor_logprob_action_grads,
        lambda_ = lambda_actor,
        lr = lr_actor,
        l2_reg = l2_reg_actor
    ),
    critic = dict(
        vars = critic_vars,
        grads = critic_value_grads,
        lambda_ = lambda_critic,
        lr = lr_critic,
        l2_reg = l2_reg_critic
    )
)
# gradient step ops with eligibility traces
gradient_step_ops = []
trace_reset_ops = []
for network in ac_update_inputs: # actor and critic
    net_update_inputs = ac_update_inputs[network]
    with tf.variable_scope(network+'/traces'):
        for var, grad in zip(net_update_inputs['vars'], net_update_inputs['grads']):
            lr = net_update_inputs['lr'] * lr_decay**episodes
            l2_reg = net_update_inputs['l2_reg']
            if 'bias' in var.name: l2_reg = 0 # don't regularize biases
            lambda_ = net_update_inputs['lambda_'] * lambda_decay**episodes
            # Elig trace update: e <- gamma*lambda*e + grad
            trace = tf.Variable(tf.zeros(grad.get_shape()), trainable=False, name='trace')
            trace_op = trace.assign(gamma*lambda_*trace + tf.clip_by_norm(grad, clip_norm = clip_norm))
            if use_rmsprop:
                # Collect total gradient (objective function + regularization)
                grad_tot = tf.Variable(tf.zeros(grad.get_shape()), trainable=False, name='grad_tot')
                grad_tot_op = grad_tot.assign(delta_ph*trace_op - l2_reg*var)
                # RMSProp cache update: c <- decay*c + (1-decay)*dX^2
                cache = tf.Variable(tf.zeros(grad.get_shape()), trainable=False, name='cache')
                cache_op = cache.assign(rmsprop_decay*cache + (1-rmsprop_decay)*tf.square(grad_tot_op))
                # RMSProp step (use grad_tot_op so the freshly-assigned total gradient is read)
                grad_step_op = var.assign_add(lr * grad_tot_op / tf.sqrt(cache_op + rmsprop_eps))
            else:
                # Gradient step, including for L2 regularization
                grad_step_op = var.assign_add(lr * (delta_ph*trace_op - l2_reg*var))
            # Need to reset eligibility traces every episode
            trace_reset_op = trace.assign(tf.zeros(trace.get_shape()))
            # Collect each of these operations per trainable variable
            gradient_step_ops.append(grad_step_op)
            trace_reset_ops.append(trace_reset_op)
td_lambda_op = tf.group(*gradient_step_ops, name='td_lambda')
trace_reset_op = tf.group(*trace_reset_ops, name='trace_reset')
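# td_lambda_op applies one TD(lambda) step to every actor and critic variable at once;
# trace_reset_op zeroes all eligibility traces and is run at the end of each episode (see training loop).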
# initialize session
sess = tf.Session()
sess.run(tf.global_variables_initializer())
#####################################################################################################
## Training
total_steps = 0
for ep in range(num_episodes):
    total_reward = 0
    steps_in_ep = 0
    v_s = []
    # Initial state
    observation = env.reset()
    # env.render()
    for t in range(max_steps_ep):
        # compute value of current state and action probabilities
        state_value, action_probs = sess.run([critic_value, actor_policy],
            feed_dict={state_ph: observation[None], training_actor: False})
        v_s.append(state_value)
        # get action
        action = np.random.choice(n_actions, p=action_probs)
        # take step
        next_observation, reward, done, _info = env.step(action)
        # env.render()
        total_reward += reward
        # update the slow critic's weights to match the latest critic if it's time to do so
        if total_steps%update_slow_critic_every == 0:
            _ = sess.run(update_slow_critic_op)
        # compute value of next state
        if done and not env.env._past_limit(): # only treat the next state as terminal if the episode didn't end due to the time limit
        # if done:
            next_state_value = 0
        elif total_steps < slow_critic_burnin: # don't use the slowly-changing critic just yet
            next_state_value = sess.run(critic_value, feed_dict={state_ph: next_observation[None]})
        else: # use a slowly-changing critic for the target to improve training stability
            next_state_value = sess.run(critic_value_slow, feed_dict={state_ph: next_observation[None]})
        # compute TD error
        delta = reward + gamma*next_state_value - state_value
        # update actor and critic params using TD(lambda)
        _ = sess.run(td_lambda_op,
            feed_dict={state_ph: observation[None], action_ph: action, delta_ph: delta, training_actor: True})
        observation = next_observation
        total_steps += 1
        steps_in_ep += 1
        if done:
            # Reset eligibility traces to 0
            _ = sess.run(trace_reset_op)
            # Increment episode counter
            _ = sess.run(episode_inc_op)
            break
    print('Episode %2i, Reward: %7.3f, Steps: %i'%(ep,total_reward,steps_in_ep))
    print(np.mean(v_s))
# Finalize and upload results
writefile('info.json', json.dumps(info))
env.close()
gym.upload(outdir)