@tambetm
Last active October 19, 2016 20:43

Uses normalized advantage functions (NAF) from this paper:

Continuous Deep Q-Learning with Model-based Acceleration
Shixiang Gu, Timothy Lillicrap, Ilya Sutskever, Sergey Levine
http://arxiv.org/abs/1603.00748
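
For reference, NAF constrains the Q-function to Q(x,u) = V(x) + A(x,u), where the advantage is the quadratic A(x,u) = -1/2 (u - mu(x))^T P(x) (u - mu(x)) and P(x) = L(x) L(x)^T is built from a lower-triangular matrix predicted by the network, so the greedy action is simply u = mu(x). The helpers _L, _P, _A and _Q in the script below implement this quadratic form (without the 1/2 factor). A rough NumPy sketch of the decomposition, separate from the script and using illustrative names only:

import numpy as np

def naf_q(v, mu, L, u):
  # P = L L^T is positive semi-definite, so A <= 0 and Q is maximized at u = mu
  P = L.dot(L.T)
  d = (u - mu).reshape(-1, 1)
  A = -0.5 * float(d.T.dot(P).dot(d))
  return v + A

# example: with V = 1.0, mu = [0.3] and L = [[2.0]] (so P = [[4.0]]), u = mu gives Q = V
print naf_q(1.0, np.array([0.3]), np.array([[2.0]]), np.array([0.3]))  # -> 1.0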

The command line used was:

python naf.py InvertedPendulum-v1 --batch_norm --optimizer_lr 0.0001 --noise fixed --noise_scale 0.01 --tau 1 --l2_reg 0.001 --batch_size 1000
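
Two of these flags are worth noting: --tau 1 turns the soft target-network update in the script into a full copy of the main network weights after every training step, and --noise fixed with --noise_scale 0.01 adds zero-mean Gaussian noise with standard deviation 0.01 to the greedy action.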
import argparse
import gym
from gym.spaces import Box, Discrete
from keras.models import Model
from keras.layers import Input, Dense, Lambda, Reshape, merge
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l2, l1
from keras.constraints import maxnorm, unitnorm
from keras.optimizers import Adam, RMSprop
from keras import backend as K
import theano.tensor as T
import numpy as np
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', type=int, default=100)
parser.add_argument('--hidden_size', type=int, default=100)
parser.add_argument('--layers', type=int, default=2)
parser.add_argument('--batch_norm', action="store_true", default=False)
parser.add_argument('--no_batch_norm', action="store_false", dest="batch_norm")
parser.add_argument('--max_norm', type=int)
parser.add_argument('--unit_norm', action='store_true', default=False)
parser.add_argument('--l2_reg', type=float)
parser.add_argument('--l1_reg', type=float)
parser.add_argument('--min_train', type=int, default=10)
parser.add_argument('--train_repeat', type=int, default=10)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--tau', type=float, default=0.001)
parser.add_argument('--episodes', type=int, default=200)
parser.add_argument('--max_timesteps', type=int, default=200)
parser.add_argument('--activation', choices=['tanh', 'relu'], default='tanh')
parser.add_argument('--optimizer', choices=['adam', 'rmsprop'], default='adam')
parser.add_argument('--optimizer_lr', type=float, default=0.001)
parser.add_argument('--noise', choices=['linear_decay', 'exp_decay', 'fixed', 'covariance'], default='linear_decay')
parser.add_argument('--noise_scale', type=float, default=0.01)
parser.add_argument('--display', action='store_true', default=True)
parser.add_argument('--no_display', dest='display', action='store_false')
parser.add_argument('--gym_record')
parser.add_argument('environment')
args = parser.parse_args()
assert K._BACKEND == 'theano', "only works with Theano as backend"
# create environment
env = gym.make(args.environment)
assert isinstance(env.observation_space, Box), "observation space must be continuous"
assert isinstance(env.action_space, Box), "action space must be continuous"
assert len(env.action_space.shape) == 1
num_actuators = env.action_space.shape[0]
print "num_actuators:", num_actuators
# start monitor for OpenAI Gym
if args.gym_record:
  env.monitor.start(args.gym_record)
# optional norm constraint
if args.max_norm:
  W_constraint = maxnorm(args.max_norm)
elif args.unit_norm:
  W_constraint = unitnorm()
else:
  W_constraint = None
# optional regularizer
if args.l2_reg:
  W_regularizer = l2(args.l2_reg)
elif args.l1_reg:
  W_regularizer = l1(args.l1_reg)
else:
  W_regularizer = None
# helper functions to use with layers
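# the network outputs V(x), mu(x) and the entries of a lower-triangular matrix L(x);
# _P builds P(x) = L(x) L(x)^T and _A the quadratic advantage -(u - mu)^T P (u - mu),
# so Q(x,u) = V(x) + A(x,u) is maximized at u = mu(x)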
if num_actuators == 1:
  # simpler versions for single actuator case
  def _L(x):
    return K.exp(x)

  def _P(x):
    return x**2

  def _A(t):
    m, p, u = t
    return -(u - m)**2 * p

  def _Q(t):
    v, a = t
    return v + a
else:
  # use Theano advanced operators for multiple actuator case
  def _L(x):
    # initialize with zeros
    batch_size = x.shape[0]
    a = T.zeros((batch_size, num_actuators, num_actuators))
    # set diagonal elements
    batch_idx = T.extra_ops.repeat(T.arange(batch_size), num_actuators)
    diag_idx = T.tile(T.arange(num_actuators), batch_size)
    b = T.set_subtensor(a[batch_idx, diag_idx, diag_idx], T.flatten(T.exp(x[:, :num_actuators])))
    # set lower triangle
    cols = np.concatenate([np.array(range(i), dtype=np.uint) for i in xrange(num_actuators)])
    rows = np.concatenate([np.array([i]*i, dtype=np.uint) for i in xrange(num_actuators)])
    cols_idx = T.tile(T.as_tensor_variable(cols), batch_size)
    rows_idx = T.tile(T.as_tensor_variable(rows), batch_size)
    batch_idx = T.extra_ops.repeat(T.arange(batch_size), len(cols))
    c = T.set_subtensor(b[batch_idx, rows_idx, cols_idx], T.flatten(x[:, num_actuators:]))
    return c

  def _P(x):
    return K.batch_dot(x, K.permute_dimensions(x, (0,2,1)))

  def _A(t):
    m, p, u = t
    d = K.expand_dims(u - m, -1)
    return -K.batch_dot(K.batch_dot(K.permute_dimensions(d, (0,2,1)), p), d)

  def _Q(t):
    v, a = t
    return v + a
# helper function to create the network layers (called twice: main and target model)
def createLayers():
  x = Input(shape=env.observation_space.shape, name='x')
  u = Input(shape=env.action_space.shape, name='u')
  if args.batch_norm:
    h = BatchNormalization()(x)
  else:
    h = x
  for i in xrange(args.layers):
    h = Dense(args.hidden_size, activation=args.activation, name='h'+str(i+1),
        W_constraint=W_constraint, W_regularizer=W_regularizer)(h)
    if args.batch_norm and i != args.layers - 1:
      h = BatchNormalization()(h)
  v = Dense(1, name='v', W_constraint=W_constraint, W_regularizer=W_regularizer)(h)
  m = Dense(num_actuators, name='m', W_constraint=W_constraint, W_regularizer=W_regularizer)(h)
  l0 = Dense(num_actuators * (num_actuators + 1)/2, name='l0',
      W_constraint=W_constraint, W_regularizer=W_regularizer)(h)
  l = Lambda(_L, output_shape=(num_actuators, num_actuators), name='l')(l0)
  p = Lambda(_P, output_shape=(num_actuators, num_actuators), name='p')(l)
  a = merge([m, p, u], mode=_A, output_shape=(None, num_actuators,), name="a")
  q = merge([v, a], mode=_Q, output_shape=(None, num_actuators,), name="q")
  return x, u, m, v, q, p, a
x, u, m, v, q, p, a = createLayers()
# wrappers around computational graph
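# learning_phase 0 means test mode, so BatchNormalization uses its running statistics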
fmu = K.function([K.learning_phase(), x], m)
mu = lambda x: fmu([0, x])
fP = K.function([K.learning_phase(), x], p)
P = lambda x: fP([0, x])
fA = K.function([K.learning_phase(), x, u], a)
A = lambda x, u: fA([0, x, u])
fQ = K.function([K.learning_phase(), x, u], q)
Q = lambda x, u: fQ([0, x, u])
# main model
model = Model(input=[x,u], output=q)
model.summary()
if args.optimizer == 'adam':
  optimizer = Adam(args.optimizer_lr)
elif args.optimizer == 'rmsprop':
  optimizer = RMSprop(args.optimizer_lr)
else:
  assert False
model.compile(optimizer=optimizer, loss='mse')
# another set of layers for target model
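# the target model provides the V(s') estimates used in the Q-update below; its
# weights track the main model through the soft update controlled by --tau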
x, u, m, v, q, p, a = createLayers()
# V() function uses target model weights
fV = K.function([K.learning_phase(), x], v)
V = lambda x: fV([0, x])
# target model is initialized from main model
target_model = Model(input=[x,u], output=q)
target_model.set_weights(model.get_weights())
# replay memory
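# transitions (s, a, r, s', done) are kept in five parallel lists that grow without bound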
prestates = []
actions = []
rewards = []
poststates = []
terminals = []
# the main learning loop
total_reward = 0
for i_episode in xrange(args.episodes):
  observation = env.reset()
  #print "initial state:", observation
  episode_reward = 0
  for t in xrange(args.max_timesteps):
    if args.display:
      env.render()
    # predict the mean action from current observation
    x = np.array([observation])
    u = mu(x)[0]
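    # exploration: 'linear_decay' and 'exp_decay' anneal the perturbation per episode,
    # 'fixed' uses a constant scale, and 'covariance' samples around mu(x) with
    # (co)variance derived from the inverse of the precision matrix P(x)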
    # add exploration noise to the action
    if args.noise == 'linear_decay':
      action = u + np.random.randn(num_actuators) / (i_episode + 1)
    elif args.noise == 'exp_decay':
      action = u + np.random.randn(num_actuators) * 10 ** -i_episode
    elif args.noise == 'fixed':
      action = u + np.random.randn(num_actuators) * args.noise_scale
    elif args.noise == 'covariance':
      if num_actuators == 1:
        std = np.minimum(args.noise_scale / P(x)[0], 1)
        #print "std:", std
        action = np.random.normal(u, std, size=(1,))
      else:
        cov = np.minimum(np.linalg.inv(P(x)[0]) * args.noise_scale, 1)
        #print "covariance:", cov
        action = np.random.multivariate_normal(u, cov)
    else:
      assert False
    #print "action:", action, "Q:", Q(x, np.array([action])), "V:", V(x)
    #print "action:", action, "advantage:", A(x, np.array([action]))
    #print "mu:", u, "action:", action
    #print "Q(mu):", Q(x, np.array([u])), "Q(action):", Q(x, np.array([action]))
    # add current observation and action to replay memory
    prestates.append(observation)
    actions.append(action)
    #print "prestate:", observation
    # take the action and record reward
    observation, reward, done, info = env.step(action)
    episode_reward += reward
    #print "reward:", reward
    #print "poststate:", observation
    # add reward, resulting observation and terminal indicator to replay memory
    rewards.append(reward)
    poststates.append(observation)
    terminals.append(done)
    #print "done:", done
    # start training after min_train experiences are recorded in replay memory
    if len(prestates) > args.min_train:
      loss = 0
      # perform train_repeat Q-updates
      for k in xrange(args.train_repeat):
        # if there are fewer experiences than batch size, then use all of them
        if len(prestates) > args.batch_size:
          indexes = np.random.choice(len(prestates), size=args.batch_size)
        else:
          indexes = range(len(prestates))
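        # the Q-learning target uses the target network's V(s'); no action maximization is
        # needed, since max_u Q(s',u) = V(s') under the NAF parameterization; note that the
        # terminal flags recorded above are not used to mask the bootstrap term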
        # Q-update
        v = V(np.array(poststates)[indexes])
        y = np.array(rewards)[indexes] + args.gamma * np.squeeze(v)
        loss += model.train_on_batch([np.array(prestates)[indexes], np.array(actions)[indexes]], y)
        # copy weights to target model, averaged by tau
        weights = model.get_weights()
        target_weights = target_model.get_weights()
        for i in xrange(len(weights)):
          target_weights[i] = args.tau * weights[i] + (1 - args.tau) * target_weights[i]
        target_model.set_weights(target_weights)
      #print "average loss:", loss/k
    if done:
      break
print "Episode {} finished after {} timesteps, reward {}".format(i_episode + 1, t + 1, episode_reward)
total_reward += episode_reward
print "Average reward per episode {}".format(total_reward / args.episodes)
if args.gym_record:
  env.monitor.close()