import numpy as np
import gym
from gym import wrappers
import tensorflow as tf
import json, sys, os
from os import path
import random
from collections import deque
#####################################################################################################
## Algorithm
# Deep Deterministic Policy Gradient (DDPG)
# An off-policy actor-critic algorithm that uses additive exploration noise (e.g. an Ornstein-Uhlenbeck process) on top
# of a deterministic policy to generate experiences (s, a, r, s'). It uses minibatches of these experiences from replay
# memory to update the actor (policy) and critic (Q function) parameters.
# Neural networks are used for function approximation.
# Slowly-changing "target" networks are used to improve stability and encourage convergence.
# Parameter updates are made via Adam.
# Assumes continuous action spaces!
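# Per-minibatch updates (Lillicrap et al. 2015, "Continuous control with deep reinforcement learning"):
#   critic: minimize E[ (r + gamma*Q'(s', mu'(s')) - Q(s,a))^2 ], with targets from the slow networks
#   actor:  follow the deterministic policy gradient E[ grad_a Q(s,a)|_{a=mu(s)} * grad_theta mu(s) ]
#   slow targets: theta' <- tau*theta + (1-tau)*theta' after each update (Polyak averaging)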
#####################################################################################################
## Setup
env_to_use = 'Pendulum-v0'
# hyperparameters
gamma = 0.99 # reward discount factor
h1_actor = 8 # hidden layer 1 size for the actor
h2_actor = 8 # hidden layer 2 size for the actor
h3_actor = 8 # hidden layer 3 size for the actor
h1_critic = 8 # hidden layer 1 size for the critic
h2_critic = 8 # hidden layer 2 size for the critic
h3_critic = 8 # hidden layer 3 size for the critic
lr_actor = 1e-3 # learning rate for the actor
lr_critic = 1e-3 # learning rate for the critic
lr_decay = 1 # learning rate decay (per episode)
l2_reg_actor = 1e-6 # L2 regularization factor for the actor
l2_reg_critic = 1e-6 # L2 regularization factor for the critic
dropout_actor = 0 # dropout rate for actor (0 = no dropout)
dropout_critic = 0 # dropout rate for critic (0 = no dropout)
num_episodes = 15000 # number of episodes
max_steps_ep = 10000 # default max number of steps per episode (unless env has a lower hardcoded limit)
tau = 1e-2 # soft target update rate
train_every = 1 # number of steps to run the policy (and collect experience) before updating network weights
replay_memory_capacity = int(1e5) # capacity of experience replay memory
minibatch_size = 1024 # size of minibatch from experience replay memory for updates
initial_noise_scale = 0.1 # scale of the exploration noise process (1.0 is the range of each action dimension)
noise_decay = 0.99 # decay rate (per episode) of the scale of the exploration noise process
exploration_mu = 0.0 # mu parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
exploration_theta = 0.15 # theta parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
exploration_sigma = 0.2 # sigma parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
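# Note: the training loop integrates this SDE with a simple Euler step at dt = 1, i.e.
#   X_{t+1} = X_t + theta*(mu - X_t) + sigma*N(0, 1)
# which mean-reverts toward mu while staying temporally correlated across steps.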
# game parameters
env = gym.make(env_to_use)
state_dim = np.prod(np.array(env.observation_space.shape)) # Get total number of dimensions in state
action_dim = np.prod(np.array(env.action_space.shape)) # Assuming continuous action space
# set seeds to 0
env.seed(0)
np.random.seed(0)
# prepare monitoring
outdir = '/tmp/ddpg-agent-results'
env = wrappers.Monitor(env, outdir, force=True)
def writefile(fname, s):
    with open(path.join(outdir, fname), 'w') as fh: fh.write(s)
info = {}
info['env_id'] = env.spec.id
info['params'] = dict(
    gamma = gamma,
    h1_actor = h1_actor,
    h2_actor = h2_actor,
    h3_actor = h3_actor,
    h1_critic = h1_critic,
    h2_critic = h2_critic,
    h3_critic = h3_critic,
    lr_actor = lr_actor,
    lr_critic = lr_critic,
    lr_decay = lr_decay,
    l2_reg_actor = l2_reg_actor,
    l2_reg_critic = l2_reg_critic,
    dropout_actor = dropout_actor,
    dropout_critic = dropout_critic,
    num_episodes = num_episodes,
    max_steps_ep = max_steps_ep,
    tau = tau,
    train_every = train_every,
    replay_memory_capacity = replay_memory_capacity,
    minibatch_size = minibatch_size,
    initial_noise_scale = initial_noise_scale,
    noise_decay = noise_decay,
    exploration_mu = exploration_mu,
    exploration_theta = exploration_theta,
    exploration_sigma = exploration_sigma
)
np.set_printoptions(threshold=sys.maxsize)
replay_memory = deque(maxlen=replay_memory_capacity) # deque with maxlen gives O(1) eviction of the oldest experience once full
def add_to_memory(experience):
    replay_memory.append(experience)
def sample_from_memory(minibatch_size):
    return random.sample(replay_memory, minibatch_size)
#####################################################################################################
## Tensorflow
tf.reset_default_graph()
# placeholders
state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim])
action_ph = tf.placeholder(dtype=tf.float32, shape=[None,action_dim])
reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim])
is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # indicators (go into target computation)
is_training_ph = tf.placeholder(dtype=tf.bool, shape=()) # for dropout
# episode counter
episodes = tf.Variable(0.0, trainable=False, name='episodes')
episode_inc_op = episodes.assign_add(1)
# will use this to initialize both the actor network and its slowly-changing target network with the same structure
def generate_actor_network(s, trainable, reuse):
    hidden = tf.layers.dense(s, h1_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense', reuse = reuse)
    hidden_drop = tf.layers.dropout(hidden, rate = dropout_actor, training = trainable & is_training_ph)
    hidden_2 = tf.layers.dense(hidden_drop, h2_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense_1', reuse = reuse)
    hidden_drop_2 = tf.layers.dropout(hidden_2, rate = dropout_actor, training = trainable & is_training_ph)
    hidden_3 = tf.layers.dense(hidden_drop_2, h3_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense_2', reuse = reuse)
    hidden_drop_3 = tf.layers.dropout(hidden_3, rate = dropout_actor, training = trainable & is_training_ph)
    actions_unscaled = tf.layers.dense(hidden_drop_3, action_dim, trainable = trainable, name = 'dense_3', reuse = reuse)
    actions = env.action_space.low + tf.nn.sigmoid(actions_unscaled)*(env.action_space.high - env.action_space.low) # bound the actions to the valid range
    return actions
# actor network
with tf.variable_scope('actor'):
    # Policy's outputted action for each state_ph (for generating actions and training the critic)
    actions = generate_actor_network(state_ph, trainable = True, reuse = False)
# slow target actor network
with tf.variable_scope('slow_target_actor', reuse=False):
    # Slow target policy's outputted action for each next_state_ph (for training the critic)
    # use stop_gradient to treat the output values as constant targets when doing backprop
    slow_target_next_actions = tf.stop_gradient(generate_actor_network(next_state_ph, trainable = False, reuse = False))
# will use this to initialize both the critic network and its slowly-changing target network with the same structure
def generate_critic_network(s, a, trainable, reuse):
    state_action = tf.concat([s, a], axis=1)
    hidden = tf.layers.dense(state_action, h1_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense', reuse = reuse)
    hidden_drop = tf.layers.dropout(hidden, rate = dropout_critic, training = trainable & is_training_ph)
    hidden_2 = tf.layers.dense(hidden_drop, h2_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense_1', reuse = reuse)
    hidden_drop_2 = tf.layers.dropout(hidden_2, rate = dropout_critic, training = trainable & is_training_ph)
    hidden_3 = tf.layers.dense(hidden_drop_2, h3_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense_2', reuse = reuse)
    hidden_drop_3 = tf.layers.dropout(hidden_3, rate = dropout_critic, training = trainable & is_training_ph)
    q_values = tf.layers.dense(hidden_drop_3, 1, trainable = trainable, name = 'dense_3', reuse = reuse)
    return q_values
with tf.variable_scope('critic') as scope:
    # Critic applied to state_ph and a given action (for training critic)
    q_values_of_given_actions = generate_critic_network(state_ph, action_ph, trainable = True, reuse = False)
    # Critic applied to state_ph and the current policy's outputted actions for state_ph (for training actor via deterministic policy gradient)
    q_values_of_suggested_actions = generate_critic_network(state_ph, actions, trainable = True, reuse = True)
# slow target critic network
with tf.variable_scope('slow_target_critic', reuse=False):
    # Slow target critic applied to slow target actor's outputted actions for next_state_ph (for training critic)
    slow_q_values_next = tf.stop_gradient(generate_critic_network(next_state_ph, slow_target_next_actions, trainable = False, reuse = False))
# isolate vars for each network
actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
slow_target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_actor')
critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')
slow_target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_critic')
# update values for slowly-changing targets towards current actor and critic
update_slow_target_ops = []
for i, slow_target_actor_var in enumerate(slow_target_actor_vars):
    update_slow_target_actor_op = slow_target_actor_var.assign(tau*actor_vars[i]+(1-tau)*slow_target_actor_var)
    update_slow_target_ops.append(update_slow_target_actor_op)
for i, slow_target_var in enumerate(slow_target_critic_vars):
    update_slow_target_critic_op = slow_target_var.assign(tau*critic_vars[i]+(1-tau)*slow_target_var)
    update_slow_target_ops.append(update_slow_target_critic_op)
update_slow_targets_op = tf.group(*update_slow_target_ops, name='update_slow_targets')
# One-step TD targets y_i for (s,a) from experience replay
# = r_i + gamma*Q_slow(s',mu_slow(s')) if s' is not terminal
# = r_i if s' terminal
targets = tf.expand_dims(reward_ph, 1) + tf.expand_dims(is_not_terminal_ph, 1) * gamma * slow_q_values_next
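# (shape note: reward_ph and is_not_terminal_ph are expanded to [N, 1] above so they broadcast
# against the [N, 1] output of the slow target critic)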
# 1-step temporal difference errors
td_errors = targets - q_values_of_given_actions
# critic loss function (mean-square value error with regularization)
critic_loss = tf.reduce_mean(tf.square(td_errors))
for var in critic_vars:
    if 'bias' not in var.name:
        critic_loss += l2_reg_critic * 0.5 * tf.nn.l2_loss(var)
# critic optimizer
critic_train_op = tf.train.AdamOptimizer(lr_critic*lr_decay**episodes).minimize(critic_loss, var_list=critic_vars)
# actor loss function (mean Q-values under current policy with regularization)
actor_loss = -1*tf.reduce_mean(q_values_of_suggested_actions)
for var in actor_vars:
    if 'bias' not in var.name:
        actor_loss += l2_reg_actor * 0.5 * tf.nn.l2_loss(var)
# actor optimizer
# the gradient of the mean Q-values wrt actor params is the deterministic policy gradient (keeping critic params fixed)
actor_train_op = tf.train.AdamOptimizer(lr_actor*lr_decay**episodes).minimize(actor_loss, var_list=actor_vars)
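# Note: differentiating the mean Q-value w.r.t. the actor's parameters backpropagates through the
# critic into the actor, so tf computes grad_theta J = E[ grad_a Q(s,a)|_{a=mu(s)} * grad_theta mu(s) ]
# automatically; restricting var_list to actor_vars keeps the critic's parameters fixed in this step.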
# initialize session
sess = tf.Session()
sess.run(tf.global_variables_initializer())
#####################################################################################################
## Training
total_steps = 0
for ep in range(num_episodes):
    total_reward = 0
    steps_in_ep = 0
    # Initialize exploration noise process
    noise_process = np.zeros(action_dim)
    noise_scale = (initial_noise_scale * noise_decay**ep) * (env.action_space.high - env.action_space.low)
    # Initial state
    observation = env.reset()
    if ep%10 == 0: env.render()
    for t in range(max_steps_ep):
        # choose action based on deterministic policy
        action_for_state, = sess.run(actions,
            feed_dict = {state_ph: observation[None], is_training_ph: False})
        # add temporally-correlated exploration noise to action (an Ornstein-Uhlenbeck process,
        # integrated with a simple Euler step at dt = 1)
        # print(action_for_state)
        noise_process += exploration_theta*(exploration_mu - noise_process) + exploration_sigma*np.random.randn(action_dim)
        # print(noise_scale*noise_process)
        action_for_state += noise_scale*noise_process
        # take step
        next_observation, reward, done, _info = env.step(action_for_state)
        if ep%10 == 0: env.render()
        total_reward += reward
        add_to_memory((observation, action_for_state, reward, next_observation,
            # is next_observation a terminal state?
            # 0.0 if done and not env.env._past_limit() else 1.0))
            0.0 if done else 1.0))
        # update network weights to fit a minibatch of experience
        if total_steps%train_every == 0 and len(replay_memory) >= minibatch_size:
            # grab N (s,a,r,s') tuples from replay memory
            minibatch = sample_from_memory(minibatch_size)
            # update the critic and actor params using mean-square value error and deterministic policy gradient, respectively
            _, _ = sess.run([critic_train_op, actor_train_op],
                feed_dict = {
                    state_ph: np.asarray([elem[0] for elem in minibatch]),
                    action_ph: np.asarray([elem[1] for elem in minibatch]),
                    reward_ph: np.asarray([elem[2] for elem in minibatch]),
                    next_state_ph: np.asarray([elem[3] for elem in minibatch]),
                    is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch]),
                    is_training_ph: True})
            # update slow actor and critic targets towards current actor and critic
            _ = sess.run(update_slow_targets_op)
        observation = next_observation
        total_steps += 1
        steps_in_ep += 1
        if done:
            # Increment episode counter
            _ = sess.run(episode_inc_op)
            break
    print('Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'%(ep,total_reward,steps_in_ep, noise_scale))
# Finalize and upload results
writefile('info.json', json.dumps(info))
env.close()
gym.upload(outdir)
Hi, thanks for sharing. For the actor loss here,

actor_loss = -1*tf.reduce_mean(q_values_of_suggested_actions)

it looks like you are only considering the contribution of the critic. But in the original paper, the deterministic policy gradient has two components:

$\nabla_{\theta^\mu} J \approx \mathbb{E}_{s_t \sim \rho^\beta}\big[\, \nabla_{\theta^\mu} Q(s,a\,|\,\theta^Q)\,\big|_{s=s_t,\,a=\mu(s_t|\theta^\mu)} \big] = \mathbb{E}_{s_t \sim \rho^\beta}\big[\, \nabla_a Q(s,a\,|\,\theta^Q)\,\big|_{s=s_t,\,a=\mu(s_t)} \,\nabla_{\theta^\mu}\mu(s\,|\,\theta^\mu)\,\big|_{s=s_t} \big]$

In your implementation, the contribution of the actor network is missing.
@mehdimashayekhi This is not the case. q_values_of_suggested_actions passes actions through the critic network, where actions come from a forward pass through the actor network. This is where the contribution of the actor network comes from.
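To make that concrete: because actions is itself a function of the actor's parameters, differentiating actor_loss with respect to actor_vars already yields the chain-rule product $\nabla_a Q \cdot \nabla_{\theta^\mu} \mu$ from the paper. A minimal sanity-check sketch, assuming the graph and names from this gist (dJ_dtheta and dQ_da are illustrative names, not part of the original script):

# full deterministic policy gradient: backprop flows through the critic into the actor
dJ_dtheta = tf.gradients(actor_loss, actor_vars)
# the grad_a Q(s,a)|_{a=mu(s)} factor in isolation
dQ_da = tf.gradients(q_values_of_suggested_actions, actions)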
@lerrytang, var_list is restricted to the actor's variables in the actor update step, so the critic's parameters are held fixed there.