@lemonzi
Created March 13, 2017 22:34
TensorFlow implementation of an Advantage Actor-Critic loss
import numpy as np
import scipy.signal
import tensorflow as tf
import tensorify
@tensorify.tensorflow_op(tf.float32)
def discount(x, gamma=1.0, axis=0):
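    # lfilter with b=[1], a=[1, -gamma], applied to the reversed signal and
    # reversed back, implements the recurrence y[t] = x[t] + gamma * y[t + 1]:
    # the discounted cumulative sum computed backwards from the end of the
    # episode. For example, discount([1., 1., 1.], gamma=0.5) -> [1.75, 1.5, 1.].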
    y = scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=axis)[::-1]
    return y.astype(np.float32)

def AdvantageActorCriticLoss(actions, rewards, policy, values, bootstrap,
                             value_weight=0.5, entropy_weight=1e-4,
                             gamma=0.9, epsilon=1e-10):
"""Computes the loss for the AAC algorithm.
Adapted from https://github.com/awjuliani/DeepRL-Agents.
Args:
actions: The actual actions that the agent took.
rewards: The reward obtained at each time step.
policy: A differentiable (mu, sigma) tuple with the generated policy.
values: A differentiable vector with the state-value estimates.
bootstrap: The estimated future rewards after the final step.
value_weight: Value loss weight (how close it is to target_values).
entropy_weight: Entropy weight, for exploration encouragement.
gamma: The discount factor for the reward and GAE.
epsilon: Numerical stabilization constant for the entropy logarithm.
Returns:
- A differentiable scalar with the total loss for the batch.
- An op with the merged summaries that have been created.
"""
    with tf.name_scope('GeneralizedAdvantageEstimation'):
        # Compute the target values (or discounted rewards) bootstrapping with
        # the estimated future returns (or value) of the last state.
        bootstrap = tf.expand_dims(bootstrap, 0)
        episode_length = tf.shape(values)[0]
        rewards_plus = tf.concat([rewards, bootstrap], axis=0)
        target_values = tf.stop_gradient(
            discount(rewards_plus, gamma)[:episode_length])
        # Compute the advantages using Generalized Advantage Estimation.
        values_plus = tf.concat([values[1:], bootstrap], axis=0)
        advantages = tf.stop_gradient(
            discount(rewards + gamma * values_plus - values, gamma))
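        # Note: discounting the TD residuals with gamma itself corresponds to
        # GAE with lambda = 1, so these advantages equal target_values - values.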
    # The differential entropy (valid for continuous policies) of a normal
    # distribution, after dropping constants, is the log of the determinant of
    # the covariance, which is the sum of the logs of the per-dimension
    # variances for a diagonal covariance matrix.
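    # Concretely, H[N(mu, Sigma)] = 0.5 * log((2 * pi * e)^k * det(Sigma)); the
    # additive constant does not affect the gradient and the 0.5 factor is
    # absorbed into entropy_weight.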
    with tf.name_scope('Entropy'):
        entropy = tf.reduce_sum(tf.log(policy.sigma + epsilon), axis=1)
        entropy_loss = -tf.reduce_sum(entropy)
    with tf.name_scope('PolicyLoss'):
        # Squared Mahalanobis distance between predicted and sampled actions.
        squared_difference = tf.squared_difference(actions, policy.mu)
        squared_distance = tf.reduce_sum(
            squared_difference / (policy.sigma + epsilon), axis=1)
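        # For a diagonal Gaussian, -log N(a; mu, sigma) is, up to an additive
        # constant, 0.5 * sum_i (log(sigma_i) + (a_i - mu_i)^2 / sigma_i), i.e.
        # 0.5 * (entropy + squared_distance) as computed above.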
        # Policy-gradient surrogate: the (scaled) negative log-density of each
        # taken action, weighted by its advantage.
        policy_loss = tf.reduce_sum((entropy + squared_distance) * advantages)
    with tf.name_scope('ValueLoss'):
        # Value loss: the critic tries to predict the discounted reward.
        value_loss = tf.reduce_sum(
            tf.squared_difference(target_values, values))
    with tf.name_scope('TotalLoss'):
        loss = (policy_loss
                + value_weight * value_loss
                + entropy_weight * entropy_loss)
    with tf.name_scope('Summaries'):
        float_length = tf.to_float(episode_length)
        loss_summaries = [
            tf.summary.scalar('Value', value_loss / float_length),
            tf.summary.scalar('Policy', policy_loss / float_length),
            tf.summary.scalar('Entropy', -entropy_loss / float_length),
            tf.summary.scalar('Total', loss / float_length)]
    return loss, tf.summary.merge(loss_summaries)
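
For context, here is a minimal sketch of how the loss might be wired into a graph. It is not part of the gist: the toy actor-critic head (tf.layers.dense with a softplus variance), the state and action sizes, and the AdamOptimizer learning rate are all illustrative assumptions.

if __name__ == '__main__':
    import collections

    # Hypothetical container satisfying the (mu, sigma) policy interface.
    PolicyTuple = collections.namedtuple('PolicyTuple', ['mu', 'sigma'])

    state_size, action_size = 4, 2  # arbitrary sizes for illustration
    states = tf.placeholder(tf.float32, [None, state_size])
    actions = tf.placeholder(tf.float32, [None, action_size])
    rewards = tf.placeholder(tf.float32, [None])
    bootstrap = tf.placeholder(tf.float32, [])

    # Stand-in actor-critic head producing the Gaussian policy parameters
    # (sigma is the per-dimension variance) and the state-value estimates.
    hidden = tf.layers.dense(states, 32, activation=tf.nn.relu)
    mu = tf.layers.dense(hidden, action_size)
    sigma = tf.nn.softplus(tf.layers.dense(hidden, action_size))
    values = tf.squeeze(tf.layers.dense(hidden, 1), axis=1)

    loss, summaries = AdvantageActorCriticLoss(
        actions, rewards, PolicyTuple(mu, sigma), values, bootstrap)
    train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)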