TensorFlow implementation of an Advantage Actor-Critic loss
import numpy as np
import scipy.signal
import tensorflow as tf
import tensorify


@tensorify.tensorflow_op(tf.float32)
def discount(x, gamma=1.0, axis=0):
  y = scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=axis)[::-1]
  return y.astype(np.float32)
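# For the default axis=0, `discount` computes a reverse discounted cumulative
# sum: y[t] = x[t] + gamma * y[t + 1], with y[T] = x[T]. For example,
# discounting rewards [1., 1., 1.] with gamma = 0.5 yields [1.75, 1.5, 1.].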
def AdvantageActorCriticLoss(actions, rewards, policy, values, bootstrap,
                             value_weight=0.5, entropy_weight=1e-4,
                             gamma=0.9, epsilon=1e-10):
  """Computes the loss for the AAC algorithm.

  Adapted from https://github.com/awjuliani/DeepRL-Agents.

  Args:
    actions: The actual actions that the agent took.
    rewards: The reward obtained at each time step.
    policy: A differentiable (mu, sigma) tuple with the generated policy.
    values: A differentiable vector with the state-value estimates.
    bootstrap: The estimated future rewards after the final step.
    value_weight: Weight of the value loss (how closely the values should
      track target_values).
    entropy_weight: Entropy weight, to encourage exploration.
    gamma: The discount factor for the rewards and for GAE.
    epsilon: Numerical stabilization constant for the entropy logarithm.

  Returns:
    - A differentiable scalar with the total loss for the batch.
    - An op with the merged summaries that have been created.
  """
  with tf.name_scope('GeneralizedAdvantageEstimation'):
    # Compute the target values (or discounted rewards), bootstrapping with
    # the estimated future return (or value) of the last state.
    bootstrap = tf.expand_dims(bootstrap, 0)
    episode_length = tf.shape(values)[0]
    rewards_plus = tf.concat([rewards, bootstrap], axis=0)
    target_values = tf.stop_gradient(
        discount(rewards_plus, gamma)[:episode_length])
    # Compute the advantages using Generalized Advantage Estimation.
    values_plus = tf.concat([values[1:], bootstrap], axis=0)
    advantages = tf.stop_gradient(
        discount(rewards + gamma * values_plus - values, gamma))
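    # Since the same gamma discounts both the rewards and the residuals, this
    # is GAE with lambda = 1: advantages[t] = sum_k gamma**k * delta[t + k],
    # where delta[t] = rewards[t] + gamma * values[t + 1] - values[t].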
  # The differential entropy (valid for continuous policies) of a normal
  # distribution, after dropping constants, is the log of the determinant of
  # the covariance, which is the sum of the log of the variance of each
  # dimension for a diagonal covariance matrix.
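  # In formulas: H(N(mu, Sigma)) = 0.5 * log((2 * pi * e)**d * det(Sigma));
  # for diagonal Sigma, dropping the 0.5 and the constants leaves
  # sum_i log(sigma_i), with sigma_i the per-dimension variance.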
  with tf.name_scope('Entropy'):
    entropy = tf.reduce_sum(tf.log(policy.sigma + epsilon), axis=1)
    entropy_loss = -tf.reduce_sum(entropy)
  with tf.name_scope('PolicyLoss'):
    # Squared Mahalanobis distance between predicted and sampled actions.
    squared_difference = tf.squared_difference(actions, policy.mu)
    squared_distance = tf.reduce_sum(
        squared_difference / (policy.sigma + epsilon), axis=1)
    # Up to dropped constants, (entropy + squared_distance) is the negative
    # log density of the taken action, so weighting it by the advantages and
    # summing gives the policy-gradient loss to minimize.
    policy_loss = tf.reduce_sum((entropy + squared_distance) * advantages)
  with tf.name_scope('ValueLoss'):
    # Value loss: the critic tries to predict the discounted reward.
    value_loss = tf.reduce_sum(
        tf.squared_difference(target_values, values))
  with tf.name_scope('TotalLoss'):
    loss = (policy_loss
            + value_weight * value_loss
            + entropy_weight * entropy_loss)
  with tf.name_scope('Summaries'):
    float_length = tf.to_float(episode_length)
    loss_summaries = [
        tf.summary.scalar('Value', value_loss / float_length),
        tf.summary.scalar('Policy', policy_loss / float_length),
        tf.summary.scalar('Entropy', -entropy_loss / float_length),
        tf.summary.scalar('Total', loss / float_length)]
  return loss, tf.summary.merge(loss_summaries)
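

# A minimal usage sketch, not part of the original gist: the state size, the
# toy network, the Policy namedtuple, and the optimizer are illustrative
# assumptions about how a caller might feed the loss. Sigma is produced with a
# softplus so it stays positive, matching its use above as a per-dimension
# variance.
if __name__ == '__main__':
  import collections

  Policy = collections.namedtuple('Policy', ['mu', 'sigma'])
  action_dims = 2

  # One episode of T steps: states, the actions actually taken, the rewards,
  # and a scalar bootstrap value for the state after the last step.
  states = tf.placeholder(tf.float32, [None, 4])
  actions = tf.placeholder(tf.float32, [None, action_dims])
  rewards = tf.placeholder(tf.float32, [None])
  bootstrap = tf.placeholder(tf.float32, [])

  # A toy actor-critic network producing the policy and the value estimates.
  hidden = tf.layers.dense(states, 32, activation=tf.nn.relu)
  mu = tf.layers.dense(hidden, action_dims)
  sigma = tf.nn.softplus(tf.layers.dense(hidden, action_dims))
  values = tf.squeeze(tf.layers.dense(hidden, 1), axis=1)

  loss, summaries = AdvantageActorCriticLoss(
      actions, rewards, Policy(mu, sigma), values, bootstrap)
  train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)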