TensorFlow implementation of an Advantage Actor-Critic loss
import numpy as np
import scipy.signal
import tensorflow as tf
import tensorify


@tensorify.tensorflow_op(tf.float32)
def discount(x, gamma=1.0, axis=0):
  y = scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=axis)[::-1]
  return y.astype(np.float32)
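# For the default axis=0, `discount` computes a reverse discounted cumulative
# sum: y[t] = x[t] + gamma * y[t + 1], with y[T] = x[T]. For example,
# discounting rewards [1., 1., 1.] with gamma = 0.5 yields [1.75, 1.5, 1.].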
def AdvantageActorCriticLoss(actions, rewards, policy, values, bootstrap,
                             value_weight=0.5, entropy_weight=1e-4,
                             gamma=0.9, epsilon=1e-10):
  """Computes the loss for the AAC algorithm.

  Adapted from https://github.com/awjuliani/DeepRL-Agents.

  Args:
    actions: The actual actions that the agent took.
    rewards: The reward obtained at each time step.
    policy: A differentiable (mu, sigma) tuple with the generated policy.
    values: A differentiable vector with the state-value estimates.
    bootstrap: The estimated future rewards after the final step.
    value_weight: Weight of the value loss (how closely the values should
      track target_values).
    entropy_weight: Entropy weight, to encourage exploration.
    gamma: The discount factor for the rewards and for GAE.
    epsilon: Numerical stabilization constant for the entropy logarithm.

  Returns:
    - A differentiable scalar with the total loss for the batch.
    - An op with the merged summaries that have been created.
  """
  with tf.name_scope('GeneralizedAdvantageEstimation'):
    # Compute the target values (or discounted rewards), bootstrapping with
    # the estimated future return (or value) of the last state.
    bootstrap = tf.expand_dims(bootstrap, 0)
    episode_length = tf.shape(values)[0]
    rewards_plus = tf.concat([rewards, bootstrap], axis=0)
    target_values = tf.stop_gradient(
        discount(rewards_plus, gamma)[:episode_length])
    # Compute the advantages using Generalized Advantage Estimation.
    values_plus = tf.concat([values[1:], bootstrap], axis=0)
    advantages = tf.stop_gradient(
        discount(rewards + gamma * values_plus - values, gamma))
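    # Since the same gamma discounts both the rewards and the residuals, this
    # is GAE with lambda = 1: advantages[t] = sum_k gamma**k * delta[t + k],
    # where delta[t] = rewards[t] + gamma * values[t + 1] - values[t].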
  # The differential entropy (valid for continuous policies) of a normal
  # distribution, after dropping constants, is the log of the determinant of
  # the covariance, which is the sum of the log of the variance of each
  # dimension for a diagonal covariance matrix.
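  # In formulas: H(N(mu, Sigma)) = 0.5 * log((2 * pi * e)**d * det(Sigma));
  # for diagonal Sigma, dropping the 0.5 and the constants leaves
  # sum_i log(sigma_i), with sigma_i the per-dimension variance.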
  with tf.name_scope('Entropy'):
    entropy = tf.reduce_sum(tf.log(policy.sigma + epsilon), axis=1)
    entropy_loss = -tf.reduce_sum(entropy)
  with tf.name_scope('PolicyLoss'):
    # Squared Mahalanobis distance between predicted and sampled actions.
    squared_difference = tf.squared_difference(actions, policy.mu)
    squared_distance = tf.reduce_sum(
        squared_difference / (policy.sigma + epsilon), axis=1)
    # Up to dropped constants, (entropy + squared_distance) is the negative
    # log density of the taken action, so weighting it by the advantages and
    # summing gives the policy-gradient loss to minimize.
    policy_loss = tf.reduce_sum((entropy + squared_distance) * advantages)
  with tf.name_scope('ValueLoss'):
    # Value loss: the critic tries to predict the discounted reward.
    value_loss = tf.reduce_sum(
        tf.squared_difference(target_values, values))
  with tf.name_scope('TotalLoss'):
    loss = (policy_loss
            + value_weight * value_loss
            + entropy_weight * entropy_loss)
  with tf.name_scope('Summaries'):
    float_length = tf.to_float(episode_length)
    loss_summaries = [
        tf.summary.scalar('Value', value_loss / float_length),
        tf.summary.scalar('Policy', policy_loss / float_length),
        tf.summary.scalar('Entropy', -entropy_loss / float_length),
        tf.summary.scalar('Total', loss / float_length)]
  return loss, tf.summary.merge(loss_summaries)
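

# A minimal usage sketch, not part of the original gist: the state size, the
# toy network, the Policy namedtuple, and the optimizer are illustrative
# assumptions about how a caller might feed the loss. Sigma is produced with a
# softplus so it stays positive, matching its use above as a per-dimension
# variance.
if __name__ == '__main__':
  import collections

  Policy = collections.namedtuple('Policy', ['mu', 'sigma'])
  action_dims = 2

  # One episode of T steps: states, the actions actually taken, the rewards,
  # and a scalar bootstrap value for the state after the last step.
  states = tf.placeholder(tf.float32, [None, 4])
  actions = tf.placeholder(tf.float32, [None, action_dims])
  rewards = tf.placeholder(tf.float32, [None])
  bootstrap = tf.placeholder(tf.float32, [])

  # A toy actor-critic network producing the policy and the value estimates.
  hidden = tf.layers.dense(states, 32, activation=tf.nn.relu)
  mu = tf.layers.dense(hidden, action_dims)
  sigma = tf.nn.softplus(tf.layers.dense(hidden, action_dims))
  values = tf.squeeze(tf.layers.dense(hidden, 1), axis=1)

  loss, summaries = AdvantageActorCriticLoss(
      actions, rewards, Policy(mu, sigma), values, bootstrap)
  train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)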