Skip to content

Instantly share code, notes, and snippets.

@horoiwa
Created May 2, 2023 10:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save horoiwa/a2582747eb6450a66fb0bdfed0bf337d to your computer and use it in GitHub Desktop.
Save horoiwa/a2582747eb6450a66fb0bdfed0bf337d to your computer and use it in GitHub Desktop.
def update_policy(self, states, actions):
    """Advantage-weighted regression (AWR) update for the policy network.

    Forms per-sample weights exp((Q - V) * temperature), clipped at 100.0
    to keep them bounded, then takes one optimizer step minimizing the
    negative weighted log-likelihood of the observed actions.

    Args:
        states: Batch of states accepted by the value, Q, and policy nets.
        actions: Batch of actions taken in those states.

    Returns:
        The scalar policy loss for this batch.
    """
    # The advantage weights are built OUTSIDE the gradient tape, so no
    # gradient flows from the policy loss back into the critics.
    q_one, q_two = self.target_qnet(states, actions)
    q_min = tf.minimum(q_one, q_two)
    value = self.valuenet(states)
    # NOTE(review): temperature MULTIPLIES the advantage here (some AWR
    # variants divide by it) — this matches the original code as written.
    advantage_weight = tf.minimum(
        tf.exp((q_min - value) * self.temperature), 100.0)

    with tf.GradientTape() as tape:
        action_dists = self.policy(states)
        logp = tf.reshape(action_dists.log_prob(actions), (-1, 1))
        # Maximizing the weighted log-likelihood == minimizing its negation.
        loss = tf.reduce_mean(-1 * (advantage_weight * logp))

    policy_vars = self.policy.trainable_variables
    gradients = tape.gradient(loss, policy_vars)
    self.p_optimizer.apply_gradients(zip(gradients, policy_vars))
    return loss
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment