@icoxfog417
Created March 19, 2019 23:11
def update(self, states, actions, rewards):
    values = self.critic(states)
    # stop_gradient keeps the policy loss from backpropagating into the critic
    advantages = rewards - tf.stop_gradient(values)
    action_probs = self.actor(states)
    # Pick out the probability of each taken action with a one-hot mask
    one_hot = tf.one_hot(actions, depth=action_probs.shape[-1])
    selected_action_probs = tf.reduce_sum(action_probs * one_hot, axis=1)
    neg_logs = -tf.math.log(selected_action_probs)
    policy_loss = tf.reduce_mean(neg_logs * advantages)
    return policy_loss
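The same advantage-weighted policy-gradient loss can be sketched in plain NumPy to check the arithmetic (the probabilities, rewards, and value estimates below are made-up illustration values, and `policy_loss` is a hypothetical helper, not part of the gist's class):

```python
import numpy as np

def policy_loss(action_probs, actions, rewards, values):
    # Advantage: how much better the observed return is than the critic's
    # estimate. Treating values as constants mirrors tf.stop_gradient above.
    advantages = rewards - values
    # One-hot mask selects the probability assigned to each taken action
    one_hot = np.eye(action_probs.shape[1])[actions]
    selected = np.sum(action_probs * one_hot, axis=1)
    neg_logs = -np.log(selected)
    return np.mean(neg_logs * advantages)

probs = np.array([[0.2, 0.8],   # state 0: action 1 is likely
                  [0.5, 0.5]])  # state 1: uniform
loss = policy_loss(probs,
                   actions=np.array([1, 0]),
                   rewards=np.array([1.0, 0.0]),
                   values=np.array([0.5, 0.5]))
```

A positive advantage increases the weight on `-log pi(a|s)`, pushing the taken action's probability up; a negative advantage pushes it down.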