@eblancoh
Created June 13, 2018 09:45
Q_function.py: Gist for the "Deep Learning vs Atari: entrena tu IA para dominar videojuegos clásicos" article series.
import numpy as np


class ReplayMemory:
    ...

    def update_all_q_values(self):
        """
        Update all Q-values in the replay-memory.

        When states and Q-values are added to the replay-memory, the
        Q-values have been estimated by the Neural Network. But we now
        have more data available that we can use to improve the estimated
        Q-values, because we now know which actions were taken and the
        observed rewards. We sweep backwards through the entire replay-memory
        to use the observed data to improve the estimated Q-values.
        """
        # Copy old Q-values so we can print their statistics later.
        # Note that the contents of the arrays are copied.
        self.q_values_old[:] = self.q_values[:]

        # Process the replay-memory backwards and update the Q-values.
        # This loop could be implemented entirely in NumPy for higher speed,
        # but it is probably only a small fraction of the overall time usage,
        # and it is much easier to understand when implemented like this.
        for k in reversed(range(self.num_used - 1)):
            # Get the data for the k'th state in the replay-memory.
            action = self.actions[k]
            reward = self.rewards[k]
            end_life = self.end_life[k]
            end_episode = self.end_episode[k]

            # Calculate the Q-value for the action that was taken in this state.
            if end_life or end_episode:
                # If the agent lost a life or it was game over / end of episode,
                # then the value of taking the given action is just the reward
                # that was observed in this single step. This is because the
                # Q-value is defined as the discounted value of all future game
                # steps in a single life of the agent. When the life has ended,
                # there will be no future steps.
                action_value = reward
            else:
                # Otherwise the value of taking the action is the reward that
                # we have observed plus the discounted value of future rewards
                # from continuing the game. We use the estimated Q-values for
                # the following state and take the maximum, because we will
                # generally take the action that has the highest Q-value.
                action_value = reward + self.discount_factor * np.max(self.q_values[k + 1])

            # Error of the Q-value that was estimated using the Neural Network.
            self.estimation_errors[k] = abs(action_value - self.q_values[k, action])

            # Update the Q-value with the better estimate.
            self.q_values[k, action] = action_value
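
A minimal sketch of how the backward sweep behaves, assuming only the attribute names used by the method above. The hand-built arrays stand in for what the elided parts of ReplayMemory and the neural network would normally provide, so treat the setup as illustrative rather than as the gist's actual interface.

# Illustration: three non-terminal steps followed by a terminal one.
# The real __init__ of ReplayMemory is elided above, so we bypass it here
# and fill in the attributes by hand.
memory = ReplayMemory.__new__(ReplayMemory)
memory.discount_factor = 0.97
memory.num_used = 4
memory.actions = np.array([0, 1, 0, 1])
memory.rewards = np.array([0.0, 0.0, 1.0, 0.0])
memory.end_life = np.array([False, False, True, False])
memory.end_episode = np.array([False, False, True, False])
memory.q_values = np.zeros((4, 2))          # stand-in for the network's estimates
memory.q_values_old = np.zeros((4, 2))
memory.estimation_errors = np.zeros(4)

memory.update_all_q_values()
print(memory.q_values)
# Step 2 is terminal, so Q(s_2, a_2) = 1.0; the reward then propagates
# backwards with one discount factor per step:
# Q(s_1, a_1) = 0.97 and Q(s_0, a_0) = 0.97 ** 2 ≈ 0.9409.
# The last stored step (k = 3) is never updated, because the loop stops
# at num_used - 2 so that q_values[k + 1] always exists.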