Created
June 13, 2018 10:36
-
-
Save eblancoh/625f99a75bd8c851364899705fbadf41 to your computer and use it in GitHub Desktop.
EpsilonGreedy.py — Gist for the "Deep Learning vs Atari: entrena tu IA para dominar videojuegos clásicos" article series
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class EpsilonGreedy:
    """
    Epsilon-greedy exploration policy.

    With probability ``epsilon`` a uniformly random action is chosen;
    otherwise the action with the highest estimated Q-value is taken.
    An epsilon of 1.0 means purely random behaviour and 0.0 means purely
    greedy (argmax) behaviour. During training, epsilon is annealed
    linearly from a start value down to an end value; during testing a
    fixed, typically small value such as 0.05 or 0.01 is used instead.
    """
    def __init__(self, num_actions, epsilon_testing=0.05, num_iterations=1e6,
                 start_value=1.0, end_value=0.1, repeat=False):
        """
        :param num_actions:
            Number of possible actions in the game-environment.
        :param epsilon_testing:
            Fixed epsilon used when not training.
        :param num_iterations:
            Number of training iterations over which epsilon decays
            linearly from start_value to end_value.
        :param start_value:
            Initial epsilon for the linear decay.
        :param end_value:
            Final epsilon for the linear decay.
        :param repeat:
            If True, restart the linear decay from start_value every
            time the end_value is reached; if False, hold end_value
            forever after it is first reached.
        """
        self.num_actions = num_actions
        self.epsilon_testing = epsilon_testing
        # Control signal that anneals epsilon linearly during training.
        self.epsilon_linear = LinearControlSignal(num_iterations=num_iterations,
                                                  start_value=start_value,
                                                  end_value=end_value,
                                                  repeat=repeat)
    def get_epsilon(self, iteration, training):
        """
        Return epsilon for the given iteration: linearly decayed while
        training, otherwise the fixed testing value.
        """
        return (self.epsilon_linear.get_value(iteration=iteration)
                if training else self.epsilon_testing)
    def get_action(self, q_values, iteration, training):
        """
        Select an action using the epsilon-greedy policy.

        :param q_values:
            Q-values estimated by the Neural Network for the current
            state of the game-environment.
        :param iteration:
            Iteration counter; here it is the number of states that
            have been processed in the game-environment.
        :param training:
            Whether the Reinforcement Learning agent is being trained
            or tested.
        :return:
            Tuple ``(action, epsilon)`` — the chosen action (integer)
            and the epsilon used (float).
        """
        epsilon = self.get_epsilon(iteration=iteration, training=training)
        # Explore with probability epsilon, otherwise act greedily.
        explore = np.random.random() < epsilon
        if explore:
            # Uniformly random action.
            action = np.random.randint(low=0, high=self.num_actions)
        else:
            # Greedy action: highest estimated Q-value.
            action = np.argmax(q_values)
        return action, epsilon
class LinearControlSignal:
    """
    A scalar control signal that varies linearly with an iteration
    counter.

    Used to schedule values such as the optimizer's learning-rate for
    the Neural Network, as well as other parameters. TensorFlow offers
    similar scheduling, but it is tied to the global_step counter inside
    the TensorFlow graph, while these control signals should follow a
    state-counter for the game-environment — so a plain Python
    implementation is simpler.
    """
    def __init__(self, start_value, end_value, num_iterations, repeat=False):
        """
        Create a new object.

        :param start_value:
            Value of the signal at iteration 0.
        :param end_value:
            Value of the signal once num_iterations is reached.
        :param num_iterations:
            Number of iterations it takes for the signal to move
            linearly from start_value to end_value.
        :param repeat:
            If True, wrap the iteration counter so the linear ramp
            restarts from start_value after each num_iterations period.
        """
        self.start_value = start_value
        self.end_value = end_value
        self.num_iterations = num_iterations
        self.repeat = repeat
        # Slope of the ramp: change in signal value per iteration.
        self._slope = (end_value - start_value) / num_iterations
    def get_value(self, iteration):
        """Get the value of the control signal for the given iteration."""
        if self.repeat:
            # Wrap around so the ramp restarts each period.
            iteration %= self.num_iterations
        if iteration >= self.num_iterations:
            # Past the ramp: clamp to the final value.
            return self.end_value
        return iteration * self._slope + self.start_value
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment