@eblancoh
Created June 13, 2018 10:36
EpsilonGreedy.py: Gist for the "Deep Learning vs Atari: entrena tu IA para dominar videojuegos clásicos" article series
import numpy as np


class EpsilonGreedy:
    """
    The epsilon-greedy policy either takes a random action with
    probability epsilon, or it takes the action with the highest
    Q-value.

    If epsilon is 1.0 then the actions are always random.
    If epsilon is 0.0 then the actions are always the argmax of the Q-values.

    Epsilon is typically decreased linearly from 1.0 to 0.1 during training,
    and this decrease is also implemented in this class.

    During testing, epsilon is usually chosen lower, e.g. 0.05 or 0.01.
    """

    def __init__(self, num_actions, epsilon_testing=0.05, num_iterations=1e6,
                 start_value=1.0, end_value=0.1, repeat=False):
"""
:param num_actions:
Number of possible actions in the game-environment.
:param epsilon_testing:
Epsilon-value when testing.
:param num_iterations:
Number of training iterations required to linearly
decrease epsilon from start_value to end_value.
:param start_value:
Starting value for linearly decreasing epsilon.
:param end_value:
Ending value for linearly decreasing epsilon.
:param repeat:
Boolean whether to repeat and restart the linear decrease
when the end_value is reached, or only do it once and then
output the end_value forever after.
"""
# Store parameters.
self.num_actions = num_actions
self.epsilon_testing = epsilon_testing
# Create a control signal for linearly decreasing epsilon.
self.epsilon_linear = LinearControlSignal(num_iterations=num_iterations,
start_value=start_value,
end_value=end_value,
repeat=repeat)

    def get_epsilon(self, iteration, training):
        """
        Return the epsilon for the given iteration.
        If training==True then epsilon is linearly decreased,
        otherwise epsilon is a fixed number.
        """
        if training:
            epsilon = self.epsilon_linear.get_value(iteration=iteration)
        else:
            epsilon = self.epsilon_testing

        return epsilon

    def get_action(self, q_values, iteration, training):
        """
        Use the epsilon-greedy policy to select an action.

        :param q_values:
            These are the Q-values that are estimated by the Neural Network
            for the current state of the game-environment.
        :param iteration:
            This is an iteration counter. Here we use the number of states
            that have been processed in the game-environment.
        :param training:
            Boolean whether we are training or testing the
            Reinforcement Learning agent.
        :return:
            action (integer), epsilon (float)
        """
        epsilon = self.get_epsilon(iteration=iteration, training=training)

        # With probability epsilon.
        if np.random.random() < epsilon:
            # Select a random action.
            action = np.random.randint(low=0, high=self.num_actions)
        else:
            # Otherwise select the action that has the highest Q-value.
            action = np.argmax(q_values)

        return action, epsilon


class LinearControlSignal:
    """
    A control signal that changes linearly over time.

    This is used to change e.g. the learning-rate for the optimizer
    of the Neural Network, as well as other parameters.

    TensorFlow has functionality for doing this, but it uses the
    global_step counter inside the TensorFlow graph, while we
    want the control signals to use a state-counter for the
    game-environment. So it is easier to make this in Python.
    """

    def __init__(self, start_value, end_value, num_iterations, repeat=False):
        """
        Create a new object.

        :param start_value:
            Start-value for the control signal.
        :param end_value:
            End-value for the control signal.
        :param num_iterations:
            Number of iterations it takes to reach the end_value
            from the start_value.
        :param repeat:
            Boolean whether to reset the control signal back to the start_value
            after the end_value has been reached.
        """
        # Store arguments in this object.
        self.start_value = start_value
        self.end_value = end_value
        self.num_iterations = num_iterations
        self.repeat = repeat

        # Calculate the linear coefficient.
        self._coefficient = (end_value - start_value) / num_iterations

    def get_value(self, iteration):
        """Get the value of the control signal for the given iteration."""
        if self.repeat:
            iteration %= self.num_iterations

        if iteration < self.num_iterations:
            value = iteration * self._coefficient + self.start_value
        else:
            value = self.end_value

        return value
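

# A quick sketch of the linear interpolation performed by LinearControlSignal.
# The values below are illustrative only; they are not from the article.
signal = LinearControlSignal(start_value=1.0, end_value=0.1, num_iterations=1e6)
print(signal.get_value(iteration=0))    # 1.0   (start_value)
print(signal.get_value(iteration=5e5))  # ~0.55 (halfway between 1.0 and 0.1)
print(signal.get_value(iteration=2e6))  # 0.1   (clamped at end_value)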
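
# A quick usage sketch of EpsilonGreedy. The Q-values below are made-up
# numbers standing in for a network's estimates of the current state.
policy = EpsilonGreedy(num_actions=4, num_iterations=1e6)
q_values_example = np.array([0.1, 0.9, 0.3, 0.2])

# Early in training epsilon is still close to 1.0, so the action is mostly random.
action, epsilon = policy.get_action(q_values=q_values_example,
                                    iteration=1000, training=True)
print(action, epsilon)

# During testing epsilon is fixed at epsilon_testing (0.05 by default), so with
# probability 1 - epsilon the argmax action (index 1 here) is selected.
action, epsilon = policy.get_action(q_values=q_values_example,
                                    iteration=0, training=False)
print(action, epsilon)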