import os
import sys
import signal
import pickle

import numpy as np

from keras import backend as K
from keras.models import Model, Sequential, load_model
from keras.layers import Input, Permute, Convolution2D, Activation, Flatten, Dense, Multiply, Layer
from keras.optimizers import Adam
from keras.callbacks import TensorBoard

from rl.policy import Policy, LinearAnnealedPolicy
from rl.memory import SequentialMemory
from rl.callbacks import CallbackList, TrainIntervalLogger, TrainEpisodeLogger
from rl.util import huber_loss, clone_model, get_soft_target_model_updates, AdditionalUpdatesOptimizer

# Project-local imports (Berkeley Pacman framework); module names assumed from usage below.
from qlearningAgents import PacmanQAgent
from pacman import GameState
from game import Actions

def identity_loss(y_true, y_pred):
    return y_pred

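# The network's real training loss is computed inside the graph (ClippedLoss -> Multiply -> SumAcross
# below), so this Keras-level loss simply passes that per-sample value through to the optimizer.
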
class TensorBoardWrap(TensorBoard):
    def __init__(self, val_data, **args):
        TensorBoard.__init__(self, **args)
        self.validation_data = val_data

    def on_epoch_end(self, epoch, logs=None):
        TensorBoard.on_epoch_end(self, epoch, logs)

class SumAcross(Layer):
    def __init__(self, axis, **kwargs):
        super(SumAcross, self).__init__(**kwargs)
        self.axis = axis

    def call(self, inputs):
        return K.sum(inputs, axis=self.axis)

    def get_config(self):
        config = {'axis': self.axis}
        base_config = super(SumAcross, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_output_shape(self, input_shape):
        return tuple(list(input_shape[:-1]) + [1])

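# SumAcross sums the given axis away; in the loss graph below it collapses the per-action masked
# errors into a single loss value per sample, which is what identity_loss then receives as y_pred.
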
class ClippedLoss(Layer):
    def __init__(self, delta_clip, **kwargs):
        super(ClippedLoss, self).__init__(**kwargs)
        self.delta_clip = delta_clip

    def call(self, inputs):
        return huber_loss(inputs[0], inputs[1], self.delta_clip)

    def get_config(self):
        config = {'delta_clip': self.delta_clip}
        base_config = super(ClippedLoss, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_output_shape(self, input_shape):
        return input_shape[0]

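# huber_loss is quadratic for errors smaller than delta_clip and linear beyond it; with the
# delta_clip=np.inf this agent uses, it reduces to a plain squared-error term, roughly
#   L(y_true, y_pred) = 0.5 * (y_true - y_pred) ** 2
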
def mean_q(y_true, y_pred):
    return K.mean(K.max(y_pred, axis=-1))

class EpsGreedyRestrictedPolicy(Policy):
    """Implement the epsilon-greedy policy.

    The eps-greedy policy either:
    - takes a random action with probability epsilon
    - takes the current best action with probability (1 - epsilon)
    and only ever selects allowed (legal) actions.
    """
    def __init__(self, eps=.1):
        super(EpsGreedyRestrictedPolicy, self).__init__()
        self.eps = eps

    def select_action(self, q_values, legal_actions):
        """Return the selected action.

        # Arguments
            q_values (np.ndarray): List of the estimations of Q for each action
            legal_actions: Indices of the actions that are currently allowed

        # Returns
            Selected action
        """
        assert q_values.ndim == 1
        nb_actions = q_values.shape[0]
        if np.random.uniform() < self.eps:
            return np.random.choice(legal_actions)
            # return np.random.random_integers(0, nb_actions-1)
        else:
            sorted_indices_decreasing = np.argsort(q_values)[::-1]
            for a in sorted_indices_decreasing:
                if a in legal_actions:
                    return a
            return None

    def get_config(self):
        """Return configurations of EpsGreedyRestrictedPolicy.

        # Returns
            Dict of config
        """
        config = super(EpsGreedyRestrictedPolicy, self).get_config()
        config['eps'] = self.eps
        return config

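# Illustrative use (toy numbers, not part of the agent's control flow):
#   policy = EpsGreedyRestrictedPolicy(eps=0.1)
#   policy.select_action(q_values=np.array([0.1, 0.7, 0.3, 0.0, 0.2]), legal_actions=[0, 2, 4])
# With probability 0.9 this returns 2 (the highest-Q action among the legal ones),
# otherwise a uniformly random legal action.
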
class GreedyRestrictedPolicy(Policy):
    """Implement the greedy policy.

    The greedy policy returns the best currently allowed action according to q_values.
    """
    def select_action(self, q_values, legal_actions):
        """Return the selected action.

        # Arguments
            q_values (np.ndarray): List of the estimations of Q for each action
            legal_actions: Indices of the actions that are currently allowed

        # Returns
            Selected action
        """
        assert q_values.ndim == 1
        sorted_indices_decreasing = np.argsort(q_values)[::-1]
        for a in sorted_indices_decreasing:
            if a in legal_actions:
                return a
        return None

class PACQNAgent(PacmanQAgent):
    """
    ApproximateQLearningAgent

    You should only have to overwrite getQValue
    and update. All other QLearningAgent functions
    should work as is.
    """
    def generate_filenames(self):
        self.model_weights = self.filename_generator("model_weights", "h5")
        self.trainable_model_weights = self.filename_generator("trainable_model_weights", "h5")
        self.target_model_weights = self.filename_generator("target_model_weights", "h5")
        self.model_file = self.filename_generator("model", "h5")
        self.trainable_model_file = self.filename_generator("trainable_model", "h5")
        self.target_model_file = self.filename_generator("target_model", "h5")
        self.parameters_file = self.filename_generator("params", "pkl")
        self.memory_file = self.filename_generator("memory", "pkl")
        self.version_file = self.path + "v.pkl"
        PacmanQAgent.generate_filenames(self)

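    # filename_generator (provided by the Pacman framework) appears to return a callable mapping a
    # checkpoint version number to a path, since the attributes above are later invoked as e.g.
    # self.model_file(self.sub_version). This is inferred from usage, not from the framework's source.
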
    def __init__(self, decay, ghosts, **args):
        # Next, we build our model. We use the same model that was described by Mnih et al. (2015).
        # input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
        PacmanQAgent.__init__(self, **args)
        self.nb_max_start_steps = 15
        WINDOW_LENGTH = 2
        CHANNELS = 4
        sys.setrecursionlimit(10000)
        signal.signal(signal.SIGINT, self.cleanup)
        self.input_shape = (WINDOW_LENGTH*CHANNELS,) + (args['layout'].width, args['layout'].height)
        self.model = Sequential()
        self.nb_actions = 5
        self.step = 0
        self.nb_steps_warmup = 50
        self.train_interval = 1
        self.batch_size = 16
        self.memory_interval = 1
        self.metrics_names = []
        self.enable_double_dqn = False
        self.gamma = 0.95
        self.target_model = None
        self.trainable_model = None
        self.training = self.numTraining > 0
        self.test_policy = GreedyRestrictedPolicy()
        self.recent_action = None
        self.recent_observation = None
        self.target_model_update = 1
        self.processor = None
        self.accumulated_reward = 0
        self.nb_of_times_to_repeat_action = 1
        self.action_to_repeat = None
        self.custom_model_objects = {}
        self.delta_clip = np.inf
        self.nb_random_start_steps = 0
        self.startEpsilon = args['startEpsilon']
        self.endEpsilon = args['endEpsilon']
        self.stepsThisEpisode = 0
        self.nb_episodes_between_backups = 2500
        self.log_interval = 10000
        self.epoch_metrics = []
        self.epoch_rewards = []
        self.global_epoch = 0
        # Default checkpoint version; overwritten below if a version file from a previous run exists.
        # (Assumed default -- the parent class may already provide one.)
        self.sub_version = 0
        self.start_food = np.sum(args['layout'].food.data)
        ### SETUP STUFF WITH NO DIRECT RELEVANCE ########################################
        self.generate_filenames()
        log_dir = self.path + "logs/"

        start_state = GameState()
        start_state.initialize(args['layout'], ghosts)
        start_state = np.reshape(np.array(2*[start_state.data.asMultipleArrays()]), (1,) + self.input_shape)
        print(np.shape(start_state))

        val_data = [start_state, np.zeros((1, self.nb_actions)), np.ones((1, self.nb_actions)), np.zeros((1, 1)), np.zeros((1, 1)), np.ones((1,)), np.ones((1,))]
        print(val_data[0].shape[0])

        tb = TensorBoardWrap(val_data=val_data, log_dir=log_dir, write_graph=True, write_grads=True, histogram_freq=100)
        self.callbacks = []
        self.callbacks += [tb]
        self.callbacks += [TrainIntervalLogger(interval=self.log_interval)]
        self.callbacks += [TrainEpisodeLogger()]
        params = {
            'nb_steps': self.numTraining,
        }
        self.callbacks = CallbackList(callbacks=self.callbacks)
        if hasattr(self.callbacks, 'set_params'):
            self.callbacks.set_params(params)
        else:
            self.callbacks._set_params(params)
        self.ipt = Input(shape=self.input_shape)
        self.permute = Permute((2, 3, 1), input_shape=self.input_shape)(self.ipt)
        self.c1 = Convolution2D(32, (3, 3), strides=(1, 1))(self.permute)
        self.a1 = Activation('relu')(self.c1)
        self.c2 = Convolution2D(64, (3, 3), strides=(1, 1))(self.a1)
        self.a2 = Activation('relu')(self.c2)
        self.c3 = Convolution2D(64, (3, 3), strides=(1, 1))(self.a2)
        self.a3 = Activation('relu')(self.c3)
        self.flat = Flatten()(self.a3)
        self.dense = Dense(self.nb_actions)(self.flat)
        self.out = Activation('linear')(self.dense)
        self.model = Model(inputs=self.ipt, outputs=self.out)
        print(self.model.summary())
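        # Shape flow of the online network (channels-first input is permuted to channels-last):
        #   (window*channels, width, height) -> Permute -> (width, height, window*channels)
        #   -> 3x [Conv2D 3x3 + ReLU] -> Flatten -> Dense(nb_actions) -> one linear Q value per action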
        # Find latest version
        if os.path.isfile(self.version_file):
            with open(self.version_file, "rb") as ipt:
                self.sub_version = pickle.load(ipt)

        # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
        # even the metrics!
        if os.path.isfile(self.memory_file(self.sub_version)):
            with open(self.memory_file(self.sub_version), "rb") as input:
                self.memory = pickle.load(input)
                print("Loaded previous memory successfully")
        else:
            self.memory = SequentialMemory(limit=300000, window_length=WINDOW_LENGTH)
            print("Creating memory from scratch")

        if os.path.isfile(self.parameters_file(self.sub_version)):
            with open(self.parameters_file(self.sub_version), "rb") as input:
                self.step = pickle.load(input)
                self.sub_version = pickle.load(input)
                self.global_epoch = pickle.load(input)
                print("Loaded step value, sub version and global epoch.")
        # processor = AtariProcessor()

        # Select a policy. We use eps-greedy action selection, which means that a random action is selected
        # with probability eps. We anneal eps from startEpsilon down to endEpsilon over the course of `decay`
        # steps. This is done so that the agent initially explores the environment (high eps) and then
        # gradually sticks to what it knows (low eps). We also set a dedicated eps value that is used during
        # testing. Note that we set it to 0.05 so that the agent still performs some random actions.
        # This ensures that the agent cannot get stuck.
        # This is actually the EpsGreedyRestrictedPolicy (see top of this gist).
        self.policy = LinearAnnealedPolicy(EpsGreedyRestrictedPolicy(), attr='eps', value_max=self.startEpsilon, value_min=self.endEpsilon, value_test=.05,
                                           nb_steps=decay)
        self.policy._set_agent(self)
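        # LinearAnnealedPolicy interpolates eps linearly with the agent's step counter, roughly:
        #   eps(step) = max(value_min, value_max - (value_max - value_min) * step / nb_steps)
        # so with e.g. value_max=1.0, value_min=0.1, nb_steps=decay, eps reaches 0.1 after `decay`
        # steps and stays there.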
        if os.path.isfile(self.model_file(self.sub_version)) and \
                os.path.isfile(self.target_model_file(self.sub_version)) and \
                os.path.isfile(self.trainable_model_file(self.sub_version)):
            custom_objects = {'ClippedLoss': ClippedLoss, 'identity_loss': identity_loss, 'SumAcross': SumAcross, 'mean_q': mean_q}
            self.model = load_model(self.model_file(self.sub_version), custom_objects=custom_objects)
            self.trainable_model = load_model(self.trainable_model_file(self.sub_version), custom_objects=custom_objects)
            self.target_model = load_model(self.target_model_file(self.sub_version), custom_objects=custom_objects)
        else:
            if os.path.isfile(self.model_weights(self.sub_version)):
                print("Loading model weights...")
                self.model.load_weights(self.model_weights(self.sub_version))
            self.compile(Adam(lr=.00025, clipnorm=1., clipvalue=0.5), metrics=['mae'])

        self.model.name = "MainModel"
        self.trainable_model.name = "TrainableModel"
        self.target_model.name = "TargetModel"

        print(self.model.get_weights())
        self.callbacks.set_model(self.trainable_model)
        print(self.model.summary())
        self.train_begin()

    def load_weights(self, filepath):
        self.model.load_weights(filepath)
        self.update_target_model_hard()
        self.update_trainable_model_hard()

    def update_target_model_hard(self):
        self.target_model.set_weights(self.model.get_weights())

    def update_trainable_model_hard(self):
        self.trainable_model.set_weights(self.model.get_weights())

    def compileTrainableModel(self, optimizer, metrics=[]):
        y_pred = self.model.output
        y_true = Input(name='y_true', shape=(self.nb_actions,))
        mask = Input(name='mask', shape=(self.nb_actions,))
        clipped = ClippedLoss(delta_clip=self.delta_clip)([y_true, y_pred])
        clipped.trainable = False
        masked = Multiply()([clipped, mask])
        masked.trainable = False
        loss = SumAcross(axis=-1, name='loss')(masked)
        loss.trainable = False
        ins = [self.model.input] if type(self.model.input) is not list else self.model.input
        trainable_model = Model(inputs=ins + [y_true, mask], outputs=[loss, y_pred])
        combined_metrics = {trainable_model.output_names[1]: metrics}
        # tf.summary.scalar('trainable_model.loss', trainable_model.output[0])
        trainable_model.compile(optimizer=optimizer, loss=identity_loss, metrics=combined_metrics)
        # tf.summary.merge_all()
        if os.path.isfile(self.trainable_model_weights(self.sub_version)):
            trainable_model.load_weights(self.trainable_model_weights(self.sub_version))
        return trainable_model

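    # The trainable model wraps the online network: its inputs are [state, y_true, mask] and its
    # outputs are [loss, y_pred]. The one-hot mask restricts the Huber/TD error to the action that
    # was actually taken, SumAcross reduces it to one value per sample, and identity_loss hands that
    # value straight to the optimizer. The second output (y_pred) is mainly there so Keras can report
    # metrics such as mean_q on it.
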
    def compile(self, optimizer, metrics=[]):
        metrics += [mean_q]  # register default metrics

        # We never train the target model, hence we can set the optimizer and loss arbitrarily.
        self.target_model = clone_model(self.model, self.custom_model_objects)
        if os.path.isfile(self.target_model_weights(self.sub_version)):
            self.target_model.load_weights(self.target_model_weights(self.sub_version))
        self.target_model.compile(optimizer='sgd', loss='mse')
        self.model.compile(optimizer='sgd', loss='mse')

        # Compile model.
        if self.target_model_update < 1.:
            # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
            updates = get_soft_target_model_updates(self.target_model, self.model, self.target_model_update)
            optimizer = AdditionalUpdatesOptimizer(optimizer, updates)

        # Create trainable model. The problem is that we need to mask the output since we only ever
        # want to update the Q value for the action that was actually taken. We achieve this with the
        # custom layers above (ClippedLoss, Multiply, SumAcross), which compute the masked loss inside
        # the graph and let us pass targets and masks in alongside the state.
        self.trainable_model = self.compileTrainableModel(optimizer, metrics)

        self.compiled = True

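    # Worked example of the masking trick (toy numbers): with nb_actions = 5, action taken = 2 and
    # target R = 1.5, the batch entry gets
    #   y_true = [0, 0, 1.5, 0, 0]   and   mask = [0, 0, 1, 0, 0]
    # so the per-action error is multiplied by the mask and only the Q value of action 2 contributes
    # to the gradient; the other four outputs are left untouched by this sample.
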
    def update_target_model_hard(self):
        self.target_model.set_weights(self.model.get_weights())

    def startTesting(self):
        self.training = False

    def getWeights(self):
        return self.weights

    def getQValue(self, state, action):
        """
        Should return Q(state,action) = w * featureVector
        where * is the dot product operator
        """
        if not self.init:
            print(self.featExtractor.getFeatures(state, action))
            self.init += 1
        sum = 0
        features = self.featExtractor.getFeatures(state, action)
        for _, val in enumerate(self.featExtractor.getFeatures(state, action)):
            sum += self.weights[val] * features[val]
        return sum

    def update(self, state, action, nextState, reward):
        """
        Should update your weights based on transition
        """
        "*** YOUR CODE HERE ***"
        PacmanQAgent.update(self, state, action, nextState, reward)
        self.accumulated_reward += reward
        # if self.step % self.nb_of_times_to_repeat_action == 0 and self.training:
        self.backward(reward, nextState is None, state)
        self.increment_step()

    def increment_step(self):
        self.step += 1
        self.stepsThisEpisode += 1

    def process_observation(self, observation):
        return observation.data.asMultipleArrays()

    def process_state_batch(self, batch):
        batch = np.array(batch)
        return batch

    def compute_batch_q_values(self, state_batch):
        batch = self.process_state_batch(state_batch)
        # print(batch)
        q_values = self.model.predict_on_batch(batch)
        assert q_values.shape == (len(state_batch), self.nb_actions)
        return q_values

    def compute_q_values(self, state):
        q_values = self.compute_batch_q_values([state]).flatten()
        assert q_values.shape == (self.nb_actions,)
        return q_values

    def forward(self, state):
        # Select an action.
        observation = self.process_observation(state)
        recent_state = self.memory.get_recent_state(observation)
        recent_state = np.reshape(recent_state, self.input_shape)
        q_values = self.compute_q_values(recent_state)
        if self.training:
            legalActions = self.getLegalActions(state)
            action = self.policy.select_action(q_values=q_values, legal_actions=Actions.actionsAsIndices(legalActions))
        else:
            action = self.test_policy.select_action(q_values=q_values, legal_actions=Actions.actionsAsIndices(self.getLegalActions(state)))
        # print("Saved weights debug data to 'during' file")

        # Book-keeping.
        self.recent_observation = observation
        self.recent_action = action
        return action

    def backward(self, reward, terminal, state=None):
        # Store most recent experience in memory.
        if self.step % self.memory_interval == 0:
            self.memory.append(self.recent_observation, self.recent_action, reward, terminal,
                               training=self.training)

        metrics = [np.nan for _ in self.metrics_names]
        if not self.training:
            # We're done here. No need to update the experience memory since we only use the working
            # memory to obtain the state over the most recent observations.
            return metrics

        # Train the network on a single stochastic batch.
        try:
            if self.step > self.nb_steps_warmup and self.step % self.train_interval == 0:
                experiences = self.memory.sample(self.batch_size)
                assert len(experiences) == self.batch_size

                # Start by extracting the necessary parameters (we use a vectorized implementation).
                state0_batch = []
                reward_batch = []
                action_batch = []
                terminal1_batch = []
                state1_batch = []
                for e in experiences:
                    state0_batch.append(e.state0)
                    state1_batch.append(e.state1)
                    reward_batch.append(e.reward)
                    action_batch.append(e.action)
                    terminal1_batch.append(0. if e.terminal1 else 1.)

                # Prepare and validate parameters.
                state0_batch = self.process_state_batch(state0_batch)
                state1_batch = self.process_state_batch(state1_batch)
                terminal1_batch = np.array(terminal1_batch)
                reward_batch = np.array(reward_batch)
                assert reward_batch.shape == (self.batch_size,)
                assert terminal1_batch.shape == reward_batch.shape
                assert len(action_batch) == len(reward_batch)
                state0_batch = np.reshape(state0_batch, (self.batch_size,) + self.input_shape)
                state1_batch = np.reshape(state1_batch, (self.batch_size,) + self.input_shape)

                # Compute Q values for mini-batch update.
                if self.enable_double_dqn:
                    # According to the paper "Deep Reinforcement Learning with Double Q-learning"
                    # (van Hasselt et al., 2015), in Double DQN, the online network predicts the actions
                    # while the target network is used to estimate the Q value.
                    q_values = self.model.predict_on_batch(state1_batch)
                    assert q_values.shape == (self.batch_size, self.nb_actions)
                    actions = np.argmax(q_values, axis=1)
                    assert actions.shape == (self.batch_size,)
                    # Now, estimate Q values using the target network but select the values with the
                    # highest Q value w.r.t. the online model (as computed above).
                    target_q_values = self.target_model.predict_on_batch(state1_batch)
                    assert target_q_values.shape == (self.batch_size, self.nb_actions)
                    q_batch = target_q_values[range(self.batch_size), actions]
                else:
                    # Compute the q_values given state1, and extract the maximum for each sample in the batch.
                    # We perform this prediction on the target_model instead of the model for reasons
                    # outlined in Mnih (2015). In short: it makes the algorithm more stable.
                    target_q_values = self.target_model.predict_on_batch(state1_batch)
                    assert target_q_values.shape == (self.batch_size, self.nb_actions)
                    q_batch = np.max(target_q_values, axis=1).flatten()
                assert q_batch.shape == (self.batch_size,)

                targets = np.zeros((self.batch_size, self.nb_actions))
                dummy_targets = np.zeros((self.batch_size,))
                masks = np.zeros((self.batch_size, self.nb_actions))

                # Compute r_t + gamma * max_a Q(s_t+1, a) and update the targets accordingly,
                # but only for the affected output units (as given by action_batch).
                discounted_reward_batch = self.gamma * q_batch
                # Set discounted reward to zero for all states that were terminal.
                discounted_reward_batch *= terminal1_batch
                assert discounted_reward_batch.shape == reward_batch.shape
                Rs = reward_batch + discounted_reward_batch
                for idx, (target, mask, R, action) in enumerate(zip(targets, masks, Rs, action_batch)):
                    target[action] = R  # update action with estimated accumulated reward
                    dummy_targets[idx] = R
                    mask[action] = 1.  # enable loss for this specific action
                targets = np.array(targets).astype('float32')
                masks = np.array(masks).astype('float32')

                # Finally, perform a single update on the entire batch. We use a dummy target since
                # the actual loss is computed in a custom layer that needs more complex input. However,
                # it is still useful to know the actual target to compute metrics properly.
                ins = [state0_batch] if type(self.model.input) is not list else state0_batch
                # self.callbacks.on_train_batch_begin(batch=ins)
                metrics = self.trainable_model.train_on_batch(ins + [targets, masks], [dummy_targets, targets])
                self.callbacks.on_train_batch_end(batch=ins, logs=metrics)
                metrics = [metric for idx, metric in enumerate(metrics) if idx not in (1, 2)]  # throw away individual losses
                metrics += self.policy.metrics
                d = dict(zip(self.trainable_model.metrics_names, metrics))
                self.epoch_metrics += [d]
                if self.processor is not None:
                    metrics += self.processor.metrics

            if self.target_model_update >= 1 and self.step % self.target_model_update == 0:
                self.update_target_model_hard()
        except AssertionError:
            pass

        return metrics

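    # In summary, each sampled transition (s, a, r, s', terminal) contributes one regression target
    #   y = r + gamma * max_a' Q_target(s', a')        (the gamma term is dropped when s' is terminal)
    # and the trainable model is fit so that Q(s, a) moves toward y for the taken action only.
    # With enable_double_dqn the max is replaced by Q_target(s', argmax_a' Q_online(s', a')).
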
    def train_begin(self):
        self.callbacks.on_train_begin()

    def episode_begin(self):
        self.callbacks.on_epoch_begin(self.global_epoch)

    def episode_end(self):
        logs = {}
        if len(self.epoch_metrics) > 0:
            logs = {key: np.mean([entry[key] for entry in self.epoch_metrics]) for key, value in self.epoch_metrics[0].items()}
        more_logs = {"av_reward": np.mean(self.epoch_rewards), "eps": self.policy.get_current_value()}
        self.epoch_rewards = []
        self.callbacks.on_epoch_end(self.global_epoch, logs=dict(logs.items() + more_logs.items()))
        self.global_epoch += 1

    def train_end(self):
        self.callbacks.on_train_end()

    def getAction(self, state):
        if self.stepsThisEpisode == 0:
            self.episode_begin()

        if self.nb_random_start_steps == 0:
            self.nb_random_start_steps = np.random.randint(self.nb_max_start_steps)

        if self.stepsThisEpisode < self.nb_random_start_steps:
            action = np.random.choice(self.getLegalActions(state))
            self.recent_observation = self.process_observation(state)
            self.recent_action = Actions.actionsAsIndices([action])[0]
        else:
            action = self.forward(state)
            action = Actions._possibleActions[action]

        self.doAction(state, action)
        return action

    def saveEverything(self):
        # Save the model
        self.model.save(self.model_file(self.sub_version))
        self.trainable_model.save(self.trainable_model_file(self.sub_version))
        self.target_model.save(self.target_model_file(self.sub_version))

        with open(self.memory_file(self.sub_version), "wb") as output:
            pickle.dump(self.memory, output, pickle.HIGHEST_PROTOCOL)
        with open(self.parameters_file(self.sub_version), "wb") as output:
            pickle.dump(self.step, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.sub_version, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.global_epoch, output, pickle.HIGHEST_PROTOCOL)
        with open(self.version_file, "wb") as opt:
            pickle.dump(self.sub_version, opt, pickle.HIGHEST_PROTOCOL)

        # After saving the newest state, we delete the older state to save some space...
        if self.sub_version > 0:
            try:
                os.remove(self.model_file(self.sub_version - 1))
                os.remove(self.trainable_model_file(self.sub_version - 1))
                os.remove(self.target_model_file(self.sub_version - 1))
                os.remove(self.memory_file(self.sub_version - 1))
                os.remove(self.parameters_file(self.sub_version - 1))
            except OSError:
                print("Previous version was already deleted.")

        print("Saved Model, dumped memory and parameters to pickle file, version {}.".format(self.sub_version))
        self.sub_version += 1

    def cleanup(self, sig, frame):
        if self.training:
            self.training = False
            self.saveEverything()
        sys.exit(0)

    def final(self, state):
        "Called at the end of each game."
        # call the super-class final method
        PacmanQAgent.final(self, state)

        self.nb_random_start_steps = 0
        self.stepsThisEpisode = 0
        self.epoch_rewards += [self.accumulated_reward]
        self.accumulated_reward = 0
        self.episode_end()

        if self.episodesSoFar % self.nb_episodes_between_backups == 0:
            self.saveEverything()
        # print("Episode: {}".format(self.episodesSoFar))

        # did we finish training?
        if self.episodesSoFar == self.numTraining:
            self.training = False
            self.train_end()
            self.saveEverything()