DQN
# Imports reconstructed from the calls used below; the Pacman-framework pieces
# (ReinforcementAgent, Actions, DuelLayer, NoisyDenseLayer, TensorBoardWrap) are
# assumed to come from the surrounding project and are not shown in this gist.
import os
import sys
import signal
import pickle
from collections import deque

import numpy as np
import tensorflow as tf
import keras as K
from keras.models import Model, load_model
from keras.layers import Input, Permute, Convolution2D, Activation, Flatten, Dense
from keras.initializers import Zeros, VarianceScaling
from keras.optimizers import Adam
from keras.callbacks import CallbackList

def asArray(state):
    """Encode the board as a 2D array of per-cell codes.

    Attribute access below assumes the Berkeley Pacman GameStateData interface
    (layout.walls, food, capsules, agentStates); adjust if your framework differs.
    """
    width, height = state.layout.width, state.layout.height
    array = np.zeros((width, height))
    for x in range(width):
        for y in range(height):
            if state.food[x][y]:
                array[x][y] = 0.2
            if state.layout.walls[x][y]:
                array[x][y] = 0.1
    # Agents and capsules overwrite the base codes, preserving the original priority:
    # pacman 0.5, ghost 0.7, scared ghost 0.9, capsule 0.3.
    for agent in state.agentStates:
        x, y = [int(round(c)) for c in agent.getPosition()]
        if agent.isPacman:
            array[x][y] = 0.5
        elif agent.scaredTimer > 0:
            array[x][y] = 0.9
        else:
            array[x][y] = 0.7
    for x, y in state.capsules:
        array[x][y] = 0.3
    return array

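# Illustrative example (not from the original gist): on a 3x3 layout with walls on
# the border and Pacman in the centre, asArray would produce
#   [[0.1, 0.1, 0.1],
#    [0.1, 0.5, 0.1],
#    [0.1, 0.1, 0.1]]
# i.e. one float code per cell. process_state() below returns this grid; the agent
# stacks window_length such grids before feeding them to the CNN.
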
class NewDQNAgent(ReinforcementAgent):

    def loss(self, y_true, y_pred):
        return self.huber_loss(y_true, y_pred)

    def huber_loss(self, y_true, y_pred):
        '''
        Huber loss, following the standard definition:
            L(e) = 1/2 e^2            if |e| <= d
            L(e) = d (|e| - 1/2 d)    otherwise
        '''
        error = tf.math.subtract(y_true, y_pred)
        abs_error = tf.math.abs(error)
        quadratic = tf.math.minimum(abs_error, self.huber_delta)
        linear = tf.math.subtract(abs_error, quadratic)
        losses = tf.math.add(
            tf.math.multiply(
                tf.constant(0.5, dtype=quadratic.dtype),
                tf.math.multiply(quadratic, quadratic)),
            tf.math.multiply(
                self.huber_delta,
                linear))
        return losses

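    # Sanity check of the decomposition above (illustrative, assuming delta d = 1):
    #   |e| = 0.5 -> quadratic = 0.5, linear = 0.0 -> loss = 0.5 * 0.25      = 0.125
    #   |e| = 3.0 -> quadratic = 1.0, linear = 2.0 -> loss = 0.5 * 1 + 1 * 2 = 2.5
    # which matches the piecewise form: 0.5 * 0.5^2 = 0.125 and 1 * (3 - 0.5) = 2.5.
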
    def process_state(self, state):
        observation = state.data.asArray()
        return observation

    def generate_filenames(self):
        self.model_file = self.filename_generator("model", "h5")
        self.parameters_file = self.filename_generator("params", "pkl")
        self.memory_file = self.filename_generator("memory", "pkl")
        self.version_file = self.path + "v.pkl"

    def filename_generator(self, filename, format):
        return lambda v: self.path + filename + "_{}.{}".format(v, format)

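    # Usage example (illustrative): with self.path = ".../mediumGrid/testfile/",
    # self.model_file(3) evaluates to ".../mediumGrid/testfile/model_3.h5" and
    # self.memory_file(3) to ".../mediumGrid/testfile/memory_3.pkl"; the integer
    # argument is the sub_version that saveEverything() increments after each backup.
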
    def setup_filesystem(self, remote, layoutName, saveFile):
        folder = "data" if remote else "localdata"
        self.path = "/home/skusku/" + folder + "/machinelearning/save_states/" + layoutName + "/" + saveFile + "/"
        self.log_dir = self.path + "logs/"
        self.generate_filenames()
        if not os.path.exists(self.path):
            os.makedirs(self.path)

    def sample_replay_memory(self, batch_size):
        # We do not allow the last entry to be sampled, because it could be a terminal
        # state which has not been correctly labelled yet. Terminal states are only
        # labelled as terminal once 'final(state)' is called. This explains the '-2'
        # instead of '-1'.
        idxs = np.random.random_integers(0, len(self.replay_memory) - 2, batch_size)
        return [self.replay_memory[i] for i in idxs]

    def get_validation_set(self):
        memories = self.sample_replay_memory(self.batch_size)
        observations_batch, nextObservations_batch, \
            actions_batch, reward_batch, nonterminal_batch = self.get_batches_from_memories(memories)
        q_values = self.generate_targets(observations_batch,
                                         nextObservations_batch,
                                         actions_batch,
                                         reward_batch,
                                         nonterminal_batch)
        return [np.array(observations_batch), np.array(q_values)]

    def get_epsilon(self):
        start = self.start_epsilon
        end = self.middle_epsilon
        offset = 0
        if self.step > self.decay:
            start = self.middle_epsilon
            end = self.end_epsilon
            offset = self.decay
        decayed = start - (start - end) / self.decay * (self.step - offset)
        return decayed if decayed >= self.end_epsilon else self.end_epsilon

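    # Illustrative schedule with the defaults set in __init__ (start 1.0, middle 0.1,
    # end 0.002, decay = 1e6 steps): epsilon anneals linearly from 1.0 to 0.1 over the
    # first decay steps, then from 0.1 towards 0.002 over the next decay steps, and is
    # clamped at 0.002 afterwards. E.g. at step = 0.5 * decay, epsilon = 0.55.
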
    def increment_step(self):
        self.step += 1
        self.game_step += 1

    def try_loading_previous_version(self):
        # Find latest version
        if os.path.isfile(self.version_file):
            with open(self.version_file, "rb") as ipt:
                self.sub_version = pickle.load(ipt)
        if self.isInTraining() and os.path.isfile(self.memory_file(self.sub_version)):
            with open(self.memory_file(self.sub_version), "rb") as ipt:
                self.replay_memory = pickle.load(ipt)
                print("Loaded previous memory successfully")
        if os.path.isfile(self.parameters_file(self.sub_version)):
            with open(self.parameters_file(self.sub_version), "rb") as ipt:
                self.step = pickle.load(ipt)
                self.sub_version = pickle.load(ipt)
                self.epoch = pickle.load(ipt)
                print("Restarting in subversion {} from step {}, epoch {}".format(self.sub_version, self.step, self.epoch))
        if os.path.isfile(self.model_file(self.sub_version)):
            custom_objects = {"huber_loss": self.huber_loss, "DuelLayer": DuelLayer, "loss": self.loss, "NoisyDenseLayer": NoisyDenseLayer}
            self.model = load_model(self.model_file(self.sub_version), custom_objects=custom_objects)

    def generateCNN(self, input_shape, name, learning_rate):
        ipt = Input(shape=input_shape)
        permute = Permute((2, 3, 1))(ipt)
        c1 = Convolution2D(32, (3, 3), strides=(1, 1), bias_initializer=Zeros(), kernel_initializer=VarianceScaling(scale=2))(permute)
        a1 = Activation('relu')(c1)
        c2 = Convolution2D(64, (3, 3), strides=(1, 1), bias_initializer=Zeros(), kernel_initializer=VarianceScaling(scale=2))(a1)
        a2 = Activation('relu')(c2)
        c3 = Convolution2D(64, (3, 3), strides=(1, 1), bias_initializer=Zeros(), kernel_initializer=VarianceScaling(scale=2))(a2)
        a3 = Activation('relu')(c3)
        flat = Flatten()(a3)
        dense = Dense(self.nb_actions, bias_initializer=Zeros(), kernel_initializer=VarianceScaling(scale=2))(flat)
        out = Activation('linear')(dense)
        model = Model(inputs=ipt, outputs=out, name=name)
        model.compile(loss=self.loss, optimizer=Adam(lr=learning_rate))
        return model

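    # Shape walk-through (illustrative, assuming input_shape = (2, 20, 11), i.e.
    # window_length = 2 on a 20x11 layout): Permute moves the frame axis to the
    # channel dimension -> (20, 11, 2); three 'valid' 3x3 convolutions shrink the
    # grid to (18, 9, 32), (16, 7, 64), (14, 5, 64); Flatten gives 4480 features,
    # and the final Dense layer outputs one linear Q-value per action (nb_actions = 5).
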
    def __init__(self,
                 numGames,
                 learning_rate=0.00025,
                 layout=None,
                 remote=0,
                 layoutName="mediumGrid",
                 saveFile="testfile",
                 decay=1000000,
                 replay_memory=None,
                 input_shape=None,
                 window_length=2,
                 N_steps_lookahead=2,
                 minsteps=0,
                 **args):
        ReinforcementAgent.__init__(self, **args)
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.4
        K.backend.set_session(tf.Session(config=config))
        self.minsteps = minsteps
        signal.signal(signal.SIGINT, self.cleanup)
        self.numGames = numGames
        self.model = None
        self.replay_memory = None
        self.nb_steps_between_target_updates = 10000
        self.nb_episodes_between_backups = 5000
        self.setup_filesystem(remote, layoutName, saveFile)
        self.sub_version = 0
        self.huber_delta = tf.constant(1., dtype="float32")
        self.window_length = window_length
        self.input_shape = input_shape or (self.window_length, layout.width, layout.height)
        self.ipt_width = self.input_shape[1]
        self.ipt_height = self.input_shape[2]
        self.layout_width = layout.width
        self.layout_height = layout.height
        self.batch_size = 32
        self.learning_rate = learning_rate
        self.gamma = .95
        self.N_steps_lookahead = N_steps_lookahead
        self.memory_size = 300000
        self.nb_actions = 5
        self.nb_warmup_steps = 1000
        self.nb_max_rnd_start_steps = 5
        self.nb_rnd_start_steps = 0
        self.last_observations = deque(maxlen=self.N_steps_lookahead + self.window_length - 1)
        self.last_actions = deque(maxlen=self.N_steps_lookahead)
        self.last_rewards = deque(maxlen=self.N_steps_lookahead)
        self.decay = decay
        self.step = 0
        self.game_step = 0
        self.epoch = 0
        self.final_score = 0
        self.last_loss = 0
        self.start_epsilon = 1.0
        self.middle_epsilon = 0.1
        self.end_epsilon = 0.002
        self.saving = False
        self.try_loading_previous_version()
        if self.model is None:
            # The model is the online model
            print("Couldn't load model")
            self.model = self.generateCNN(self.input_shape, "online_model", self.learning_rate)
        # Create a fresh copy for the target model, which is used for generating the targets
        self.target_model = self.generateCNN(self.input_shape, "target_model", self.learning_rate)
        self.target_model.set_weights(self.model.get_weights())
        if self.replay_memory is None:
            self.replay_memory = replay_memory
        if self.replay_memory is None:
            self.replay_memory = deque(maxlen=self.memory_size)
        tb = TensorBoardWrap(generator=self.get_validation_set, log_dir=self.log_dir, write_graph=True, write_grads=True, histogram_freq=100, batch_size=self.batch_size)
        # tb = TensorBoard(log_dir=self.log_dir, write_graph=True)
        self.callbacks = [tb]
        self.callbacks = CallbackList(callbacks=self.callbacks)
        self.callbacks.set_model(self.model)

    def getGreedyAction(self, state):
        q_values = self.model.predict_on_batch(np.reshape(list(self.last_observations)[-self.window_length:], (1, ) + self.input_shape))[0]
        # q_values = self.model.predict_on_batch(np.reshape(list(self.last_observations), (1, ) + self.input_shape))[0]
        sorted_indices_decreasing = np.argsort(q_values)[::-1]
        for idx in sorted_indices_decreasing:
            if idx in Actions.actionsAsIndices(self.getLegalActions(state)):
                return Actions._directionsAsList[idx][0]

    def getAction(self, state):
        # Basically the forward pass.
        action = None
        observation = self.process_state(state)
        self.last_observations.append(observation)
        if self.isInTraining():
            # Take the epsilon greedy action
            eps = self.get_epsilon()
        else:
            # Take the greedy action
            eps = 0
        rnd = np.random.uniform(0, 1)
        if self.game_step < self.nb_rnd_start_steps or rnd < eps:
            action = np.random.choice(self.getLegalActions(state))
        else:
            action = self.getGreedyAction(state)
        self.increment_step()
        if self.step % self.nb_steps_between_target_updates == 0:
            self.target_model.set_weights(self.model.get_weights())
        self.doAction(state, action)
        return action

    def generate_return(self, rewards, q_values_next, nonterminal):
        steps = len(rewards)
        return np.sum([rewards[i] * self.gamma ** i for i in range(steps)]) + self.gamma ** steps * q_values_next * nonterminal

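    # Worked example (illustrative): with gamma = 0.95, rewards = [10, -1],
    # q_values_next = 5 and nonterminal = 1, the N-step return is
    #   10 + 0.95 * (-1) + 0.95**2 * 5 = 13.5625;
    # for a terminal transition (nonterminal = 0) the bootstrap term is dropped.
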
    def generate_targets(self, observations_batch, nextObservations_batch, actions_batch, reward_batch, nonterminal_batch):
        # First we predict on batch to get the actual q_values,
        q_values = self.model.predict_on_batch(np.array(observations_batch))
        # Q(s, a) = r + gamma * max_a' Q'(s', a')
        # Update the online model with the values generated in the target model
        q_values_next = self.target_model.predict_on_batch(np.array(nextObservations_batch))
        q_values_next = np.amax(q_values_next, axis=1)
        # Then we update the q_values for the action we took
        for q, action, rewards, qvn, nonterminal in zip(q_values,
                                                        actions_batch,
                                                        reward_batch,
                                                        q_values_next,
                                                        nonterminal_batch):
            q[action] = self.generate_return(rewards, qvn, nonterminal)
        return q_values

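    # Note on the resulting targets (illustrative): only the entry for the action that
    # was actually taken is overwritten with the N-step return; all other entries keep
    # the online model's own predictions, so they contribute (approximately) nothing to
    # the training loss. E.g. if q = [1.0, 2.0, 0.5, 0.0, 0.3], action = 1 and the
    # return is 13.5625, the target row becomes [1.0, 13.5625, 0.5, 0.0, 0.3].
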
    def get_batches_from_memories(self, memories):
        observations_batch = []
        actions_batch = []
        nextObservations_batch = []
        reward_batch = []
        nonterminal_batch = []
        for memory in memories:
            observations_batch.append(memory['observations'])
            actions_batch.append(memory['actions'])
            nextObservations_batch.append(memory['nextObservations'])
            reward_batch.append(memory['rewards'])
            nonterminal_batch.append(memory['nonterminal'])
        return (observations_batch, nextObservations_batch, actions_batch, reward_batch, nonterminal_batch)

    def remember_state(self, observation, action, nextObservation, rewards):
        self.replay_memory.append(
            {
                "observations": np.reshape(observation, self.input_shape),
                "actions": Actions.actionsAsIndices([action])[0],
                "nextObservations": np.reshape(nextObservation, self.input_shape),
                "rewards": rewards,
                "nonterminal": 1
            })

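    # Illustrative shape of one replay-memory entry, assuming window_length = 2 and a
    # 20x11 layout: 'observations' and 'nextObservations' are (2, 20, 11) arrays of
    # stacked frames, 'actions' is a single action index, 'rewards' is the list of up
    # to N_steps_lookahead rewards consumed by generate_return, and 'nonterminal' is
    # flipped to 0 in final() once the episode actually ends.
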
    def train(self, observation_batch, targets_batch):
        self.last_loss = self.model.train_on_batch(x=observation_batch, y=targets_batch)

    def update(self, state, action, nextState, reward):
        # Normalize the rewards
        # reward = reward / 500
        self.last_actions.append(action)
        self.last_rewards.append(reward)
        # Let's jump out here when we have not seen enough states yet to fill our window
        if self.isInTesting() or len(self.last_observations) < self.window_length:
            return
        nextObservation = self.process_state(nextState)
        actionIndex = max(self.window_length - 1 - max(self.game_step - self.N_steps_lookahead, 0), 0)
        self.remember_state(list(self.last_observations)[:self.window_length],
                            self.last_actions[actionIndex],
                            list(self.last_observations)[-self.window_length + 1:] + [nextObservation],
                            list(self.last_rewards)[actionIndex:])
        # Let's jump out here when we don't have enough samples in our replay memory yet.
        if self.step < self.nb_warmup_steps:
            return
        self.latest_memories = self.sample_replay_memory(self.batch_size)
        observations_batch, nextObservations_batch, \
            actions_batch, reward_batch, nonterminal_batch = self.get_batches_from_memories(self.latest_memories)
        q_values = self.generate_targets(observations_batch,
                                         nextObservations_batch,
                                         actions_batch,
                                         reward_batch,
                                         nonterminal_batch)
        # And train on this batch.
        self.train(np.array(observations_batch), np.array(q_values))

    def saveEverything(self):
        # Save the model
        self.model.save(self.model_file(self.sub_version))
        # Pickle files must be opened in binary mode.
        with open(self.memory_file(self.sub_version), "wb") as output:
            pickle.dump(self.replay_memory, output, pickle.HIGHEST_PROTOCOL)
        with open(self.parameters_file(self.sub_version), "wb") as output:
            pickle.dump(self.step, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.sub_version, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.epoch, output, pickle.HIGHEST_PROTOCOL)
        with open(self.version_file, "wb") as opt:
            pickle.dump(self.sub_version, opt, pickle.HIGHEST_PROTOCOL)
        # After saving the newest state, we delete the older state to save some space...
        if self.sub_version > 0:
            try:
                os.remove(self.model_file(self.sub_version - 1))
                os.remove(self.memory_file(self.sub_version - 1))
                os.remove(self.parameters_file(self.sub_version - 1))
            except OSError:
                print("Previous version was already deleted.")
        print("Saved Model, dumped memory and parameters to pickle file, version {}.".format(self.sub_version))
        self.sub_version += 1

    def cleanup(self, sig=None, frame=None):
        if not self.saving:
            print("Press Ctrl+C again to skip saving.")
            self.saving = True
            self.saveEverything()
            sys.exit(2)
        else:
            sys.exit(2)

    def startEpisode(self):
        ReinforcementAgent.startEpisode(self)
        # Take at least window_length - 1 random steps, so that our window is filled when we try to predict on it.
        self.nb_rnd_start_steps = np.random.random_integers(self.window_length, self.nb_max_rnd_start_steps) if self.isInTraining() else self.window_length - 1
        self.game_step = 0
        if self.isInTraining():
            self.callbacks.on_epoch_begin(self.step)
        self.last_observations.clear()

    def stopEpisode(self):
        ReinforcementAgent.stopEpisode(self)
        logs = {"reward": self.final_score, "epsilon": self.get_epsilon(), "loss": self.last_loss}
        if self.isInTraining() and not self.step < self.nb_warmup_steps:
            self.callbacks.on_epoch_end(self.step, logs=logs)
            if self.episodesSoFar % self.nb_episodes_between_backups == 0:
                self.saveEverything()
        self.epoch += 1

    def final(self, state):
        ReinforcementAgent.final(self, state)
        if self.isInTraining():
            self.replay_memory[-1]['nonterminal'] = 0
            if self.episodesSoFar == self.numTraining:
                self.saveEverything()
        if self.episodesSoFar == self.numGames:
            if self.minsteps < self.step:
                sys.exit(100)
        self.final_score = state.getScore()