DQN
def asArray(state):
    # Encode the board as a 2D array of floats, one code per cell type.
    # The has_* predicates are placeholders for however the game state
    # exposes food, walls, agents and capsules.
    array = zeros((state.width, state.length))  # e.g. np.zeros
    for x in range(state.width):
        for y in range(state.length):
            if has_food(state, x, y):
                array[x][y] = 0.2
            if has_wall(state, x, y):
                array[x][y] = 0.1
            if has_pacman(state, x, y):
                array[x][y] = 0.5
            if has_ghost(state, x, y):
                array[x][y] = 0.7
            if has_scared_ghost(state, x, y):
                array[x][y] = 0.9
            if has_capsule(state, x, y):
                array[x][y] = 0.3
    return array
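
For reference, a minimal runnable sketch of the same encoding on a toy grid; the coordinates and the direct numpy usage below are illustrative assumptions, not part of the original gist.

import numpy as np

# Hypothetical toy position: a 4x3 board with a couple of walls, food,
# Pacman and one ghost, encoded with the codes defined above.
width, length = 4, 3
walls  = [(0, 0), (3, 2)]
food   = [(1, 1), (2, 2)]
pacman = (1, 0)
ghosts = [(2, 1)]

grid = np.zeros((width, length))
for x, y in food:
    grid[x][y] = 0.2
for x, y in walls:
    grid[x][y] = 0.1
grid[pacman[0]][pacman[1]] = 0.5
for x, y in ghosts:
    grid[x][y] = 0.7

print(grid)  # this 2D array is the observation the DQN trains on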
# Imports assumed from the original file (not shown in this gist excerpt);
# ReinforcementAgent, Actions, DuelLayer, NoisyDenseLayer and TensorBoardWrap
# come from the author's Pacman project.
import os, sys, signal, pickle
from collections import deque
import numpy as np
import tensorflow as tf
import keras as K
from keras.models import Model, load_model
from keras.layers import Input, Permute, Convolution2D, Activation, Flatten, Dense
from keras.initializers import Zeros, VarianceScaling
from keras.optimizers import Adam
from keras.callbacks import CallbackList


class NewDQNAgent(ReinforcementAgent):

    def loss(self, y_true, y_pred):
        return self.huber_loss(y_true, y_pred)
    def huber_loss(self, y_true, y_pred):
        '''
        Huber loss (see Wikipedia):
        L(e) = 1/2 e^2            if |e| <= d
        L(e) = d * (|e| - 1/2 d)  otherwise
        '''
        error = tf.math.subtract(y_true, y_pred)
        abs_error = tf.math.abs(error)
        # Branchless form: cap the quadratic part at delta, the rest is linear.
        quadratic = tf.math.minimum(abs_error, self.huber_delta)
        linear = tf.math.subtract(abs_error, quadratic)
        losses = tf.math.add(
            tf.math.multiply(
                tf.constant(0.5, dtype=quadratic.dtype),
                tf.math.multiply(quadratic, quadratic)),
            tf.math.multiply(self.huber_delta, linear))
        return losses
    def process_state(self, state):
        observation = state.data.asArray()
        return observation

    def generate_filenames(self):
        self.model_file = self.filename_generator("model", "h5")
        self.parameters_file = self.filename_generator("params", "pkl")
        self.memory_file = self.filename_generator("memory", "pkl")
        self.version_file = self.path + "v.pkl"

    def filename_generator(self, filename, format):
        return lambda v: self.path + filename + "_{}.{}".format(v, format)
    def setup_filesystem(self, remote, layoutName, saveFile):
        folder = "data" if remote else "localdata"
        self.path = "/home/skusku/" + folder + "/machinelearning/save_states/" + layoutName + "/" + saveFile + "/"
        self.log_dir = self.path + "logs/"
        self.generate_filenames()
        if not os.path.exists(self.path):
            os.makedirs(self.path)
    def sample_replay_memory(self, batch_size):
        # We do not allow the last entry to be sampled, because it could be a terminal state
        # that has not been labelled yet. Terminal states are only labelled as terminal once
        # 'final(state)' is called. This explains the '-2' instead of '-1'.
        idxs = np.random.random_integers(0, len(self.replay_memory) - 2, batch_size)
        return [self.replay_memory[i] for i in idxs]
    def get_validation_set(self):
        memories = self.sample_replay_memory(self.batch_size)
        observations_batch, nextObservations_batch, \
            actions_batch, reward_batch, nonterminal_batch = self.get_batches_from_memories(memories)
        q_values = self.generate_targets(observations_batch,
                                         nextObservations_batch,
                                         actions_batch,
                                         reward_batch,
                                         nonterminal_batch)
        return [np.array(observations_batch), np.array(q_values)]
    def get_epsilon(self):
        # Two-phase linear decay: start -> middle over the first 'decay' steps,
        # then middle -> end over the next 'decay' steps, floored at end_epsilon.
        start = self.start_epsilon
        end = self.middle_epsilon
        offset = 0
        if self.step > self.decay:
            start = self.middle_epsilon
            end = self.end_epsilon
            offset = self.decay
        decayed = start - (start - end) / self.decay * (self.step - offset)
        return decayed if decayed >= self.end_epsilon else self.end_epsilon
    def increment_step(self):
        self.step += 1
        self.game_step += 1
    def try_loading_previous_version(self):
        # Find latest version
        if os.path.isfile(self.version_file):
            with open(self.version_file, "rb") as ipt:
                self.sub_version = pickle.load(ipt)
        if self.isInTraining() and os.path.isfile(self.memory_file(self.sub_version)):
            with open(self.memory_file(self.sub_version), "rb") as ipt:
                self.replay_memory = pickle.load(ipt)
                print("Loaded previous memory successfully")
        if os.path.isfile(self.parameters_file(self.sub_version)):
            with open(self.parameters_file(self.sub_version), "rb") as ipt:
                self.step = pickle.load(ipt)
                self.sub_version = pickle.load(ipt)
                self.epoch = pickle.load(ipt)
                print("Restarting in subversion {} from step {}, epoch {}".format(self.sub_version, self.step, self.epoch))
        if os.path.isfile(self.model_file(self.sub_version)):
            custom_objects = {"huber_loss": self.huber_loss, "DuelLayer": DuelLayer, "loss": self.loss, "NoisyDenseLayer": NoisyDenseLayer}
            self.model = load_model(self.model_file(self.sub_version), custom_objects=custom_objects)
    def generateCNN(self, input_shape, name, learning_rate):
        ipt = Input(shape=input_shape)
        permute = Permute((2, 3, 1))(ipt)
        c1 = Convolution2D(32, (3, 3), strides=(1, 1), bias_initializer=Zeros(), kernel_initializer=VarianceScaling(scale=2))(permute)
        a1 = Activation('relu')(c1)
        c2 = Convolution2D(64, (3, 3), strides=(1, 1), bias_initializer=Zeros(), kernel_initializer=VarianceScaling(scale=2))(a1)
        a2 = Activation('relu')(c2)
        c3 = Convolution2D(64, (3, 3), strides=(1, 1), bias_initializer=Zeros(), kernel_initializer=VarianceScaling(scale=2))(a2)
        a3 = Activation('relu')(c3)
        flat = Flatten()(a3)
        dense = Dense(self.nb_actions, bias_initializer=Zeros(), kernel_initializer=VarianceScaling(scale=2))(flat)
        out = Activation('linear')(dense)
        model = Model(inputs=ipt, outputs=out, name=name)
        model.compile(loss=self.loss, optimizer=Adam(lr=learning_rate))
        return model
    def __init__(self,
                 numGames,
                 learning_rate=0.00025,
                 layout=None,
                 remote=0,
                 layoutName="mediumGrid",
                 saveFile="testfile",
                 decay=1000000,
                 replay_memory=None,
                 input_shape=None,
                 window_length=2,
                 N_steps_lookahead=2,
                 minsteps=0,
                 **args):
        ReinforcementAgent.__init__(self, **args)

        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.4
        K.backend.set_session(tf.Session(config=config))

        self.minsteps = minsteps
        signal.signal(signal.SIGINT, self.cleanup)
        self.numGames = numGames
        self.model = None
        self.replay_memory = None
        self.nb_steps_between_target_updates = 10000
        self.nb_episodes_between_backups = 5000
        self.setup_filesystem(remote, layoutName, saveFile)
        self.sub_version = 0
        self.huber_delta = tf.constant(1., dtype="float32")
        self.window_length = window_length
        self.input_shape = input_shape or (self.window_length, layout.width, layout.height)
        self.ipt_width = self.input_shape[1]
        self.ipt_height = self.input_shape[2]
        self.layout_width = layout.width
        self.layout_height = layout.height
        self.batch_size = 32
        self.learning_rate = learning_rate
        self.gamma = .95
        self.N_steps_lookahead = N_steps_lookahead
        self.memory_size = 300000
        self.nb_actions = 5
        self.nb_warmup_steps = 1000
        self.nb_max_rnd_start_steps = 5
        self.nb_rnd_start_steps = 0
        self.last_observations = deque(maxlen=self.N_steps_lookahead + self.window_length - 1)
        self.last_actions = deque(maxlen=self.N_steps_lookahead)
        self.last_rewards = deque(maxlen=self.N_steps_lookahead)
        self.decay = decay
        self.step = 0
        self.game_step = 0
        self.epoch = 0
        self.final_score = 0
        self.last_loss = 0
        self.start_epsilon = 1.0
        self.middle_epsilon = 0.1
        self.end_epsilon = 0.002
        self.saving = False

        self.try_loading_previous_version()
        if self.model is None:
            # The model is the online model
            print("Couldn't load model")
            self.model = self.generateCNN(self.input_shape, "online_model", self.learning_rate)
        # Create a fresh copy for the target model, which is used for generating the targets
        self.target_model = self.generateCNN(self.input_shape, "target_model", self.learning_rate)
        self.target_model.set_weights(self.model.get_weights())

        if self.replay_memory is None:
            self.replay_memory = replay_memory
        if self.replay_memory is None:
            self.replay_memory = deque(maxlen=self.memory_size)

        tb = TensorBoardWrap(generator=self.get_validation_set, log_dir=self.log_dir, write_graph=True, write_grads=True, histogram_freq=100, batch_size=self.batch_size)
        # tb = TensorBoard(log_dir=self.log_dir, write_graph=True)
        self.callbacks = [tb]
        self.callbacks = CallbackList(callbacks=self.callbacks)
        self.callbacks.set_model(self.model)
    def getGreedyAction(self, state):
        q_values = self.model.predict_on_batch(np.reshape(list(self.last_observations)[-self.window_length:], (1,) + self.input_shape))[0]
        # q_values = self.model.predict_on_batch(np.reshape(list(self.last_observations), (1,) + self.input_shape))[0]
        sorted_indices_decreasing = np.argsort(q_values)[::-1]
        # Return the legal action with the highest Q-value.
        for idx in sorted_indices_decreasing:
            if idx in Actions.actionsAsIndices(self.getLegalActions(state)):
                return Actions._directionsAsList[idx][0]
    def getAction(self, state):
        # Basically the forward pass.
        action = None
        observation = self.process_state(state)
        self.last_observations.append(observation)

        if self.isInTraining():
            # Take the epsilon greedy action
            eps = self.get_epsilon()
        else:
            # Take the greedy action
            eps = 0

        rnd = np.random.uniform(0, 1)
        if self.game_step < self.nb_rnd_start_steps or rnd < eps:
            action = np.random.choice(self.getLegalActions(state))
        else:
            action = self.getGreedyAction(state)

        self.increment_step()
        if self.step % self.nb_steps_between_target_updates == 0:
            self.target_model.set_weights(self.model.get_weights())

        self.doAction(state, action)
        return action
    def generate_return(self, rewards, q_values_next, nonterminal):
        # N-step return: sum_i gamma^i * r_i  +  gamma^N * max_a Q_target(s', a),
        # where the bootstrap term is dropped for terminal states (nonterminal = 0).
        steps = len(rewards)
        return np.sum([rewards[i] * self.gamma ** i for i in range(steps)]) + self.gamma ** steps * q_values_next * nonterminal
    def generate_targets(self, observations_batch, nextObservations_batch, actions_batch, reward_batch, nonterminal_batch):
        # First we predict on the batch to get the current q_values,
        q_values = self.model.predict_on_batch(np.array(observations_batch))
        # then we bootstrap from the target model: Q(s, a) <- n-step return + gamma^N * max_a' Q_target(s', a').
        q_values_next = self.target_model.predict_on_batch(np.array(nextObservations_batch))
        q_values_next = np.amax(q_values_next, axis=1)
        # Finally we overwrite the q_value of the action we actually took.
        for q, action, rewards, qvn, nonterminal in zip(q_values,
                                                        actions_batch,
                                                        reward_batch,
                                                        q_values_next,
                                                        nonterminal_batch):
            q[action] = self.generate_return(rewards, qvn, nonterminal)
        return q_values
    def get_batches_from_memories(self, memories):
        observations_batch = []
        actions_batch = []
        nextObservations_batch = []
        reward_batch = []
        nonterminal_batch = []
        for memory in memories:
            observations_batch.append(memory['observations'])
            actions_batch.append(memory['actions'])
            nextObservations_batch.append(memory['nextObservations'])
            reward_batch.append(memory['rewards'])
            nonterminal_batch.append(memory['nonterminal'])
        return (observations_batch, nextObservations_batch, actions_batch, reward_batch, nonterminal_batch)
    def remember_state(self, observation, action, nextObservation, rewards):
        self.replay_memory.append(
            {
                "observations": np.reshape(observation, self.input_shape),
                "actions": Actions.actionsAsIndices([action])[0],
                "nextObservations": np.reshape(nextObservation, self.input_shape),
                "rewards": rewards,
                "nonterminal": 1
            })

    def train(self, observation_batch, targets_batch):
        self.last_loss = self.model.train_on_batch(x=observation_batch, y=targets_batch)
    def update(self, state, action, nextState, reward):
        # Normalize the rewards
        # reward = reward / 500
        self.last_actions.append(action)
        self.last_rewards.append(reward)

        # Let's jump out here when we have not seen enough states yet to fill our window
        if self.isInTesting() or len(self.last_observations) < self.window_length:
            return

        nextObservation = self.process_state(nextState)
        actionIndex = max(self.window_length - 1 - max(self.game_step - self.N_steps_lookahead, 0), 0)
        self.remember_state(list(self.last_observations)[:self.window_length],
                            self.last_actions[actionIndex],
                            list(self.last_observations)[-self.window_length + 1:] + [nextObservation],
                            list(self.last_rewards)[actionIndex:])

        # Let's jump out here when we don't have enough samples in our replay memory yet.
        if self.step < self.nb_warmup_steps:
            return

        self.latest_memories = self.sample_replay_memory(self.batch_size)
        observations_batch, nextObservations_batch, \
            actions_batch, reward_batch, nonterminal_batch = self.get_batches_from_memories(self.latest_memories)
        q_values = self.generate_targets(observations_batch,
                                         nextObservations_batch,
                                         actions_batch,
                                         reward_batch,
                                         nonterminal_batch)
        # And train on this batch.
        self.train(np.array(observations_batch), np.array(q_values))
    def saveEverything(self):
        # Save the model
        self.model.save(self.model_file(self.sub_version))
        with open(self.memory_file(self.sub_version), "wb") as output:
            pickle.dump(self.replay_memory, output, pickle.HIGHEST_PROTOCOL)
        with open(self.parameters_file(self.sub_version), "wb") as output:
            pickle.dump(self.step, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.sub_version, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.epoch, output, pickle.HIGHEST_PROTOCOL)
        with open(self.version_file, "wb") as opt:
            pickle.dump(self.sub_version, opt, pickle.HIGHEST_PROTOCOL)
        # After saving the newest state, we delete the older state to save some space...
        if self.sub_version > 0:
            try:
                os.remove(self.model_file(self.sub_version - 1))
                os.remove(self.memory_file(self.sub_version - 1))
                os.remove(self.parameters_file(self.sub_version - 1))
            except OSError:
                print("Previous version was already deleted.")
        print("Saved Model, dumped memory and parameters to pickle file, version {}.".format(self.sub_version))
        self.sub_version += 1
    def cleanup(self, sig=None, frame=None):
        if not self.saving:
            print("Press Ctrl+C again to skip saving.")
            self.saving = True
            self.saveEverything()
            sys.exit(2)
        else:
            sys.exit(2)
    def startEpisode(self):
        ReinforcementAgent.startEpisode(self)
        # Take at least window_length - 1 random steps, so that our window is filled when we try to predict on it.
        self.nb_rnd_start_steps = np.random.random_integers(self.window_length, self.nb_max_rnd_start_steps) if self.isInTraining() else self.window_length - 1
        self.game_step = 0
        if self.isInTraining():
            self.callbacks.on_epoch_begin(self.step)
        self.last_observations.clear()
    def stopEpisode(self):
        ReinforcementAgent.stopEpisode(self)
        logs = {"reward": self.final_score, "epsilon": self.get_epsilon(), "loss": self.last_loss}
        if self.isInTraining() and not self.step < self.nb_warmup_steps:
            self.callbacks.on_epoch_end(self.step, logs=logs)
            if self.episodesSoFar % self.nb_episodes_between_backups == 0:
                self.saveEverything()
        self.epoch += 1
    def final(self, state):
        ReinforcementAgent.final(self, state)
        if self.isInTraining():
            self.replay_memory[-1]['nonterminal'] = 0
        if self.episodesSoFar == self.numTraining:
            self.saveEverything()
        if self.episodesSoFar == self.numGames:
            if self.minsteps < self.step:
                sys.exit(100)
        self.final_score = state.getScore()
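
For intuition, here is a small standalone sketch of the n-step target that generate_return and generate_targets compute; the function name and the numbers are illustrative, not part of the gist.

import numpy as np

def n_step_return(rewards, q_next, gamma=0.95, nonterminal=1):
    # Discounted sum of the observed rewards, plus a bootstrap from the
    # target network's best next-state value unless the state was terminal.
    steps = len(rewards)
    discounted = np.sum([rewards[i] * gamma ** i for i in range(steps)])
    return discounted + gamma ** steps * q_next * nonterminal

print(n_step_return([1.0, 2.0], q_next=10.0))                 # 1 + 0.95*2 + 0.95^2*10 = 11.925
print(n_step_return([1.0, 2.0], q_next=10.0, nonterminal=0))  # terminal: bootstrap dropped -> 2.9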