# NOTE: the original gist abbreviates its imports as "import stuff"; the names
# below are a best guess at what this file needs (Keras, keras-rl and the
# Berkeley Pacman framework) and may not match the author's exact modules.
import os
import sys
import signal
import pickle

import numpy as np
import keras.backend as K
from keras.models import Model, Sequential, load_model
from keras.layers import Input, Dense, Activation, Flatten, Convolution2D, Permute, Multiply, Layer
from keras.optimizers import Adam
from keras.callbacks import TensorBoard

from rl.policy import Policy, LinearAnnealedPolicy
from rl.memory import SequentialMemory
from rl.util import huber_loss, clone_model, get_soft_target_model_updates, AdditionalUpdatesOptimizer
from rl.callbacks import CallbackList, TrainIntervalLogger, TrainEpisodeLogger

# Project-specific imports from the Pacman framework (assumed module names).
from qlearningAgents import PacmanQAgent
from pacman import GameState
from game import Actions

def identity_loss(y_true, y_pred):
    return y_pred


class TensorBoardWrap(TensorBoard):
    def __init__(self, val_data, **args):
        TensorBoard.__init__(self, **args)
        self.validation_data = val_data

    def on_epoch_end(self, epoch, logs=None):
        TensorBoard.on_epoch_end(self, epoch, logs)


class SumAcross(Layer):
    def __init__(self, axis, **kwargs):
        super(SumAcross, self).__init__(**kwargs)
        self.axis = axis

    def call(self, inputs):
        return K.sum(inputs, axis=self.axis)

    def get_config(self):
        config = {'axis': self.axis}
        base_config = super(SumAcross, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_output_shape(self, input_shape):
        return tuple(list(input_shape[:-1]) + [1])


class ClippedLoss(Layer):
    def __init__(self, delta_clip, **kwargs):
        super(ClippedLoss, self).__init__(**kwargs)
        self.delta_clip = delta_clip

    def call(self, inputs):
        return huber_loss(inputs[0], inputs[1], self.delta_clip)

    def get_config(self):
        config = {'delta_clip': self.delta_clip}
        base_config = super(ClippedLoss, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_output_shape(self, input_shape):
        return input_shape[0]
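
# The ClippedLoss layer above delegates to keras-rl's huber_loss. As a rough
# reference (a sketch, not the library's exact code), the element-wise Huber
# loss it relies on looks like this: quadratic inside the clip radius, linear
# outside, and plain squared error when delta_clip is infinite.
def _huber_loss_sketch(y_true, y_pred, clip_value):
    # Illustrative only; the agent itself uses rl.util.huber_loss.
    x = y_true - y_pred
    if np.isinf(clip_value):
        return .5 * K.square(x)
    squared = .5 * K.square(x)
    linear = clip_value * (K.abs(x) - .5 * clip_value)
    return K.switch(K.abs(x) < clip_value, squared, linear)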

def mean_q(y_true, y_pred):
    return K.mean(K.max(y_pred, axis=-1))


class EpsGreedyRestrictedPolicy(Policy):
    """Implement the epsilon-greedy policy.

    The eps-greedy policy either:
    - takes a random action with probability epsilon
    - takes the current best action with probability (1 - epsilon)
    - only ever takes allowed (legal) actions
    """
    def __init__(self, eps=.1):
        super(EpsGreedyRestrictedPolicy, self).__init__()
        self.eps = eps

    def select_action(self, q_values, legal_actions):
        """Return the selected action.

        # Arguments
            q_values (np.ndarray): List of the estimations of Q for each action

        # Returns
            Selected action
        """
        assert q_values.ndim == 1
        nb_actions = q_values.shape[0]
        if np.random.uniform() < self.eps:
            return np.random.choice(legal_actions)
            # return np.random.random_integers(0, nb_actions-1)
        else:
            sorted_indices_decreasing = np.argsort(q_values)[::-1]
            for a in sorted_indices_decreasing:
                if a in legal_actions:
                    return a
            return None

    def get_config(self):
        """Return configuration of EpsGreedyRestrictedPolicy.

        # Returns
            Dict of config
        """
        config = super(EpsGreedyRestrictedPolicy, self).get_config()
        config['eps'] = self.eps
        return config


class GreedyRestrictedPolicy(Policy):
    """Implement the greedy policy.

    The greedy policy returns the current best legal action according to q_values.
    """
    def select_action(self, q_values, legal_actions):
        """Return the selected action.

        # Arguments
            q_values (np.ndarray): List of the estimations of Q for each action

        # Returns
            Selected action
        """
        assert q_values.ndim == 1
        sorted_indices_decreasing = np.argsort(q_values)[::-1]
        for a in sorted_indices_decreasing:
            if a in legal_actions:
                return a
        return None
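
# Both restricted policies above reduce to "argmax over Q, restricted to legal
# actions". A tiny self-contained illustration (hypothetical numbers, never
# called by the agent):
def _restricted_argmax_demo():
    q_values = np.array([0.2, 1.5, -0.3, 0.9, 0.0])
    legal_actions = [0, 2, 3]          # action 1 has the highest Q but is illegal
    for a in np.argsort(q_values)[::-1]:
        if a in legal_actions:
            return a                   # returns 3, the best *legal* action
    return None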


class PACQNAgent(PacmanQAgent):
    """
    ApproximateQLearningAgent

    You should only have to overwrite getQValue
    and update. All other QLearningAgent functions
    should work as is.
    """
    def generate_filenames(self):
        self.model_weights = self.filename_generator("model_weights", "h5")
        self.trainable_model_weights = self.filename_generator("trainable_model_weights", "h5")
        self.target_model_weights = self.filename_generator("target_model_weights", "h5")
        self.model_file = self.filename_generator("model", "h5")
        self.trainable_model_file = self.filename_generator("trainable_model", "h5")
        self.target_model_file = self.filename_generator("target_model", "h5")
        self.parameters_file = self.filename_generator("params", "pkl")
        self.memory_file = self.filename_generator("memory", "pkl")
        self.version_file = self.path + "v.pkl"
        PacmanQAgent.generate_filenames(self)

    def __init__(self, decay, ghosts, **args):
        # Next, we build our model. We use the same model that was described by Mnih et al. (2015).
        # input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
        PacmanQAgent.__init__(self, **args)
        self.nb_max_start_steps = 15
        WINDOW_LENGTH = 2
        CHANNELS = 4
        sys.setrecursionlimit(10000)
        signal.signal(signal.SIGINT, self.cleanup)
        self.input_shape = (WINDOW_LENGTH*CHANNELS,) + (args['layout'].width, args['layout'].height)
        self.model = Sequential()
        self.nb_actions = 5
        self.step = 0
        self.nb_steps_warmup = 50
        self.train_interval = 1
        self.batch_size = 16
        self.memory_interval = 1
        self.metrics_names = []
        self.enable_double_dqn = False
        self.gamma = 0.95
        self.target_model = None
        self.trainable_model = None
        self.training = self.numTraining > 0
        self.test_policy = GreedyRestrictedPolicy()
        self.recent_action = None
        self.recent_observation = None
        self.target_model_update = 1
        self.processor = None
        self.accumulated_reward = 0
        self.nb_of_times_to_repeat_action = 1
        self.action_to_repeat = None
        self.custom_model_objects = {}
        self.delta_clip = np.inf
        self.nb_random_start_steps = 0
        self.startEpsilon = args['startEpsilon']
        self.endEpsilon = args['endEpsilon']
        self.stepsThisEpisode = 0
        self.nb_episodes_between_backups = 2500
        self.log_interval = 10000
        self.epoch_metrics = []
        self.epoch_rewards = []
        self.global_epoch = 0
        self.start_food = np.sum(args['layout'].food.data)

        ### SETUP STUFF WITH NO DIRECT RELEVANCE ########################################
        self.generate_filenames()
        log_dir = self.path + "logs/"
        start_state = GameState()
        start_state.initialize(args['layout'], ghosts)
        start_state = np.reshape(np.array(2*[start_state.data.asMultipleArrays()]), (1,) + self.input_shape)
        print(np.shape(start_state))
        val_data = [start_state, np.zeros((1, self.nb_actions)), np.ones((1, self.nb_actions)), np.zeros((1, 1)), np.zeros((1, 1)), np.ones((1,)), np.ones((1,))]
        print(val_data[0].shape[0])
        tb = TensorBoardWrap(val_data=val_data, log_dir=log_dir, write_graph=True, write_grads=True, histogram_freq=100)
        self.callbacks = []
        self.callbacks += [tb]
        self.callbacks += [TrainIntervalLogger(interval=self.log_interval)]
        self.callbacks += [TrainEpisodeLogger()]
        params = {
            'nb_steps': self.numTraining,
        }
        self.callbacks = CallbackList(callbacks=self.callbacks)
        if hasattr(self.callbacks, 'set_params'):
            self.callbacks.set_params(params)
        else:
            self.callbacks._set_params(params)

        self.ipt = Input(shape=self.input_shape)
        self.permute = Permute((2, 3, 1), input_shape=self.input_shape)(self.ipt)
        self.c1 = Convolution2D(32, (3, 3), strides=(1, 1))(self.permute)
        self.a1 = Activation('relu')(self.c1)
        self.c2 = Convolution2D(64, (3, 3), strides=(1, 1))(self.a1)
        self.a2 = Activation('relu')(self.c2)
        self.c3 = Convolution2D(64, (3, 3), strides=(1, 1))(self.a2)
        self.a3 = Activation('relu')(self.c3)
        self.flat = Flatten()(self.a3)
        self.dense = Dense(self.nb_actions)(self.flat)
        self.out = Activation('linear')(self.dense)
        self.model = Model(inputs=self.ipt, outputs=self.out)
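
        # Rough shape flow through the network above (assuming the default
        # 'valid' padding): the input (WINDOW_LENGTH*CHANNELS, width, height)
        # is permuted to (width, height, WINDOW_LENGTH*CHANNELS) so the stacked
        # frames sit in the channels dimension; each 3x3 stride-1 convolution
        # then trims the spatial dims by 2, so after three convolutions they
        # shrink by 6 in total before the Flatten/Dense head emits one Q value
        # per Pacman action.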
        print(self.model.summary())

        # Find latest version
        if os.path.isfile(self.version_file):
            with open(self.version_file, "rb") as ipt:
                self.sub_version = pickle.load(ipt)

        # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
        # even the metrics!
        if os.path.isfile(self.memory_file(self.sub_version)):
            with open(self.memory_file(self.sub_version), "rb") as input:
                self.memory = pickle.load(input)
            print("Loaded previous memory successfully")
        else:
            self.memory = SequentialMemory(limit=300000, window_length=WINDOW_LENGTH)
            print("Creating memory from scratch")

        if os.path.isfile(self.parameters_file(self.sub_version)):
            with open(self.parameters_file(self.sub_version), "rb") as input:
                self.step = pickle.load(input)
                self.sub_version = pickle.load(input)
                self.global_epoch = pickle.load(input)
            print("Loaded step value, sub version and global epoch.")

        # processor = AtariProcessor()

        # Select a policy. We use eps-greedy action selection, which means that a random action is selected
        # with probability eps. We anneal eps from startEpsilon to endEpsilon over the course of `decay` steps.
        # This is done so that the agent initially explores the environment (high eps) and then gradually
        # sticks to what it knows (low eps). We also set a dedicated eps value that is used during testing.
        # Note that we set it to 0.05 so that the agent still performs some random actions. This ensures that
        # the agent cannot get stuck. This is actually the EpsGreedyRestrictedPolicy (see top of this gist).
        self.policy = LinearAnnealedPolicy(EpsGreedyRestrictedPolicy(), attr='eps', value_max=self.startEpsilon, value_min=self.endEpsilon, value_test=.05,
                                           nb_steps=decay)
        self.policy._set_agent(self)
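
        # LinearAnnealedPolicy interpolates eps linearly over the first `decay`
        # steps and then holds it at value_min. Worked example (hypothetical
        # numbers): with startEpsilon=1.0, endEpsilon=0.1 and decay=100000,
        #     eps(step) = max(0.1, 1.0 - (1.0 - 0.1) * step / 100000),
        # i.e. eps = 0.55 halfway through and 0.1 from step 100000 onwards.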

        if os.path.isfile(self.model_file(self.sub_version)) and \
                os.path.isfile(self.target_model_file(self.sub_version)) and \
                os.path.isfile(self.trainable_model_file(self.sub_version)):
            custom_objects = {'ClippedLoss': ClippedLoss, 'identity_loss': identity_loss, 'SumAcross': SumAcross, 'mean_q': mean_q}
            self.model = load_model(self.model_file(self.sub_version), custom_objects=custom_objects)
            self.trainable_model = load_model(self.trainable_model_file(self.sub_version), custom_objects=custom_objects)
            self.target_model = load_model(self.target_model_file(self.sub_version), custom_objects=custom_objects)
        else:
            if os.path.isfile(self.model_weights(self.sub_version)):
                print("Loading model weights...")
                self.model.load_weights(self.model_weights(self.sub_version))
            self.compile(Adam(lr=.00025, clipnorm=1., clipvalue=0.5), metrics=['mae'])

        self.model.name = "MainModel"
        self.trainable_model.name = "TrainableModel"
        self.target_model.name = "TargetModel"
        print(self.model.get_weights())
        self.callbacks.set_model(self.trainable_model)
        print(self.model.summary())
        self.train_begin()

    def load_weights(self, filepath):
        self.model.load_weights(filepath)
        self.update_target_model_hard()
        self.update_trainable_model_hard()

    def update_target_model_hard(self):
        self.target_model.set_weights(self.model.get_weights())

    def update_trainable_model_hard(self):
        self.trainable_model.set_weights(self.model.get_weights())

    def compileTrainableModel(self, optimizer, metrics=[]):
        y_pred = self.model.output
        y_true = Input(name='y_true', shape=(self.nb_actions,))
        mask = Input(name='mask', shape=(self.nb_actions,))
        clipped = ClippedLoss(delta_clip=self.delta_clip)([y_true, y_pred])
        clipped.trainable = False
        masked = Multiply()([clipped, mask])
        masked.trainable = False
        loss = SumAcross(axis=-1, name='loss')(masked)
        loss.trainable = False
        ins = [self.model.input] if type(self.model.input) is not list else self.model.input
        trainable_model = Model(inputs=ins + [y_true, mask], outputs=[loss, y_pred])
        combined_metrics = {trainable_model.output_names[1]: metrics}
        # tf.summary.scalar('trainable_model.loss', trainable_model.output[0])
        trainable_model.compile(optimizer=optimizer, loss=identity_loss, metrics=combined_metrics)
        # tf.summary.merge_all()
        if os.path.isfile(self.trainable_model_weights(self.sub_version)):
            trainable_model.load_weights(self.trainable_model_weights(self.sub_version))
        return trainable_model

    def compile(self, optimizer, metrics=[]):
        metrics += [mean_q]  # register default metrics

        # We never train the target model, hence we can set the optimizer and loss arbitrarily.
        self.target_model = clone_model(self.model, self.custom_model_objects)
        if os.path.isfile(self.target_model_weights(self.sub_version)):
            self.target_model.load_weights(self.target_model_weights(self.sub_version))
        self.target_model.compile(optimizer='sgd', loss='mse')
        self.model.compile(optimizer='sgd', loss='mse')

        # Compile model.
        if self.target_model_update < 1.:
            # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
            updates = get_soft_target_model_updates(self.target_model, self.model, self.target_model_update)
            optimizer = AdditionalUpdatesOptimizer(optimizer, updates)

        # Create trainable model. The problem is that we need to mask the output since we only
        # ever want to update the Q values for a certain action. The way we achieve this is by
        # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility
        # to mask out certain parameters by passing in multiple inputs to the Lambda layer.
        self.trainable_model = self.compileTrainableModel(optimizer, metrics)
        self.compiled = True
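
    # How the masked loss built in compileTrainableModel works, in plain terms
    # (illustrative sketch, not code the agent runs): for a single sample with
    # predicted Q values q_pred, a target vector that is zero everywhere except
    # targets[action] = R, and a one-hot mask with mask[action] = 1, the
    # trainable model computes
    #     loss = sum_a mask[a] * huber(targets[a], q_pred[a])
    #          = huber(R, q_pred[action]),
    # so only the Q value of the action actually taken receives a gradient.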

    def update_target_model_hard(self):
        self.target_model.set_weights(self.model.get_weights())

    def startTesting(self):
        self.training = False

    def getWeights(self):
        return self.weights

    def getQValue(self, state, action):
        """
        Should return Q(state,action) = w * featureVector
        where * is the dotProduct operator
        """
        if not self.init:
            print(self.featExtractor.getFeatures(state, action))
            self.init += 1
        sum = 0
        features = self.featExtractor.getFeatures(state, action)
        for _, val in enumerate(self.featExtractor.getFeatures(state, action)):
            sum += self.weights[val] * features[val]
        return sum

    def update(self, state, action, nextState, reward):
        """
        Should update your weights based on transition
        """
        "*** YOUR CODE HERE ***"
        PacmanQAgent.update(self, state, action, nextState, reward)
        self.accumulated_reward += reward
        # if self.step % self.nb_of_times_to_repeat_action == 0 and self.training:
        self.backward(reward, nextState is None, state)
        self.increment_step()

    def increment_step(self):
        self.step += 1
        self.stepsThisEpisode += 1

    def process_observation(self, observation):
        return observation.data.asMultipleArrays()

    def process_state_batch(self, batch):
        batch = np.array(batch)
        return batch

    def compute_batch_q_values(self, state_batch):
        batch = self.process_state_batch(state_batch)
        # print(batch)
        q_values = self.model.predict_on_batch(batch)
        assert q_values.shape == (len(state_batch), self.nb_actions)
        return q_values

    def compute_q_values(self, state):
        q_values = self.compute_batch_q_values([state]).flatten()
        assert q_values.shape == (self.nb_actions,)
        return q_values

    def forward(self, state):
        # Select an action.
        observation = self.process_observation(state)
        recent_state = self.memory.get_recent_state(observation)
        recent_state = np.reshape(recent_state, self.input_shape)
        q_values = self.compute_q_values(recent_state)
        if self.training:
            legalActions = self.getLegalActions(state)
            action = self.policy.select_action(q_values=q_values, legal_actions=Actions.actionsAsIndices(legalActions))
        else:
            action = self.test_policy.select_action(q_values=q_values, legal_actions=Actions.actionsAsIndices(self.getLegalActions(state)))
        # print("Saved weights debug data to 'during' file")

        # Book-keeping.
        self.recent_observation = observation
        self.recent_action = action
        return action

    def backward(self, reward, terminal, state=None):
        # Store most recent experience in memory.
        if self.step % self.memory_interval == 0:
            self.memory.append(self.recent_observation, self.recent_action, reward, terminal,
                               training=self.training)

        metrics = [np.nan for _ in self.metrics_names]
        if not self.training:
            # We're done here. No need to update the experience memory since we only use the working
            # memory to obtain the state over the most recent observations.
            return metrics

        # Train the network on a single stochastic batch.
        try:
            if self.step > self.nb_steps_warmup and self.step % self.train_interval == 0:
                experiences = self.memory.sample(self.batch_size)
                assert len(experiences) == self.batch_size

                # Start by extracting the necessary parameters (we use a vectorized implementation).
                state0_batch = []
                reward_batch = []
                action_batch = []
                terminal1_batch = []
                state1_batch = []
                for e in experiences:
                    state0_batch.append(e.state0)
                    state1_batch.append(e.state1)
                    reward_batch.append(e.reward)
                    action_batch.append(e.action)
                    terminal1_batch.append(0. if e.terminal1 else 1.)

                # Prepare and validate parameters.
                state0_batch = self.process_state_batch(state0_batch)
                state1_batch = self.process_state_batch(state1_batch)
                terminal1_batch = np.array(terminal1_batch)
                reward_batch = np.array(reward_batch)
                assert reward_batch.shape == (self.batch_size,)
                assert terminal1_batch.shape == reward_batch.shape
                assert len(action_batch) == len(reward_batch)
                state0_batch = np.reshape(state0_batch, (self.batch_size,) + self.input_shape)
                state1_batch = np.reshape(state1_batch, (self.batch_size,) + self.input_shape)

                # Compute Q values for mini-batch update.
                if self.enable_double_dqn:
                    # According to the paper "Deep Reinforcement Learning with Double Q-learning"
                    # (van Hasselt et al., 2015), in Double DQN, the online network predicts the actions
                    # while the target network is used to estimate the Q value.
                    q_values = self.model.predict_on_batch(state1_batch)
                    assert q_values.shape == (self.batch_size, self.nb_actions)
                    actions = np.argmax(q_values, axis=1)
                    assert actions.shape == (self.batch_size,)
                    # Now, estimate Q values using the target network but select the values with the
                    # highest Q value wrt to the online model (as computed above).
                    target_q_values = self.target_model.predict_on_batch(state1_batch)
                    assert target_q_values.shape == (self.batch_size, self.nb_actions)
                    q_batch = target_q_values[range(self.batch_size), actions]
                else:
                    # Compute the q_values given state1, and extract the maximum for each sample in the batch.
                    # We perform this prediction on the target_model instead of the model for reasons
                    # outlined in Mnih (2015). In short: it makes the algorithm more stable.
                    target_q_values = self.target_model.predict_on_batch(state1_batch)
                    assert target_q_values.shape == (self.batch_size, self.nb_actions)
                    q_batch = np.max(target_q_values, axis=1).flatten()
                assert q_batch.shape == (self.batch_size,)

                targets = np.zeros((self.batch_size, self.nb_actions))
                dummy_targets = np.zeros((self.batch_size,))
                masks = np.zeros((self.batch_size, self.nb_actions))

                # Compute r_t + gamma * max_a Q(s_t+1, a) and update the targets accordingly,
                # but only for the affected output units (as given by action_batch).
                discounted_reward_batch = self.gamma * q_batch
                # Set discounted reward to zero for all states that were terminal.
                discounted_reward_batch *= terminal1_batch
                assert discounted_reward_batch.shape == reward_batch.shape
                Rs = reward_batch + discounted_reward_batch
                for idx, (target, mask, R, action) in enumerate(zip(targets, masks, Rs, action_batch)):
                    target[action] = R  # update action with estimated accumulated reward
                    dummy_targets[idx] = R
                    mask[action] = 1.  # enable loss for this specific action
                targets = np.array(targets).astype('float32')
                masks = np.array(masks).astype('float32')

                # Finally, perform a single update on the entire batch. We use a dummy target since
                # the actual loss is computed in a Lambda layer that needs more complex input. However,
                # it is still useful to know the actual target to compute metrics properly.
                ins = [state0_batch] if type(self.model.input) is not list else state0_batch
                # self.callbacks.on_train_batch_begin(batch=ins)
                metrics = self.trainable_model.train_on_batch(ins + [targets, masks], [dummy_targets, targets])
                self.callbacks.on_train_batch_end(batch=ins, logs=metrics)
                metrics = [metric for idx, metric in enumerate(metrics) if idx not in (1, 2)]  # throw away individual losses
                metrics += self.policy.metrics
                d = dict(zip(self.trainable_model.metrics_names, metrics))
                self.epoch_metrics += [d]
                if self.processor is not None:
                    metrics += self.processor.metrics

            if self.target_model_update >= 1 and self.step % self.target_model_update == 0:
                self.update_target_model_hard()
        except AssertionError:
            pass

        return metrics
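
    # Worked example of the regression target built in backward() (made-up
    # numbers): with gamma = 0.95, reward r = 10, a non-terminal next state and
    # max_a Q_target(s', a) = 2.0, the target for the taken action is
    #     R = r + gamma * max_a Q_target(s', a) = 10 + 0.95 * 2.0 = 11.9,
    # while for a terminal transition the discounted term is zeroed and R = r.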

    def train_begin(self):
        self.callbacks.on_train_begin()

    def episode_begin(self):
        self.callbacks.on_epoch_begin(self.global_epoch)

    def episode_end(self):
        logs = {}
        if len(self.epoch_metrics) > 0:
            logs = {key: np.mean([entry[key] for entry in self.epoch_metrics]) for key, value in self.epoch_metrics[0].items()}
        more_logs = {"av_reward": np.mean(self.epoch_rewards), "eps": self.policy.get_current_value()}
        self.epoch_rewards = []
        self.callbacks.on_epoch_end(self.global_epoch, logs=dict(logs.items() + more_logs.items()))
        self.global_epoch += 1

    def train_end(self):
        self.callbacks.on_train_end()

    def getAction(self, state):
        if self.stepsThisEpisode == 0:
            self.episode_begin()
        if self.nb_random_start_steps == 0:
            self.nb_random_start_steps = np.random.randint(self.nb_max_start_steps)

        if self.stepsThisEpisode < self.nb_random_start_steps:
            action = np.random.choice(self.getLegalActions(state))
            self.recent_observation = self.process_observation(state)
            self.recent_action = Actions.actionsAsIndices([action])[0]
        else:
            action = self.forward(state)
            action = Actions._possibleActions[action]
        self.doAction(state, action)
        return action

    def saveEverything(self):
        # Save the model
        self.model.save(self.model_file(self.sub_version))
        self.trainable_model.save(self.trainable_model_file(self.sub_version))
        self.target_model.save(self.target_model_file(self.sub_version))
        with open(self.memory_file(self.sub_version), "wb") as output:
            pickle.dump(self.memory, output, pickle.HIGHEST_PROTOCOL)
        with open(self.parameters_file(self.sub_version), "wb") as output:
            pickle.dump(self.step, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.sub_version, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.global_epoch, output, pickle.HIGHEST_PROTOCOL)
        with open(self.version_file, "wb") as opt:
            pickle.dump(self.sub_version, opt, pickle.HIGHEST_PROTOCOL)

        # After saving the newest state, we delete the older state to save some space...
        if self.sub_version > 0:
            try:
                os.remove(self.model_file(self.sub_version - 1))
                os.remove(self.trainable_model_file(self.sub_version - 1))
                os.remove(self.target_model_file(self.sub_version - 1))
                os.remove(self.memory_file(self.sub_version - 1))
                os.remove(self.parameters_file(self.sub_version - 1))
            except OSError:
                print("Previous version was already deleted.")
        print("Saved Model, dumped memory and parameters to pickle file, version {}.".format(self.sub_version))
        self.sub_version += 1

    def cleanup(self, sig, frame):
        if self.training:
            self.training = False
            self.saveEverything()
            sys.exit(0)
        else:
            sys.exit(0)

    def final(self, state):
        "Called at the end of each game."
        # call the super-class final method
        PacmanQAgent.final(self, state)
        self.nb_random_start_steps = 0
        self.stepsThisEpisode = 0
        self.epoch_rewards += [self.accumulated_reward]
        self.accumulated_reward = 0
        self.episode_end()
        if self.episodesSoFar % self.nb_episodes_between_backups == 0:
            self.saveEverything()
        # print("Episode: {}".format(self.episodesSoFar))

        # did we finish training?
        if self.episodesSoFar == self.numTraining:
            self.training = False
            self.train_end()
            self.saveEverything()