import os
import sys
import signal
import pickle

import numpy as np

from keras import backend as K
from keras.models import Model, Sequential, load_model
from keras.layers import Input, Permute, Convolution2D, Activation, Flatten, Dense, Multiply, Layer
from keras.optimizers import Adam
from keras.callbacks import TensorBoard

from rl.policy import Policy, LinearAnnealedPolicy
from rl.memory import SequentialMemory
from rl.callbacks import CallbackList, TrainIntervalLogger, TrainEpisodeLogger
from rl.util import huber_loss, clone_model, get_soft_target_model_updates, AdditionalUpdatesOptimizer

# Project-local imports (Berkeley Pacman framework); module names assumed from usage below.
from qlearningAgents import PacmanQAgent
from pacman import GameState
from game import Actions

def identity_loss(y_true, y_pred):
    return y_pred

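# The network's real training loss is computed inside the graph (ClippedLoss -> Multiply -> SumAcross
# below), so this Keras-level loss simply passes that per-sample value through to the optimizer.
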
class TensorBoardWrap(TensorBoard):
    def __init__(self, val_data, **args):
        TensorBoard.__init__(self, **args)
        self.validation_data = val_data

    def on_epoch_end(self, epoch, logs=None):
        TensorBoard.on_epoch_end(self, epoch, logs)

class SumAcross(Layer):
    def __init__(self, axis, **kwargs):
        super(SumAcross, self).__init__(**kwargs)
        self.axis = axis

    def call(self, inputs):
        return K.sum(inputs, axis=self.axis)

    def get_config(self):
        config = {'axis': self.axis}
        base_config = super(SumAcross, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_output_shape(self, input_shape):
        return tuple(list(input_shape[:-1]) + [1])

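# SumAcross sums the given axis away; in the loss graph below it collapses the per-action masked
# errors into a single loss value per sample, which is what identity_loss then receives as y_pred.
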
class ClippedLoss(Layer):
    def __init__(self, delta_clip, **kwargs):
        super(ClippedLoss, self).__init__(**kwargs)
        self.delta_clip = delta_clip

    def call(self, inputs):
        return huber_loss(inputs[0], inputs[1], self.delta_clip)

    def get_config(self):
        config = {'delta_clip': self.delta_clip}
        base_config = super(ClippedLoss, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_output_shape(self, input_shape):
        return input_shape[0]

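# huber_loss is quadratic for errors smaller than delta_clip and linear beyond it; with the
# delta_clip=np.inf this agent uses, it reduces to a plain squared-error term, roughly
#   L(y_true, y_pred) = 0.5 * (y_true - y_pred) ** 2
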
def mean_q(y_true, y_pred):
    return K.mean(K.max(y_pred, axis=-1))

class EpsGreedyRestrictedPolicy(Policy):
    """Implement the epsilon-greedy policy.

    The eps-greedy policy either:
    - takes a random action with probability epsilon
    - takes the current best action with probability (1 - epsilon)
    and only ever selects allowed (legal) actions.
    """
    def __init__(self, eps=.1):
        super(EpsGreedyRestrictedPolicy, self).__init__()
        self.eps = eps

    def select_action(self, q_values, legal_actions):
        """Return the selected action.

        # Arguments
            q_values (np.ndarray): List of the estimations of Q for each action
            legal_actions: Indices of the actions that are currently allowed

        # Returns
            Selected action
        """
        assert q_values.ndim == 1
        nb_actions = q_values.shape[0]
        if np.random.uniform() < self.eps:
            return np.random.choice(legal_actions)
            # return np.random.random_integers(0, nb_actions-1)
        else:
            sorted_indices_decreasing = np.argsort(q_values)[::-1]
            for a in sorted_indices_decreasing:
                if a in legal_actions:
                    return a
            return None

    def get_config(self):
        """Return configurations of EpsGreedyRestrictedPolicy.

        # Returns
            Dict of config
        """
        config = super(EpsGreedyRestrictedPolicy, self).get_config()
        config['eps'] = self.eps
        return config

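# Illustrative use (toy numbers, not part of the agent's control flow):
#   policy = EpsGreedyRestrictedPolicy(eps=0.1)
#   policy.select_action(q_values=np.array([0.1, 0.7, 0.3, 0.0, 0.2]), legal_actions=[0, 2, 4])
# With probability 0.9 this returns 2 (the highest-Q action among the legal ones),
# otherwise a uniformly random legal action.
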
class GreedyRestrictedPolicy(Policy):
    """Implement the greedy policy.

    The greedy policy returns the best currently allowed action according to q_values.
    """
    def select_action(self, q_values, legal_actions):
        """Return the selected action.

        # Arguments
            q_values (np.ndarray): List of the estimations of Q for each action
            legal_actions: Indices of the actions that are currently allowed

        # Returns
            Selected action
        """
        assert q_values.ndim == 1
        sorted_indices_decreasing = np.argsort(q_values)[::-1]
        for a in sorted_indices_decreasing:
            if a in legal_actions:
                return a
        return None

class PACQNAgent(PacmanQAgent):
    """
    ApproximateQLearningAgent

    You should only have to overwrite getQValue
    and update. All other QLearningAgent functions
    should work as is.
    """
    def generate_filenames(self):
        self.model_weights = self.filename_generator("model_weights", "h5")
        self.trainable_model_weights = self.filename_generator("trainable_model_weights", "h5")
        self.target_model_weights = self.filename_generator("target_model_weights", "h5")
        self.model_file = self.filename_generator("model", "h5")
        self.trainable_model_file = self.filename_generator("trainable_model", "h5")
        self.target_model_file = self.filename_generator("target_model", "h5")
        self.parameters_file = self.filename_generator("params", "pkl")
        self.memory_file = self.filename_generator("memory", "pkl")
        self.version_file = self.path + "v.pkl"
        PacmanQAgent.generate_filenames(self)

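    # filename_generator (provided by the Pacman framework) appears to return a callable mapping a
    # checkpoint version number to a path, since the attributes above are later invoked as e.g.
    # self.model_file(self.sub_version). This is inferred from usage, not from the framework's source.
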
    def __init__(self, decay, ghosts, **args):
        # Next, we build our model. We use the same model that was described by Mnih et al. (2015).
        # input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
        PacmanQAgent.__init__(self, **args)
        self.nb_max_start_steps = 15
        WINDOW_LENGTH = 2
        CHANNELS = 4
        sys.setrecursionlimit(10000)
        signal.signal(signal.SIGINT, self.cleanup)
        self.input_shape = (WINDOW_LENGTH*CHANNELS,) + (args['layout'].width, args['layout'].height)
        self.model = Sequential()
        self.nb_actions = 5
        self.step = 0
        self.nb_steps_warmup = 50
        self.train_interval = 1
        self.batch_size = 16
        self.memory_interval = 1
        self.metrics_names = []
        self.enable_double_dqn = False
        self.gamma = 0.95
        self.target_model = None
        self.trainable_model = None
        self.training = self.numTraining > 0
        self.test_policy = GreedyRestrictedPolicy()
        self.recent_action = None
        self.recent_observation = None
        self.target_model_update = 1
        self.processor = None
        self.accumulated_reward = 0
        self.nb_of_times_to_repeat_action = 1
        self.action_to_repeat = None
        self.custom_model_objects = {}
        self.delta_clip = np.inf
        self.nb_random_start_steps = 0
        self.startEpsilon = args['startEpsilon']
        self.endEpsilon = args['endEpsilon']
        self.stepsThisEpisode = 0
        self.nb_episodes_between_backups = 2500
        self.log_interval = 10000
        self.epoch_metrics = []
        self.epoch_rewards = []
        self.global_epoch = 0
        # Default checkpoint version; overwritten below if a version file from a previous run exists.
        # (Assumed default -- the parent class may already provide one.)
        self.sub_version = 0
        self.start_food = np.sum(args['layout'].food.data)
        ### SETUP STUFF WITH NO DIRECT RELEVANCE ########################################
        self.generate_filenames()
        log_dir = self.path + "logs/"

        start_state = GameState()
        start_state.initialize(args['layout'], ghosts)
        start_state = np.reshape(np.array(2*[start_state.data.asMultipleArrays()]), (1,) + self.input_shape)
        print(np.shape(start_state))

        val_data = [start_state, np.zeros((1, self.nb_actions)), np.ones((1, self.nb_actions)), np.zeros((1, 1)), np.zeros((1, 1)), np.ones((1,)), np.ones((1,))]
        print(val_data[0].shape[0])

        tb = TensorBoardWrap(val_data=val_data, log_dir=log_dir, write_graph=True, write_grads=True, histogram_freq=100)
        self.callbacks = []
        self.callbacks += [tb]
        self.callbacks += [TrainIntervalLogger(interval=self.log_interval)]
        self.callbacks += [TrainEpisodeLogger()]
        params = {
            'nb_steps': self.numTraining,
        }
        self.callbacks = CallbackList(callbacks=self.callbacks)
        if hasattr(self.callbacks, 'set_params'):
            self.callbacks.set_params(params)
        else:
            self.callbacks._set_params(params)
        self.ipt = Input(shape=self.input_shape)
        self.permute = Permute((2, 3, 1), input_shape=self.input_shape)(self.ipt)
        self.c1 = Convolution2D(32, (3, 3), strides=(1, 1))(self.permute)
        self.a1 = Activation('relu')(self.c1)
        self.c2 = Convolution2D(64, (3, 3), strides=(1, 1))(self.a1)
        self.a2 = Activation('relu')(self.c2)
        self.c3 = Convolution2D(64, (3, 3), strides=(1, 1))(self.a2)
        self.a3 = Activation('relu')(self.c3)
        self.flat = Flatten()(self.a3)
        self.dense = Dense(self.nb_actions)(self.flat)
        self.out = Activation('linear')(self.dense)
        self.model = Model(inputs=self.ipt, outputs=self.out)
        print(self.model.summary())
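        # Shape flow of the online network (channels-first input is permuted to channels-last):
        #   (window*channels, width, height) -> Permute -> (width, height, window*channels)
        #   -> 3x [Conv2D 3x3 + ReLU] -> Flatten -> Dense(nb_actions) -> one linear Q value per action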
        # Find latest version
        if os.path.isfile(self.version_file):
            with open(self.version_file, "rb") as ipt:
                self.sub_version = pickle.load(ipt)

        # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
        # even the metrics!
        if os.path.isfile(self.memory_file(self.sub_version)):
            with open(self.memory_file(self.sub_version), "rb") as input:
                self.memory = pickle.load(input)
                print("Loaded previous memory successfully")
        else:
            self.memory = SequentialMemory(limit=300000, window_length=WINDOW_LENGTH)
            print("Creating memory from scratch")

        if os.path.isfile(self.parameters_file(self.sub_version)):
            with open(self.parameters_file(self.sub_version), "rb") as input:
                self.step = pickle.load(input)
                self.sub_version = pickle.load(input)
                self.global_epoch = pickle.load(input)
                print("Loaded step value, sub version and global epoch.")
        # processor = AtariProcessor()

        # Select a policy. We use eps-greedy action selection, which means that a random action is selected
        # with probability eps. We anneal eps from startEpsilon down to endEpsilon over the course of `decay`
        # steps. This is done so that the agent initially explores the environment (high eps) and then
        # gradually sticks to what it knows (low eps). We also set a dedicated eps value that is used during
        # testing. Note that we set it to 0.05 so that the agent still performs some random actions.
        # This ensures that the agent cannot get stuck.
        # This is actually the EpsGreedyRestrictedPolicy (see top of this gist).
        self.policy = LinearAnnealedPolicy(EpsGreedyRestrictedPolicy(), attr='eps', value_max=self.startEpsilon, value_min=self.endEpsilon, value_test=.05,
                                           nb_steps=decay)
        self.policy._set_agent(self)
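        # LinearAnnealedPolicy interpolates eps linearly with the agent's step counter, roughly:
        #   eps(step) = max(value_min, value_max - (value_max - value_min) * step / nb_steps)
        # so with e.g. value_max=1.0, value_min=0.1, nb_steps=decay, eps reaches 0.1 after `decay`
        # steps and stays there.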
        if os.path.isfile(self.model_file(self.sub_version)) and \
                os.path.isfile(self.target_model_file(self.sub_version)) and \
                os.path.isfile(self.trainable_model_file(self.sub_version)):
            custom_objects = {'ClippedLoss': ClippedLoss, 'identity_loss': identity_loss, 'SumAcross': SumAcross, 'mean_q': mean_q}
            self.model = load_model(self.model_file(self.sub_version), custom_objects=custom_objects)
            self.trainable_model = load_model(self.trainable_model_file(self.sub_version), custom_objects=custom_objects)
            self.target_model = load_model(self.target_model_file(self.sub_version), custom_objects=custom_objects)
        else:
            if os.path.isfile(self.model_weights(self.sub_version)):
                print("Loading model weights...")
                self.model.load_weights(self.model_weights(self.sub_version))
            self.compile(Adam(lr=.00025, clipnorm=1., clipvalue=0.5), metrics=['mae'])

        self.model.name = "MainModel"
        self.trainable_model.name = "TrainableModel"
        self.target_model.name = "TargetModel"

        print(self.model.get_weights())
        self.callbacks.set_model(self.trainable_model)
        print(self.model.summary())
        self.train_begin()

    def load_weights(self, filepath):
        self.model.load_weights(filepath)
        self.update_target_model_hard()
        self.update_trainable_model_hard()

    def update_target_model_hard(self):
        self.target_model.set_weights(self.model.get_weights())

    def update_trainable_model_hard(self):
        self.trainable_model.set_weights(self.model.get_weights())

    def compileTrainableModel(self, optimizer, metrics=[]):
        y_pred = self.model.output
        y_true = Input(name='y_true', shape=(self.nb_actions,))
        mask = Input(name='mask', shape=(self.nb_actions,))
        clipped = ClippedLoss(delta_clip=self.delta_clip)([y_true, y_pred])
        clipped.trainable = False
        masked = Multiply()([clipped, mask])
        masked.trainable = False
        loss = SumAcross(axis=-1, name='loss')(masked)
        loss.trainable = False
        ins = [self.model.input] if type(self.model.input) is not list else self.model.input
        trainable_model = Model(inputs=ins + [y_true, mask], outputs=[loss, y_pred])
        combined_metrics = {trainable_model.output_names[1]: metrics}
        # tf.summary.scalar('trainable_model.loss', trainable_model.output[0])
        trainable_model.compile(optimizer=optimizer, loss=identity_loss, metrics=combined_metrics)
        # tf.summary.merge_all()
        if os.path.isfile(self.trainable_model_weights(self.sub_version)):
            trainable_model.load_weights(self.trainable_model_weights(self.sub_version))
        return trainable_model

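    # The trainable model wraps the online network: its inputs are [state, y_true, mask] and its
    # outputs are [loss, y_pred]. The one-hot mask restricts the Huber/TD error to the action that
    # was actually taken, SumAcross reduces it to one value per sample, and identity_loss hands that
    # value straight to the optimizer. The second output (y_pred) is mainly there so Keras can report
    # metrics such as mean_q on it.
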
    def compile(self, optimizer, metrics=[]):
        metrics += [mean_q]  # register default metrics

        # We never train the target model, hence we can set the optimizer and loss arbitrarily.
        self.target_model = clone_model(self.model, self.custom_model_objects)
        if os.path.isfile(self.target_model_weights(self.sub_version)):
            self.target_model.load_weights(self.target_model_weights(self.sub_version))
        self.target_model.compile(optimizer='sgd', loss='mse')
        self.model.compile(optimizer='sgd', loss='mse')

        # Compile model.
        if self.target_model_update < 1.:
            # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
            updates = get_soft_target_model_updates(self.target_model, self.model, self.target_model_update)
            optimizer = AdditionalUpdatesOptimizer(optimizer, updates)

        # Create trainable model. The problem is that we need to mask the output since we only ever
        # want to update the Q value for the action that was actually taken. We achieve this with the
        # custom layers above (ClippedLoss, Multiply, SumAcross), which compute the masked loss inside
        # the graph and let us pass targets and masks in alongside the state.
        self.trainable_model = self.compileTrainableModel(optimizer, metrics)

        self.compiled = True

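    # Worked example of the masking trick (toy numbers): with nb_actions = 5, action taken = 2 and
    # target R = 1.5, the batch entry gets
    #   y_true = [0, 0, 1.5, 0, 0]   and   mask = [0, 0, 1, 0, 0]
    # so the per-action error is multiplied by the mask and only the Q value of action 2 contributes
    # to the gradient; the other four outputs are left untouched by this sample.
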
    def update_target_model_hard(self):
        self.target_model.set_weights(self.model.get_weights())

    def startTesting(self):
        self.training = False

    def getWeights(self):
        return self.weights

    def getQValue(self, state, action):
        """
        Should return Q(state,action) = w * featureVector
        where * is the dot product operator
        """
        if not self.init:
            print(self.featExtractor.getFeatures(state, action))
            self.init += 1
        sum = 0
        features = self.featExtractor.getFeatures(state, action)
        for _, val in enumerate(self.featExtractor.getFeatures(state, action)):
            sum += self.weights[val] * features[val]
        return sum

    def update(self, state, action, nextState, reward):
        """
        Should update your weights based on transition
        """
        "*** YOUR CODE HERE ***"
        PacmanQAgent.update(self, state, action, nextState, reward)
        self.accumulated_reward += reward
        # if self.step % self.nb_of_times_to_repeat_action == 0 and self.training:
        self.backward(reward, nextState is None, state)
        self.increment_step()

    def increment_step(self):
        self.step += 1
        self.stepsThisEpisode += 1

    def process_observation(self, observation):
        return observation.data.asMultipleArrays()

    def process_state_batch(self, batch):
        batch = np.array(batch)
        return batch

    def compute_batch_q_values(self, state_batch):
        batch = self.process_state_batch(state_batch)
        # print(batch)
        q_values = self.model.predict_on_batch(batch)
        assert q_values.shape == (len(state_batch), self.nb_actions)
        return q_values

    def compute_q_values(self, state):
        q_values = self.compute_batch_q_values([state]).flatten()
        assert q_values.shape == (self.nb_actions,)
        return q_values

    def forward(self, state):
        # Select an action.
        observation = self.process_observation(state)
        recent_state = self.memory.get_recent_state(observation)
        recent_state = np.reshape(recent_state, self.input_shape)
        q_values = self.compute_q_values(recent_state)
        if self.training:
            legalActions = self.getLegalActions(state)
            action = self.policy.select_action(q_values=q_values, legal_actions=Actions.actionsAsIndices(legalActions))
        else:
            action = self.test_policy.select_action(q_values=q_values, legal_actions=Actions.actionsAsIndices(self.getLegalActions(state)))
        # print("Saved weights debug data to 'during' file")

        # Book-keeping.
        self.recent_observation = observation
        self.recent_action = action
        return action

    def backward(self, reward, terminal, state=None):
        # Store most recent experience in memory.
        if self.step % self.memory_interval == 0:
            self.memory.append(self.recent_observation, self.recent_action, reward, terminal,
                               training=self.training)

        metrics = [np.nan for _ in self.metrics_names]
        if not self.training:
            # We're done here. No need to update the experience memory since we only use the working
            # memory to obtain the state over the most recent observations.
            return metrics

        # Train the network on a single stochastic batch.
        try:
            if self.step > self.nb_steps_warmup and self.step % self.train_interval == 0:
                experiences = self.memory.sample(self.batch_size)
                assert len(experiences) == self.batch_size

                # Start by extracting the necessary parameters (we use a vectorized implementation).
                state0_batch = []
                reward_batch = []
                action_batch = []
                terminal1_batch = []
                state1_batch = []
                for e in experiences:
                    state0_batch.append(e.state0)
                    state1_batch.append(e.state1)
                    reward_batch.append(e.reward)
                    action_batch.append(e.action)
                    terminal1_batch.append(0. if e.terminal1 else 1.)

                # Prepare and validate parameters.
                state0_batch = self.process_state_batch(state0_batch)
                state1_batch = self.process_state_batch(state1_batch)
                terminal1_batch = np.array(terminal1_batch)
                reward_batch = np.array(reward_batch)
                assert reward_batch.shape == (self.batch_size,)
                assert terminal1_batch.shape == reward_batch.shape
                assert len(action_batch) == len(reward_batch)
                state0_batch = np.reshape(state0_batch, (self.batch_size,) + self.input_shape)
                state1_batch = np.reshape(state1_batch, (self.batch_size,) + self.input_shape)

                # Compute Q values for mini-batch update.
                if self.enable_double_dqn:
                    # According to the paper "Deep Reinforcement Learning with Double Q-learning"
                    # (van Hasselt et al., 2015), in Double DQN, the online network predicts the actions
                    # while the target network is used to estimate the Q value.
                    q_values = self.model.predict_on_batch(state1_batch)
                    assert q_values.shape == (self.batch_size, self.nb_actions)
                    actions = np.argmax(q_values, axis=1)
                    assert actions.shape == (self.batch_size,)
                    # Now, estimate Q values using the target network but select the values with the
                    # highest Q value w.r.t. the online model (as computed above).
                    target_q_values = self.target_model.predict_on_batch(state1_batch)
                    assert target_q_values.shape == (self.batch_size, self.nb_actions)
                    q_batch = target_q_values[range(self.batch_size), actions]
                else:
                    # Compute the q_values given state1, and extract the maximum for each sample in the batch.
                    # We perform this prediction on the target_model instead of the model for reasons
                    # outlined in Mnih (2015). In short: it makes the algorithm more stable.
                    target_q_values = self.target_model.predict_on_batch(state1_batch)
                    assert target_q_values.shape == (self.batch_size, self.nb_actions)
                    q_batch = np.max(target_q_values, axis=1).flatten()
                assert q_batch.shape == (self.batch_size,)

                targets = np.zeros((self.batch_size, self.nb_actions))
                dummy_targets = np.zeros((self.batch_size,))
                masks = np.zeros((self.batch_size, self.nb_actions))

                # Compute r_t + gamma * max_a Q(s_t+1, a) and update the targets accordingly,
                # but only for the affected output units (as given by action_batch).
                discounted_reward_batch = self.gamma * q_batch
                # Set discounted reward to zero for all states that were terminal.
                discounted_reward_batch *= terminal1_batch
                assert discounted_reward_batch.shape == reward_batch.shape
                Rs = reward_batch + discounted_reward_batch
                for idx, (target, mask, R, action) in enumerate(zip(targets, masks, Rs, action_batch)):
                    target[action] = R  # update action with estimated accumulated reward
                    dummy_targets[idx] = R
                    mask[action] = 1.  # enable loss for this specific action
                targets = np.array(targets).astype('float32')
                masks = np.array(masks).astype('float32')

                # Finally, perform a single update on the entire batch. We use a dummy target since
                # the actual loss is computed in a custom layer that needs more complex input. However,
                # it is still useful to know the actual target to compute metrics properly.
                ins = [state0_batch] if type(self.model.input) is not list else state0_batch
                # self.callbacks.on_train_batch_begin(batch=ins)
                metrics = self.trainable_model.train_on_batch(ins + [targets, masks], [dummy_targets, targets])
                self.callbacks.on_train_batch_end(batch=ins, logs=metrics)
                metrics = [metric for idx, metric in enumerate(metrics) if idx not in (1, 2)]  # throw away individual losses
                metrics += self.policy.metrics
                d = dict(zip(self.trainable_model.metrics_names, metrics))
                self.epoch_metrics += [d]
                if self.processor is not None:
                    metrics += self.processor.metrics

            if self.target_model_update >= 1 and self.step % self.target_model_update == 0:
                self.update_target_model_hard()
        except AssertionError:
            pass

        return metrics

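    # In summary, each sampled transition (s, a, r, s', terminal) contributes one regression target
    #   y = r + gamma * max_a' Q_target(s', a')        (the gamma term is dropped when s' is terminal)
    # and the trainable model is fit so that Q(s, a) moves toward y for the taken action only.
    # With enable_double_dqn the max is replaced by Q_target(s', argmax_a' Q_online(s', a')).
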
    def train_begin(self):
        self.callbacks.on_train_begin()

    def episode_begin(self):
        self.callbacks.on_epoch_begin(self.global_epoch)

    def episode_end(self):
        logs = {}
        if len(self.epoch_metrics) > 0:
            logs = {key: np.mean([entry[key] for entry in self.epoch_metrics]) for key, value in self.epoch_metrics[0].items()}
        more_logs = {"av_reward": np.mean(self.epoch_rewards), "eps": self.policy.get_current_value()}
        self.epoch_rewards = []
        self.callbacks.on_epoch_end(self.global_epoch, logs=dict(logs.items() + more_logs.items()))
        self.global_epoch += 1

    def train_end(self):
        self.callbacks.on_train_end()

    def getAction(self, state):
        if self.stepsThisEpisode == 0:
            self.episode_begin()

        if self.nb_random_start_steps == 0:
            self.nb_random_start_steps = np.random.randint(self.nb_max_start_steps)

        if self.stepsThisEpisode < self.nb_random_start_steps:
            action = np.random.choice(self.getLegalActions(state))
            self.recent_observation = self.process_observation(state)
            self.recent_action = Actions.actionsAsIndices([action])[0]
        else:
            action = self.forward(state)
            action = Actions._possibleActions[action]

        self.doAction(state, action)
        return action

    def saveEverything(self):
        # Save the model
        self.model.save(self.model_file(self.sub_version))
        self.trainable_model.save(self.trainable_model_file(self.sub_version))
        self.target_model.save(self.target_model_file(self.sub_version))

        with open(self.memory_file(self.sub_version), "wb") as output:
            pickle.dump(self.memory, output, pickle.HIGHEST_PROTOCOL)
        with open(self.parameters_file(self.sub_version), "wb") as output:
            pickle.dump(self.step, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.sub_version, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.global_epoch, output, pickle.HIGHEST_PROTOCOL)
        with open(self.version_file, "wb") as opt:
            pickle.dump(self.sub_version, opt, pickle.HIGHEST_PROTOCOL)

        # After saving the newest state, we delete the older state to save some space...
        if self.sub_version > 0:
            try:
                os.remove(self.model_file(self.sub_version - 1))
                os.remove(self.trainable_model_file(self.sub_version - 1))
                os.remove(self.target_model_file(self.sub_version - 1))
                os.remove(self.memory_file(self.sub_version - 1))
                os.remove(self.parameters_file(self.sub_version - 1))
            except OSError:
                print("Previous version was already deleted.")

        print("Saved Model, dumped memory and parameters to pickle file, version {}.".format(self.sub_version))
        self.sub_version += 1

    def cleanup(self, sig, frame):
        if self.training:
            self.training = False
            self.saveEverything()
        sys.exit(0)

    def final(self, state):
        "Called at the end of each game."
        # call the super-class final method
        PacmanQAgent.final(self, state)

        self.nb_random_start_steps = 0
        self.stepsThisEpisode = 0
        self.epoch_rewards += [self.accumulated_reward]
        self.accumulated_reward = 0
        self.episode_end()

        if self.episodesSoFar % self.nb_episodes_between_backups == 0:
            self.saveEverything()
        # print("Episode: {}".format(self.episodesSoFar))

        # did we finish training?
        if self.episodesSoFar == self.numTraining:
            self.training = False
            self.train_end()
            self.saveEverything()