experience memory buffer for RL
import numpy as np
import random
import h5py
from model.ModelUtil import validBounds, fixBounds, anneal_value, norm_state, norm_action, norm_reward, checkValidData
import copy
import sys
class ExperienceMemory(object):
    """
    Contains the recent history of experience tuples.

    The experience memory stores the real values from the simulation, not values
    that have been normalized. This makes things easier down the road: if the model
    scale needs to be adjusted later, the tuples already in memory do not have to be
    updated. A scale layer can also be added to the model to compensate for having
    to scale every tuple when performing training updates.
    """
    def __init__(self, state_length, action_length, memory_length, continuous_actions=False, settings=None, result_state_length=None):
        if settings is None:
            self._settings = {}
            self._settings['discount_factor'] = 0.0
            # self._settings['float_type'] = 'float32'
        else:
            self._settings = settings
        self._history_size = memory_length
        self._trajectory_size = int(memory_length / 100)
        if ("fd_expereince_length" in self._settings):
            self._trajectory_size = int(self._settings["fd_expereince_length"])
        self._state_length = state_length
        self._action_length = action_length
        self._continuous_actions = continuous_actions
        if result_state_length is None:
            self._result_state_length = state_length
        else:
            self._result_state_length = result_state_length
        self._history_update_index = 0  # where the next experience will be written
        self._samples = 0  # number of inserts since the last clear()
        self._inserts = 0  # total number of inserts
        self.clear()
        # self._state_history = theano.shared(np.zeros((self._history_size, state_length)))
        # self._action_history = theano.shared(np.zeros((self._history_size, action_length)))
        # self._nextState_history = theano.shared(np.zeros((self._history_size, state_length)))
        # self._reward_history = theano.shared(np.zeros((self._history_size, 1)))
    def clear(self):
        self._history_update_index = 0  # where the next experience will be written
        self._samples = 0  # how many samples are in the buffer
        float_type = 'float32' if self._settings.get('float_type', 'float64') == 'float32' else 'float64'
        self._state_history = np.zeros((self._history_size, self._state_length), dtype=float_type)
        if self._continuous_actions:
            self._action_history = np.zeros((self._history_size, self._action_length), dtype=float_type)
        else:
            self._action_history = np.zeros((self._history_size, self._action_length), dtype='int8')
        self._nextState_history = np.zeros((self._history_size, self._result_state_length), dtype=float_type)
        self._reward_history = np.zeros((self._history_size, 1), dtype=float_type)
        self._fall_history = np.zeros((self._history_size, 1), dtype='int8')
        self._discounted_sum_history = np.zeros((self._history_size, 1), dtype=float_type)
        self._advantage_history = np.zeros((self._history_size, 1), dtype=float_type)
        self._exp_action_history = np.zeros((self._history_size, 1), dtype='int8')
        self._trajectory_history = [None] * self._trajectory_size
        self._samplesTrajectory = 0
        self._insertsTrajectory = 0
        self._trajectory_update_index = 0
    def insertsTrajectory(self):
        return self._insertsTrajectory

    def samplesTrajectory(self):
        return self._samplesTrajectory

    def history_size_Trajectory(self):
        return self._trajectory_size

    def _insertTrajectory(self, trajectory):
        if (self._trajectory_update_index >= self.history_size_Trajectory()):
            self._trajectory_update_index = 0
            # print("Reset history index in exp buffer:")
        self._trajectory_history[self._trajectory_update_index] = trajectory
        self._insertsTrajectory += 1
        self._trajectory_update_index += 1
        self._samplesTrajectory += 1

    def insertTrajectory(self, states, actions, result_states, rewards, falls, G_ts, advantage, exp_actions):
        self._insertTrajectory([states, actions, result_states, rewards, falls, G_ts, advantage, exp_actions])
    def get_multitask_trajectory_batch(self, batch_size=4, excludeActionTypes=[]):
        state_, action_, resultState_, reward_, fall_, G_ts_, exp_actions_, advantage_ = self.get_trajectory_batch(batch_size=batch_size, excludeActionTypes=excludeActionTypes, cast=False)
        ### Find the length of the shortest trajectory
        shortest_traj = min([len(traj) for traj in state_])
        ### Truncate all trajectories to the length of the shortest one
        for t in range(len(state_)):
            state_[t] = state_[t][:shortest_traj]
            action_[t] = action_[t][:shortest_traj]
            resultState_[t] = resultState_[t][:shortest_traj]
            reward_[t] = reward_[t][:shortest_traj]
            fall_[t] = fall_[t][:shortest_traj]
            G_ts_[t] = G_ts_[t][:shortest_traj]
            exp_actions_[t] = exp_actions_[t][:shortest_traj]
            advantage_[t] = advantage_[t][:shortest_traj]
        state_ = np.array(state_, dtype=self._settings['float_type'])
        if self._continuous_actions:
            action_ = np.array(action_, dtype=self._settings['float_type'])
        else:
            action_ = np.array(action_, dtype='int8')
        resultState_ = np.array(resultState_, dtype=self._settings['float_type'])
        reward_ = np.array(reward_, dtype=self._settings['float_type'])
        G_ts_ = np.array(G_ts_, dtype=self._settings['float_type'])
        advantage_ = np.array(advantage_, dtype=self._settings['float_type'])
        fall_ = np.array(fall_, dtype='int8')
        exp_actions_ = np.array(exp_actions_, dtype='int8')
        return (state_, action_, resultState_, reward_, fall_, G_ts_, exp_actions_, advantage_)
    def get_trajectory_batch(self, batch_size=4, excludeActionTypes=[], cast=True):
        """
        Requires that the number of stored trajectories is at least batch_size.
        """
        assert batch_size <= self.samplesTrajectory(), "batch_size <= self.samplesTrajectory(): " + str(batch_size) + " <= " + str(self.samplesTrajectory())
        max_size = min(self.history_size_Trajectory(), self.samplesTrajectory())
        # print("Exp buff state bounds: ", self.getStateBounds())
        state = []
        action = []
        resultState = []
        reward = []
        fall = []
        G_ts = []
        exp_actions = []
        advantage = []
        indices = set([])
        trys = 0
        ### Collect a batch, trying at most 3 * batch_size times to find valid tuples
        while len(indices) < batch_size and (trys < batch_size * 3):
            trys = trys + 1
            i = random.sample(list(set(range(0, max_size)) - indices), 1)[0]
            ## Skip tuples whose action type is excluded (e.g. non-exploration actions)
            if (self._exp_action_history[i] in excludeActionTypes):
                continue
            indices.add(i)
            assert self._trajectory_history[i] is not None, "self._trajectory_history[" + str(i) + "] is not None: " + str(self._trajectory_history[i]) + " state shape: " + str(np.asarray(state).shape)
            # print("states shape: ", np.array(self._trajectory_history[i][0]))
            # print("states bounds shape: ", np.array(self.getStateBounds()))
            state.append(norm_state(self._trajectory_history[i][0], self.getStateBounds()))
            # print("Action pulled out: ", self._action_history[i])
            action.append(norm_action(self._trajectory_history[i][1], self.getActionBounds()))  # won't work for discrete actions...
            resultState.append(norm_state(self._trajectory_history[i][2], self.getResultStateBounds()))
            reward.append(norm_state(self._trajectory_history[i][3], self.getRewardBounds()) * (1.0 - self._settings['discount_factor']))  # scale rewards
            fall.append(self._trajectory_history[i][4])
            G_ts.append(self._trajectory_history[i][5])
            advantage.append(self._trajectory_history[i][6])
            exp_actions.append(self._trajectory_history[i][7])
        ### All sequences must be the same length for the cast to work
        if cast:
            state = np.array(state, dtype=self._settings['float_type'])
            if self._continuous_actions:
                action = np.array(action, dtype=self._settings['float_type'])
            else:
                action = np.array(action, dtype='int8')
            resultState = np.array(resultState, dtype=self._settings['float_type'])
            reward = np.array(reward, dtype=self._settings['float_type'])
            G_ts = np.array(G_ts, dtype=self._settings['float_type'])
            advantage = np.array(advantage, dtype=self._settings['float_type'])
            fall = np.array(fall, dtype='int8')
            exp_actions = np.array(exp_actions, dtype='int8')
        # assert state.shape == (len(indices), self._state_length), "state.shape == (len(indices), self._state_length): " + str(state.shape) + " == " + str((len(indices), self._state_length))
        # assert action.shape == (len(indices), self._action_length), "action.shape == (len(indices), self._action_length): " + str(action.shape) + " == " + str((len(indices), self._action_length))
        # assert resultState.shape == (len(indices), self._result_state_length), "resultState.shape == (len(indices), self._result_state_length): " + str(resultState.shape) + " == " + str((len(indices), self._result_state_length))
        # assert reward.shape == (len(indices), 1), "reward.shape == (len(indices), 1): " + str(reward.shape) + " == " + str((len(indices), 1))
        # assert G_ts.shape == (len(indices), 1), "G_ts.shape == (len(indices), 1): " + str(G_ts.shape) + " == " + str((len(indices), 1))
        # assert fall.shape == (len(indices), 1), "fall.shape == (len(indices), 1): " + str(fall.shape) + " == " + str((len(indices), 1))
        # assert exp_actions.shape == (len(indices), 1), "exp_actions.shape == (len(indices), 1): " + str(exp_actions.shape) + " == " + str((len(indices), 1))
        # assert advantage.shape == (len(indices), 1), "advantage.shape == (len(indices), 1): " + str(advantage.shape) + " == " + str((len(indices), 1))
        return (state, action, resultState, reward, fall, G_ts, exp_actions, advantage)
    def insertTuple(self, tuple):
        (state, action, nextState, reward, fall, G_t, exp_action, advantage) = tuple
        self.insert(state, action, nextState, reward, fall, G_t, exp_action, advantage)

    def insert(self, state, action, nextState, reward, fall=[[0]], G_t=[[0]], exp_action=[[0]], advantage=[[0]]):
        # print("Insert state: " + str(state))
        assert len(state[0]) == self._state_length, "len(state[0]) == self._state_length: " + str(self._state_length) + " state shape: " + str(np.asarray(state).shape)
        assert len(action[0]) == self._action_length, "len(action[0]) == self._action_length: " + str(action)
        assert len(nextState[0]) == self._result_state_length, "len(nextState[0]) == self._result_state_length: " + str(self._result_state_length) + " == " + str(np.asarray(nextState).shape)
        assert len(reward[0]) == 1
        assert len(fall[0]) == 1
        assert len(G_t[0]) == 1
        assert len(exp_action[0]) == 1
        if not checkValidData(state, action, nextState, reward):
            print("Failed inserting bad tuple: ")
            return
        if (self._history_update_index >= self._history_size):
            self._history_update_index = 0
            # print("Reset history index in exp buffer:")
        # print("Tuple: " + str(state) + ", " + str(action) + ", " + str(nextState) + ", " + str(reward))
        # print("action type: ", self._action_history.dtype)
        self._state_history[self._history_update_index] = copy.deepcopy(np.array(state))
        self._action_history[self._history_update_index] = copy.deepcopy(np.array(action))
        # print("inserted action: ", self._action_history[self._history_update_index])
        self._nextState_history[self._history_update_index] = copy.deepcopy(np.array(nextState))
        self._reward_history[self._history_update_index] = copy.deepcopy(np.array(reward))
        self._fall_history[self._history_update_index] = copy.deepcopy(np.array(fall))
        self._discounted_sum_history[self._history_update_index] = copy.deepcopy(np.array(G_t))
        self._advantage_history[self._history_update_index] = copy.deepcopy(np.array(advantage))
        self._exp_action_history[self._history_update_index] = copy.deepcopy(np.array(exp_action))
        # print("fall: ", fall)
        # print("self._fall_history: ", self._fall_history[self._history_update_index])
        self._inserts += 1
        self._history_update_index += 1
        self._samples += 1
        self.updateScalling(state, action, nextState, reward)
    def inserts(self):
        return self._inserts

    def samples(self):
        return self._samples

    def history_size(self):
        return self._history_size
    def updateScalling(self, state, action, nextState, reward):
        ### Maintain running (Welford-style) estimates of the mean and variance
        ### of states, rewards, and actions, used for adaptive normalization bounds.
        if self.inserts() == 1:
            self._state_mean = self._state_history[0]
            self._state_var = np.zeros_like(state)
            self._reward_mean = self._reward_history[0]
            self._reward_var = np.zeros_like(reward)
            self._action_mean = self._action_history[0]
            self._action_var = np.zeros_like(action)
        else:
            x_mean_old = self._state_mean
            self._state_mean = self._state_mean + ((state - self._state_mean) / self.inserts())
            reward_mean_old = self._reward_mean
            self._reward_mean = self._reward_mean + ((reward - self._reward_mean) / self.inserts())
            action_mean_old = self._action_mean
            self._action_mean = self._action_mean + ((action - self._action_mean) / self.inserts())
        if self.inserts() == 2:
            ### Two-sample variance estimate: squared deviation from the two-sample mean
            self._state_var = ((self._state_history[1] - ((self._state_history[0] + self._state_history[1]) / 2.0)) ** 2) / 2.0
            self._reward_var = ((self._reward_history[1] - ((self._reward_history[0] + self._reward_history[1]) / 2.0)) ** 2) / 2.0
            self._action_var = ((self._action_history[1] - ((self._action_history[0] + self._action_history[1]) / 2.0)) ** 2) / 2.0
        elif self.inserts() > 2:
            self._state_var = (((self.inserts() - 2) * self._state_var) + ((self.inserts() - 1) * (x_mean_old - self._state_mean) ** 2) + ((state - self._state_mean) ** 2))
            self._state_var = (self._state_var / float(self.inserts() - 1))
            self._reward_var = (((self.inserts() - 2) * self._reward_var) + ((self.inserts() - 1) * (reward_mean_old - self._reward_mean) ** 2) + ((reward - self._reward_mean) ** 2))
            self._reward_var = (self._reward_var / float(self.inserts() - 1))
            self._action_var = (((self.inserts() - 2) * self._action_var) + ((self.inserts() - 1) * (action_mean_old - self._action_mean) ** 2) + ((action - self._action_mean) ** 2))
            self._action_var = (self._action_var / float(self.inserts() - 1))
        # if ('state_normalization' in self._settings and self._settings["state_normalization"] == "adaptive"):
        #     self._updateScaling()
    def _updateScaling(self):
        scale_factor = 1.0
        # state_std = np.maximum(np.sqrt(self._state_var[0]), 0.05)
        state_std = np.sqrt(self._state_var[0])
        # print("Running mean: ", self._state_mean)
        # print("Running std: ", state_std)
        low = self._state_mean[0] - (state_std * scale_factor)
        high = self._state_mean[0] + (state_std * scale_factor)
        # self.setStateBounds(np.array([low, high]))
        self.setStateBounds(fixBounds(np.array([low, high])))
        # print("New scaling parameters: ", self.getStateBounds())
        # print("Running reward mean: ", self._reward_mean)
        # print("Running reward std: ", np.sqrt(self._reward_var))
        low = self._reward_mean[0] - (np.sqrt(self._reward_var[0]) * scale_factor)
        high = self._reward_mean[0] + (np.sqrt(self._reward_var[0]) * scale_factor)
        self.setRewardBounds(np.array([low, high]))
        # print("New scaling parameters: ", self.getRewardBounds())
        """
        low = self._action_mean[0] - np.sqrt(self._action_var[0])
        high = self._action_mean[0] + np.sqrt(self._action_var[0])
        self.setActionBounds(np.array([low, high]))
        """
    def get_exporation_action_batch(self, batch_size=32):
        return self.get_batch(batch_size=batch_size, excludeActionTypes=[0])

    def getNonMBAEBatch(self, batch_size=32):
        """
        Avoids training the critic on MBAE actions.
        """
        return self.get_batch(batch_size=batch_size, excludeActionTypes=[2])
    def get_batch(self, batch_size=32, excludeActionTypes=[]):
        """
        Requires that the number of stored samples is at least batch_size.
        """
        assert batch_size <= self.samples(), "batch_size <= self.samples(): " + str(batch_size) + " <= " + str(self.samples())
        max_size = min(self._history_size, self.samples())
        # print("Exp buff state bounds: ", self.getStateBounds())
        state = []
        action = []
        resultState = []
        reward = []
        fall = []
        G_ts = []
        exp_actions = []
        advantage = []
        indices = set([])
        trys = 0
        ### Collect a batch, trying at most 5 * batch_size times to find valid tuples
        while len(indices) < batch_size and (trys < batch_size * 5):
            trys = trys + 1
            i = random.sample(list(set(range(0, max_size)) - indices), 1)[0]
            ## Skip tuples whose action type is excluded (e.g. non-exploration actions)
            if (self._exp_action_history[i] in excludeActionTypes):
                continue
            ### Or, when multitasking, only train the policy on the desired task
            # print("self._fall_history[i]: ", self._fall_history[i])
            if isinstance(self._settings.get("sim_config_file"), list):
                if ("worker_to_task_mapping" in self._settings
                        and (self._settings["worker_to_task_mapping"][self._fall_history[i][0]] != 0)):
                    # print("skipping non desired task tuple")
                    continue
            indices.add(i)
            if (('disable_parameter_scaling' in self._settings) and (self._settings['disable_parameter_scaling'])):
                # state.append(self._state_history[i])
                state.append(norm_state(self._state_history[i], self.getStateBounds()))
                # print("Action pulled out: ", self._action_history[i])
                action.append(self._action_history[i])  # won't work for discrete actions...
                # action.append(norm_action(self._action_history[i], self.getActionBounds()))
                resultState.append(norm_state(self._nextState_history[i], self.getResultStateBounds()))
                # resultState.append(self._nextState_history[i])
                reward.append(norm_state(self._reward_history[i], self.getRewardBounds()) * (1.0 - self._settings['discount_factor']))  # scale rewards
            else:
                state.append(norm_state(self._state_history[i], self.getStateBounds()))
                action.append(norm_action(self._action_history[i], self.getActionBounds()))  # won't work for discrete actions...
                resultState.append(norm_state(self._nextState_history[i], self.getResultStateBounds()))
                reward.append(norm_state(self._reward_history[i], self.getRewardBounds()) * (1.0 - self._settings['discount_factor']))  # scale rewards
            fall.append(self._fall_history[i])
            G_ts.append(self._discounted_sum_history[i])
            advantage.append(self._advantage_history[i])
            exp_actions.append(self._exp_action_history[i])
        if (self._settings.get('float_type', 'float64') == 'float32'):
            state = np.array(state, dtype='float32')
            if self._continuous_actions:
                action = np.array(action, dtype='float32')
            else:
                action = np.array(action, dtype='int8')
            resultState = np.array(resultState, dtype='float32')
            reward = np.array(reward, dtype='float32')
            G_ts = np.array(G_ts, dtype='float32')
            advantage = np.array(advantage, dtype='float32')
        else:
            state = np.array(state, dtype='float64')
            if self._continuous_actions:
                action = np.array(action, dtype='float64')
            else:
                action = np.array(action, dtype='int8')
            resultState = np.array(resultState, dtype='float64')
            reward = np.array(reward, dtype='float64')
            G_ts = np.array(G_ts, dtype='float64')
            advantage = np.array(advantage, dtype='float64')
        fall = np.array(fall, dtype='int8')
        exp_actions = np.array(exp_actions, dtype='int8')
        assert state.shape == (len(indices), self._state_length), "state.shape == (len(indices), self._state_length): " + str(state.shape) + " == " + str((len(indices), self._state_length))
        assert action.shape == (len(indices), self._action_length), "action.shape == (len(indices), self._action_length): " + str(action.shape) + " == " + str((len(indices), self._action_length))
        assert resultState.shape == (len(indices), self._result_state_length), "resultState.shape == (len(indices), self._result_state_length): " + str(resultState.shape) + " == " + str((len(indices), self._result_state_length))
        assert reward.shape == (len(indices), 1), "reward.shape == (len(indices), 1): " + str(reward.shape) + " == " + str((len(indices), 1))
        assert G_ts.shape == (len(indices), 1), "G_ts.shape == (len(indices), 1): " + str(G_ts.shape) + " == " + str((len(indices), 1))
        assert fall.shape == (len(indices), 1), "fall.shape == (len(indices), 1): " + str(fall.shape) + " == " + str((len(indices), 1))
        assert exp_actions.shape == (len(indices), 1), "exp_actions.shape == (len(indices), 1): " + str(exp_actions.shape) + " == " + str((len(indices), 1))
        assert advantage.shape == (len(indices), 1), "advantage.shape == (len(indices), 1): " + str(advantage.shape) + " == " + str((len(indices), 1))
        return (state, action, resultState, reward, fall, G_ts, exp_actions, advantage)
    def setStateBounds(self, _state_bounds):
        self._state_bounds = _state_bounds
        self.setResultStateBounds(_state_bounds)

    def setRewardBounds(self, _reward_bounds):
        self._reward_bounds = _reward_bounds

    def setActionBounds(self, _action_bounds):
        self._action_bounds = _action_bounds

    def setResultStateBounds(self, _result_state_bounds):
        self._result_state_bounds = _result_state_bounds

    def getStateBounds(self):
        return self._state_bounds

    def getRewardBounds(self):
        return self._reward_bounds

    def getActionBounds(self):
        return self._action_bounds

    def getResultStateBounds(self):
        return self._result_state_bounds

    def setSettings(self, settings):
        self._settings = settings

    def getSettings(self):
        return self._settings
    def saveToFile(self, filename):
        hf = h5py.File(filename, "w")
        hf.create_dataset('_state_history', data=self._state_history)
        hf.create_dataset('_action_history', data=self._action_history)
        hf.create_dataset('_next_state_history', data=self._nextState_history)
        hf.create_dataset('_reward_history', data=self._reward_history)
        hf.create_dataset('_fall_history', data=self._fall_history)
        hf.create_dataset('_discounted_sum_history', data=self._discounted_sum_history)
        hf.create_dataset('_advantage_history', data=self._advantage_history)
        hf.create_dataset('_exp_action_history', data=self._exp_action_history)
        hf.create_dataset('_history_size', data=[self._history_size])
        hf.create_dataset('_history_update_index', data=[self._history_update_index])
        hf.create_dataset('_inserts', data=[self._inserts])
        hf.create_dataset('_samples', data=[self._samples])
        hf.create_dataset('_state_length', data=[self._state_length])
        hf.create_dataset('_action_length', data=[self._action_length])
        hf.create_dataset('_result_state_length', data=[self._result_state_length])
        hf.create_dataset('_state_bounds', data=self._state_bounds)
        hf.create_dataset('_reward_bounds', data=self._reward_bounds)
        hf.create_dataset('_action_bounds', data=self._action_bounds)
        hf.create_dataset('_result_state_bounds', data=self._result_state_bounds)
        ### Adaptive scaling values
        hf.create_dataset('_state_mean', data=self._state_mean)
        hf.create_dataset('_state_var', data=self._state_var)
        hf.create_dataset('_reward_mean', data=self._reward_mean)
        hf.create_dataset('_reward_var', data=self._reward_var)
        hf.create_dataset('_action_mean', data=self._action_mean)
        hf.create_dataset('_action_var', data=self._action_var)
        ### Save the variable-length list of trajectories, one group per trajectory
        if ((("train_LSTM_FD" in self._settings)
             and (self._settings["train_LSTM_FD"] == True))
            or
            (("train_LSTM_Reward" in self._settings)
             and (self._settings["train_LSTM_Reward"] == True))
            ):
            grp = hf.create_group('trajectories')
            if (self._settings["print_levels"][self._settings["print_level"]] >= self._settings["print_levels"]['train']):
                print("Saving trajectory data")
            for i in range(min(self.history_size_Trajectory(), self.samplesTrajectory())):
                traj = self._trajectory_history[i]
                if (traj is not None):
                    grp_ = grp.create_group('traj' + str(i))
                    for it in range(len(traj)):
                        grp_.create_dataset(str(it), data=np.array(traj[it]))
                else:
                    break
            hf.create_dataset('_trajectory_size', data=[self._trajectory_size])
            hf.create_dataset('_trajectory_update_index', data=[self._trajectory_update_index])
            hf.create_dataset('_insertsTrajectory', data=[self._insertsTrajectory])
            hf.create_dataset('_samplesTrajectory', data=[self._samplesTrajectory])
        hf.flush()
        hf.close()
    def loadFromFile(self, filename):
        hf = h5py.File(filename, 'r')
        self._state_history = np.array(hf.get('_state_history'))
        self._action_history = np.array(hf.get('_action_history'))
        self._nextState_history = np.array(hf.get('_next_state_history'))
        self._reward_history = np.array(hf.get('_reward_history'))
        self._fall_history = np.array(hf.get('_fall_history'))
        self._discounted_sum_history = np.array(hf.get('_discounted_sum_history'))
        self._advantage_history = np.array(hf.get('_advantage_history'))
        self._exp_action_history = np.array(hf.get('_exp_action_history'))
        self._history_size = int(hf.get('_history_size')[()])
        self._history_update_index = int(hf.get('_history_update_index')[()])
        self._inserts = int(hf.get('_inserts')[()])
        self._samples = int(hf.get('_samples')[()])
        self._state_length = int(hf.get('_state_length')[()])
        self._action_length = int(hf.get('_action_length')[()])
        self._result_state_length = int(hf.get('_result_state_length')[()])
        self._state_bounds = np.array(hf.get('_state_bounds'))
        self._reward_bounds = np.array(hf.get('_reward_bounds'))
        self._action_bounds = np.array(hf.get('_action_bounds'))
        self._result_state_bounds = np.array(hf.get('_result_state_bounds'))
        ### Adaptive scaling values
        self._state_mean = np.array(hf.get('_state_mean'))
        self._state_var = np.array(hf.get('_state_var'))
        self._reward_mean = np.array(hf.get('_reward_mean'))
        self._reward_var = np.array(hf.get('_reward_var'))
        self._action_mean = np.array(hf.get('_action_mean'))
        self._action_var = np.array(hf.get('_action_var'))
        if ((("train_LSTM_FD" in self._settings)
             and (self._settings["train_LSTM_FD"] == True))
            or
            (("train_LSTM_Reward" in self._settings)
             and (self._settings["train_LSTM_Reward"] == True))
            ):
            self._trajectory_size = int(hf.get('_trajectory_size')[()])
            self._trajectory_update_index = int(hf.get('_trajectory_update_index')[()])
            self._insertsTrajectory = int(hf.get('_insertsTrajectory')[()])
            self._samplesTrajectory = int(hf.get('_samplesTrajectory')[()])
            ### Re-allocate the trajectory buffer in case the saved size differs from the current one
            self._trajectory_history = [None] * self._trajectory_size
            grp = hf.get('trajectories')
            if (self._settings["print_levels"][self._settings["print_level"]] >= self._settings["print_levels"]['train']):
                print("Loading trajectory data")
            for i in range(min(self.history_size_Trajectory(), self.samplesTrajectory())):
                traj = []
                grp_ = grp.get('traj' + str(i))
                for it in range(8):
                    traj.append(np.array(grp_.get(str(it))))
                self._trajectory_history[i] = traj
        hf.close()
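

# A minimal usage sketch, assuming illustrative state/action sizes, a hypothetical
# output file name, and the settings keys used above ('float_type', 'discount_factor').
# Tuples are passed with a leading batch dimension of 1, and bounds are given as a
# [low, high] pair of rows, one entry per dimension.
if __name__ == "__main__":
    settings = {'float_type': 'float32', 'discount_factor': 0.99}
    exp = ExperienceMemory(state_length=3, action_length=2, memory_length=1000,
                           continuous_actions=True, settings=settings)
    exp.setStateBounds(np.array([[-1.0, -1.0, -1.0], [1.0, 1.0, 1.0]]))
    exp.setActionBounds(np.array([[-1.0, -1.0], [1.0, 1.0]]))
    exp.setRewardBounds(np.array([[0.0], [1.0]]))
    # Insert a few random tuples (exp_action=1 marks them as exploration actions).
    for _ in range(64):
        state = np.random.uniform(-1, 1, (1, 3))
        action = np.random.uniform(-1, 1, (1, 2))
        next_state = np.random.uniform(-1, 1, (1, 3))
        reward = np.random.uniform(0, 1, (1, 1))
        exp.insert(state, action, next_state, reward,
                   fall=[[0]], G_t=[[0]], exp_action=[[1]], advantage=[[0]])
    # Sample a normalized training batch.
    states, actions, result_states, rewards, falls, G_ts, exp_actions, advantages = exp.get_batch(batch_size=32)
    print(states.shape, actions.shape, rewards.shape)
    # Round-trip the buffer through HDF5 (file name is illustrative).
    exp.saveToFile("exp_memory.hdf5")
    exp.loadFromFile("exp_memory.hdf5")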