# experience memory buffer for RL
import numpy as np
import random
import h5py
from model.ModelUtil import validBounds, fixBounds, anneal_value, norm_state, norm_action, norm_reward, checkValidData
import copy
import sys


class ExperienceMemory(object):
    """
    Contains the recent history of experience tuples.

    The experience memory stores real (unnormalized) values from the simulation,
    not values that have already been normalized. This should make things easier
    down the road: if the model scale is adjusted later, none of the tuples
    already in memory need to be updated. A scale layer can also be added to the
    model to compensate for having to scale every tuple when performing training
    updates.
    """

    def __init__(self, state_length, action_length, memory_length, continuous_actions=False, settings=None, result_state_length=None):
        if (settings is None):
            self._settings = {}
            self._settings['discount_factor'] = 0.0
            self._settings['float_type'] = 'float32'  # clear() below reads 'float_type'
        else:
            self._settings = settings
        self._history_size = memory_length
        self._trajectory_size = int(memory_length / 100)
        if ("fd_expereince_length" in self._settings):
            self._trajectory_size = int(self._settings["fd_expereince_length"])
        self._state_length = state_length
        self._action_length = action_length
        self._continuous_actions = continuous_actions
        if (result_state_length is None):
            self._result_state_length = state_length
        else:
            self._result_state_length = result_state_length
        # self._settings = settings
        self._history_update_index = 0  # where the next experience should write
        self._samples = 0  # Number of inserts since last clear()
        self._inserts = 0  # total number of inserts
        self.clear()
        # self._state_history = theano.shared(np.zeros((self._history_size, state_length)))
        # self._action_history = theano.shared(np.zeros((self._history_size, action_length)))
        # self._nextState_history = theano.shared(np.zeros((self._history_size, state_length)))
        # self._reward_history = theano.shared(np.zeros((self._history_size, 1)))

    def clear(self):
        self._history_update_index = 0  # where the next experience should write
        self._samples = 0  # How many samples are in the buffer
        if (self._settings['float_type'] == 'float32'):
            self._state_history = np.zeros((self._history_size, self._state_length), dtype='float32')
            if self._continuous_actions:
                self._action_history = np.zeros((self._history_size, self._action_length), dtype='float32')
            else:
                self._action_history = np.zeros((self._history_size, self._action_length), dtype='int8')
            self._nextState_history = np.zeros((self._history_size, self._result_state_length), dtype='float32')
            self._reward_history = np.zeros((self._history_size, 1), dtype='float32')
            self._fall_history = np.zeros((self._history_size, 1), dtype='int8')
            self._discounted_sum_history = np.zeros((self._history_size, 1), dtype='float32')
            self._advantage_history = np.zeros((self._history_size, 1), dtype='float32')
            self._exp_action_history = np.zeros((self._history_size, 1), dtype='int8')
        else:
            self._state_history = np.zeros((self._history_size, self._state_length), dtype='float64')
            if self._continuous_actions:
                self._action_history = np.zeros((self._history_size, self._action_length), dtype='float64')
            else:
                self._action_history = np.zeros((self._history_size, self._action_length), dtype='int8')
            self._nextState_history = np.zeros((self._history_size, self._result_state_length), dtype='float64')
            self._reward_history = np.zeros((self._history_size, 1), dtype='float64')
            self._fall_history = np.zeros((self._history_size, 1), dtype='int8')
            self._discounted_sum_history = np.zeros((self._history_size, 1), dtype='float64')
            self._advantage_history = np.zeros((self._history_size, 1), dtype='float64')
            self._exp_action_history = np.zeros((self._history_size, 1), dtype='int8')
        self._trajectory_history = [None] * self._trajectory_size
        self._samplesTrajectory = 0
        self._insertsTrajectory = 0
        self._trajectory_update_index = 0
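
    # The arrays allocated in clear() act as a fixed-size circular buffer:
    # insert() writes at _history_update_index, which wraps back to 0 once it
    # reaches _history_size, so the oldest tuples are overwritten first.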

    def insertsTrajectory(self):
        return self._insertsTrajectory

    def samplesTrajectory(self):
        return self._samplesTrajectory

    def history_size_Trajectory(self):
        return self._trajectory_size

    def _insertTrajectory(self, trajectory):
        if (self._trajectory_update_index >= self.history_size_Trajectory()):
            self._trajectory_update_index = 0
            # print("Reset history index in exp buffer:")
        self._trajectory_history[self._trajectory_update_index] = trajectory
        self._insertsTrajectory += 1
        self._trajectory_update_index += 1
        self._samplesTrajectory += 1

    def insertTrajectory(self, states, actions, result_states, rewards, falls, G_ts, advantage, exp_actions):
        self._insertTrajectory([states, actions, result_states, rewards, falls, G_ts, advantage, exp_actions])
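
    # Usage sketch (argument shapes are an assumption, not verified against the
    # wider framework): each argument to insertTrajectory() is a whole sequence,
    # e.g. states of shape (T, state_length), so that a call like
    #   exp.insertTrajectory(states, actions, result_states, rewards,
    #                        falls, G_ts, advantage, exp_actions)
    # stores one trajectory per slot of the circular trajectory buffer.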

    def get_multitask_trajectory_batch(self, batch_size=4, excludeActionTypes=[]):
        state_, action_, resultState_, reward_, fall_, G_ts_, exp_actions_, advantage_ = self.get_trajectory_batch(batch_size=batch_size, excludeActionTypes=excludeActionTypes, cast=False)
        ### Find length of shortest trajectory...
        shortest_traj = 10000000
        for t in range(len(state_)):
            if len(state_[t]) < shortest_traj:
                shortest_traj = len(state_[t])
        ### Make all trajectories as long as the shortest one...
        for t in range(len(state_)):
            state_[t] = state_[t][:shortest_traj]
            action_[t] = action_[t][:shortest_traj]
            resultState_[t] = resultState_[t][:shortest_traj]
            reward_[t] = reward_[t][:shortest_traj]
            fall_[t] = fall_[t][:shortest_traj]
            G_ts_[t] = G_ts_[t][:shortest_traj]
            exp_actions_[t] = exp_actions_[t][:shortest_traj]
            advantage_[t] = advantage_[t][:shortest_traj]
        state_ = np.array(state_, dtype=self._settings['float_type'])
        if (self._continuous_actions):
            action_ = np.array(action_, dtype=self._settings['float_type'])
        else:
            action_ = np.array(action_, dtype='int8')
        resultState_ = np.array(resultState_, dtype=self._settings['float_type'])
        reward_ = np.array(reward_, dtype=self._settings['float_type'])
        G_ts_ = np.array(G_ts_, dtype=self._settings['float_type'])
        advantage_ = np.array(advantage_, dtype=self._settings['float_type'])
        fall_ = np.array(fall_, dtype='int8')
        exp_actions_ = np.array(exp_actions_, dtype='int8')
        return (state_, action_, resultState_, reward_, fall_, G_ts_, exp_actions_, advantage_)
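
    # Truncating every trajectory to the shortest one keeps the lists
    # rectangular, so the np.array() casts above yield dense
    # (batch_size, shortest_traj, dim) tensors (assuming each stored trajectory
    # is a (T, dim) sequence).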

    def get_trajectory_batch(self, batch_size=4, excludeActionTypes=[], cast=True):
        """
        Requires len(experience) > batch_size.
        """
        # assert batch_size <= self._history_size, "batch_size <= self._history_size: " + str(batch_size) + " <= " + str(self._history_size)
        assert batch_size <= self.samplesTrajectory(), "batch_size <= self.samplesTrajectory(): " + str(batch_size) + " <= " + str(self.samplesTrajectory())
        # indices = list(nprnd.randint(low=0, high=len(experience), size=batch_size))
        max_size = min(self.history_size_Trajectory(), self.samplesTrajectory())
        # print ("Indicies: ", indices)
        # print("Exp buff state bounds: ", self.getStateBounds())
        state = []
        action = []
        resultState = []
        reward = []
        fall = []
        G_ts = []
        exp_actions = []
        advantage = []
        indices = set([])
        trys = 0
        ### collect batch and try at most 3 times the batch size for valid tuples
        while len(indices) < batch_size and (trys < batch_size * 3):
            # for i in indices:
            trys = trys + 1
            i = (random.sample(set(range(0, max_size)) - indices, 1))[0]
            ## skip tuples that were not exploration actions
            if (self._exp_action_history[i] in excludeActionTypes):
                continue
            indices.add(i)
            assert self._trajectory_history[i] is not None, "self._trajectory_history[" + str(i) + "] is not None: " + str(self._trajectory_history[i]) + " state shape: " + str(np.asarray(state).shape)
            # print ("states shape: ", np.array(self._trajectory_history[i][0]))
            # print ("states bounds shape: ", np.array(self.getStateBounds()))
            state.append(norm_state(self._trajectory_history[i][0], self.getStateBounds()))
            # print("Action pulled out: ", self._action_history[i])
            action.append(norm_action(self._trajectory_history[i][1], self.getActionBounds()))  # won't work for discrete actions...
            resultState.append(norm_state(self._trajectory_history[i][2], self.getResultStateBounds()))
            reward.append(norm_state(self._trajectory_history[i][3], self.getRewardBounds()) * (1.0 - self._settings['discount_factor']))  # scale rewards
            fall.append(self._trajectory_history[i][4])
            G_ts.append(self._trajectory_history[i][5])
            advantage.append(self._trajectory_history[i][6])
            exp_actions.append(self._trajectory_history[i][7])
        # print c
        # print experience[indices]
        ### All sequences must be the same length for this to work
        if (cast):
            state = np.array(state, dtype=self._settings['float_type'])
            if (self._continuous_actions):
                action = np.array(action, dtype=self._settings['float_type'])
            else:
                action = np.array(action, dtype='int8')
            resultState = np.array(resultState, dtype=self._settings['float_type'])
            reward = np.array(reward, dtype=self._settings['float_type'])
            G_ts = np.array(G_ts, dtype=self._settings['float_type'])
            advantage = np.array(advantage, dtype=self._settings['float_type'])
            fall = np.array(fall, dtype='int8')
            exp_actions = np.array(exp_actions, dtype='int8')
        # assert state.shape == (len(indices), self._state_length), "state.shape == (len(indices), self._state_length): " + str(state.shape) + " == " + str((len(indices), self._state_length))
        # assert action.shape == (len(indices), self._action_length), "action.shape == (len(indices), self._action_length): " + str(action.shape) + " == " + str((len(indices), self._action_length))
        # assert resultState.shape == (len(indices), self._result_state_length), "resultState.shape == (len(indices), self._result_state_length): " + str(resultState.shape) + " == " + str((len(indices), self._result_state_length))
        # assert reward.shape == (len(indices), 1), "reward.shape == (len(indices), 1): " + str(reward.shape) + " == " + str((len(indices), 1))
        # assert G_ts.shape == (len(indices), 1), "G_ts.shape == (len(indices), 1): " + str(G_ts.shape) + " == " + str((len(indices), 1))
        # assert fall.shape == (len(indices), 1), "fall.shape == (len(indices), 1): " + str(fall.shape) + " == " + str((len(indices), 1))
        # assert exp_actions.shape == (len(indices), 1), "exp_actions.shape == (len(indices), 1): " + str(exp_actions.shape) + " == " + str((len(indices), 1))
        # assert advantage.shape == (len(indices), 1), "advantage.shape == (len(indices), 1): " + str(advantage.shape) + " == " + str((len(indices), 1))
        # assert len(np.unique(indices)[0]) == batch_size, "np.unique(indices).shape[0] == batch_size: " + str(np.unique(indices).shape[0]) + " == " + str(batch_size)
        return (state, action, resultState, reward, fall, G_ts, exp_actions, advantage)
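
    # Note: rewards returned by the batch methods are normalized with
    # norm_state() and then scaled by (1 - discount_factor); for example, with
    # discount_factor = 0.99 a normalized reward of 1.0 is returned as 0.01.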

    def insertTuple(self, tuple):
        (state, action, nextState, reward, fall, G_t, exp_action, advantage) = tuple
        self.insert(state, action, nextState, reward, fall, G_t, exp_action, advantage)

    def insert(self, state, action, nextState, reward, fall=[[0]], G_t=[[0]], exp_action=[[0]], advantage=[[0]]):
        # print "Insert State: " + str(state)
        # state = list(state)
        assert len(state[0]) == self._state_length, "len(state[0]) == self._state_length: " + str(self._state_length) + " state shape: " + str(np.asarray(state).shape)
        assert len(action[0]) == self._action_length, "len(action[0]) == self._action_length: " + str(action)
        assert len(nextState[0]) == self._result_state_length, "len(nextState[0]) == self._result_state_length: " + str(self._result_state_length) + " == " + str(np.asarray(nextState).shape)
        assert len(reward[0]) == 1
        assert len(fall[0]) == 1
        assert len(G_t[0]) == 1
        assert len(exp_action[0]) == 1
        """
        state = list(state)
        action = list(action)
        nextState = list(nextState)
        reward = list(reward)
        nums = state+action+nextState+reward
        """
        if (checkValidData(state, action, nextState, reward) == False):
            print ("Failed inserting bad tuple: ")
            return
        if (self._history_update_index >= self._history_size):
            self._history_update_index = 0
            # print("Reset history index in exp buffer:")
        # print ("Tuple: " + str(state) + ", " + str(action) + ", " + str(nextState) + ", " + str(reward))
        # print ("action type: ", self._action_history.dtype)
        self._state_history[self._history_update_index] = copy.deepcopy(np.array(state))
        self._action_history[self._history_update_index] = copy.deepcopy(np.array(action))
        # print("inserted action: ", self._action_history[self._history_update_index])
        self._nextState_history[self._history_update_index] = copy.deepcopy(np.array(nextState))
        self._reward_history[self._history_update_index] = copy.deepcopy(np.array(reward))
        self._fall_history[self._history_update_index] = copy.deepcopy(np.array(fall))
        self._discounted_sum_history[self._history_update_index] = copy.deepcopy(np.array(G_t))
        self._advantage_history[self._history_update_index] = copy.deepcopy(np.array(advantage))
        self._exp_action_history[self._history_update_index] = copy.deepcopy(np.array(exp_action))
        # print ("fall: ", fall)
        # print ("self._fall_history: ", self._fall_history[self._history_update_index])
        self._inserts += 1
        self._history_update_index += 1
        self._samples += 1
        self.updateScalling(state, action, nextState, reward)
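
    # Usage sketch (hypothetical sizes): every argument to insert() is a
    # batch-of-one 2D structure, e.g. with state_length=2 and action_length=1:
    #   exp.insert([[0.1, 0.2]], [[0.5]], [[0.15, 0.25]], [[1.0]])
    # The asserts above enforce exactly this nesting.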

    def inserts(self):
        return self._inserts

    def samples(self):
        return self._samples

    def history_size(self):
        return self._history_size

    def updateScalling(self, state, action, nextState, reward):
        if (self.inserts() == 1):
            self._state_mean = self._state_history[0]
            self._state_var = np.zeros_like(state)
            self._reward_mean = self._reward_history[0]
            self._reward_var = np.zeros_like(reward)
            self._action_mean = self._action_history[0]
            self._action_var = np.zeros_like(action)
        else:
            x_mean_old = self._state_mean
            self._state_mean = self._state_mean + ((state - self._state_mean) / self.inserts())
            reward_mean_old = self._reward_mean
            self._reward_mean = self._reward_mean + ((reward - self._reward_mean) / self.inserts())
            action_mean_old = self._action_mean
            self._action_mean = self._action_mean + ((action - self._action_mean) / self.inserts())
            if (self.inserts() == 2):
                self._state_var = (self._state_history[1] - ((self._state_history[0] + self._state_history[1]) / 2.0)**2) / 2.0
                self._reward_var = (self._reward_history[1] - ((self._reward_history[0] + self._reward_history[1]) / 2.0)**2) / 2.0
                self._action_var = (self._action_history[1] - ((self._action_history[0] + self._action_history[1]) / 2.0)**2) / 2.0
            elif (self.inserts() > 2):
                self._state_var = (((self.inserts() - 2) * self._state_var) + ((self.inserts() - 1) * (x_mean_old - self._state_mean)**2) + ((state - self._state_mean)**2))
                self._state_var = (self._state_var / float(self.inserts() - 1))
                self._reward_var = (((self.inserts() - 2) * self._reward_var) + ((self.inserts() - 1) * (reward_mean_old - self._reward_mean)**2) + ((reward - self._reward_mean)**2))
                self._reward_var = (self._reward_var / float(self.inserts() - 1))
                self._action_var = (((self.inserts() - 2) * self._action_var) + ((self.inserts() - 1) * (action_mean_old - self._action_mean)**2) + ((action - self._action_mean)**2))
                self._action_var = (self._action_var / float(self.inserts() - 1))
        # if ( 'state_normalization' in self._settings and self._settings["state_normalization"] == "adaptive"):
        #     self._updateScaling()
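
    # updateScalling() maintains running (online) estimates of the mean and
    # variance of states, actions and rewards, in the spirit of Welford's
    # online algorithm: on every insert the mean is nudged by (x - mean) / n
    # and the variance is updated from the old and new means. These running
    # statistics are what _updateScaling() turns into normalization bounds.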

    def _updateScaling(self):
        scale_factor = 1.0
        # state_std = np.maximum(np.sqrt(self._state_var[0]), 0.05)
        state_std = np.sqrt(self._state_var[0])
        # print("Running mean: ", self._state_mean)
        # print("Running std: ", state_std)
        low = self._state_mean[0] - (state_std * scale_factor)
        high = self._state_mean[0] + (state_std * scale_factor)
        # self.setStateBounds(np.array([low, high]))
        self.setStateBounds(fixBounds(np.array([low, high])))
        # print("New scaling parameters: ", self.getStateBounds())
        # print("Running reward mean: ", self._reward_mean)
        # print("Running reward std: ", np.sqrt(self._reward_var))
        low = self._reward_mean[0] - (np.sqrt(self._reward_var[0]) * scale_factor)
        high = self._reward_mean[0] + (np.sqrt(self._reward_var[0]) * scale_factor)
        self.setRewardBounds(np.array([low, high]))
        # print("New scaling parameters: ", self.getStateBounds())
        """
        low = self._action_mean[0] - np.sqrt(self._action_var[0])
        high = self._action_mean[0] + np.sqrt(self._action_var[0])
        self.setActionBounds(np.array([low, high]))
        """

    def get_exporation_action_batch(self, batch_size=32):
        return self.get_batch(batch_size=batch_size, excludeActionTypes=[0])

    def getNonMBAEBatch(self, batch_size=32):
        """
        Avoids training critic on MBAE actions.
        """
        return self.get_batch(batch_size=batch_size, excludeActionTypes=[2])

    def get_batch(self, batch_size=32, excludeActionTypes=[]):
        """
        Requires len(experience) > batch_size.
        """
        # assert batch_size <= self._history_size, "batch_size <= self._history_size: " + str(batch_size) + " <= " + str(self._history_size)
        assert batch_size <= self.samples(), "batch_size <= self.samples(): " + str(batch_size) + " <= " + str(self.samples())
        # indices = list(nprnd.randint(low=0, high=len(experience), size=batch_size))
        max_size = min(self._history_size, self.samples())
        # print ("Indicies: ", indices)
        # print("Exp buff state bounds: ", self.getStateBounds())
        state = []
        action = []
        resultState = []
        reward = []
        fall = []
        G_ts = []
        exp_actions = []
        advantage = []
        indices = set([])
        trys = 0
        ### collect batch and try at most 5 times the batch size for valid tuples
        while len(indices) < batch_size and (trys < batch_size * 5):
            # for i in indices:
            trys = trys + 1
            i = (random.sample(set(range(0, max_size)) - indices, 1))[0]
            ## skip tuples that were not exploration actions
            if (self._exp_action_history[i] in excludeActionTypes):
                continue
            ### Or if multitasking and we only want to train the policy on a single task
            # print ("self._fall_history[i]: ", self._fall_history[i])
            if ("sim_config_file" in self._settings
                    and (type(self._settings["sim_config_file"]) is list)):
                if ("worker_to_task_mapping" in self._settings
                        and (self._settings["worker_to_task_mapping"][self._fall_history[i][0]] != 0)):
                    # print ("skipping non desired task tuple")
                    continue
            indices.add(i)
            if (('disable_parameter_scaling' in self._settings) and (self._settings['disable_parameter_scaling'])):
                # state.append(self._state_history[i])
                state.append(norm_state(self._state_history[i], self.getStateBounds()))
                # print("Action pulled out: ", self._action_history[i])
                action.append(self._action_history[i])  # won't work for discrete actions...
                # action.append(norm_action(self._action_history[i], self.getActionBounds()))  # won't work for discrete actions...
                resultState.append(norm_state(self._nextState_history[i], self.getResultStateBounds()))
                # resultState.append(self._nextState_history[i])
                reward.append(norm_state(self._reward_history[i], self.getRewardBounds()) * (1.0 - self._settings['discount_factor']))  # scale rewards
            else:
                state.append(norm_state(self._state_history[i], self.getStateBounds()))
                # print("Action pulled out: ", self._action_history[i])
                action.append(norm_action(self._action_history[i], self.getActionBounds()))  # won't work for discrete actions...
                resultState.append(norm_state(self._nextState_history[i], self.getResultStateBounds()))
                reward.append(norm_state(self._reward_history[i], self.getRewardBounds()) * (1.0 - self._settings['discount_factor']))  # scale rewards
            fall.append(self._fall_history[i])
            G_ts.append(self._discounted_sum_history[i])
            advantage.append(self._advantage_history[i])
            exp_actions.append(self._exp_action_history[i])
        # print c
        # print experience[indices]
        if (self._settings['float_type'] == 'float32'):
            state = np.array(state, dtype='float32')
            if (self._continuous_actions):
                action = np.array(action, dtype='float32')
            else:
                action = np.array(action, dtype='int8')
            resultState = np.array(resultState, dtype='float32')
            reward = np.array(reward, dtype='float32')
            # fall = np.array(fall, dtype='int8')
            G_ts = np.array(G_ts, dtype='float32')
            advantage = np.array(advantage, dtype='float32')
        else:
            state = np.array(state, dtype='float64')
            if (self._continuous_actions):
                action = np.array(action, dtype='float64')
            else:
                action = np.array(action, dtype='int8')
            resultState = np.array(resultState, dtype='float64')
            reward = np.array(reward, dtype='float64')
            G_ts = np.array(G_ts, dtype='float64')
            advantage = np.array(advantage, dtype='float64')
        fall = np.array(fall, dtype='int8')
        exp_actions = np.array(exp_actions, dtype='int8')
        assert state.shape == (len(indices), self._state_length), "state.shape == (len(indices), self._state_length): " + str(state.shape) + " == " + str((len(indices), self._state_length))
        assert action.shape == (len(indices), self._action_length), "action.shape == (len(indices), self._action_length): " + str(action.shape) + " == " + str((len(indices), self._action_length))
        assert resultState.shape == (len(indices), self._result_state_length), "resultState.shape == (len(indices), self._result_state_length): " + str(resultState.shape) + " == " + str((len(indices), self._result_state_length))
        assert reward.shape == (len(indices), 1), "reward.shape == (len(indices), 1): " + str(reward.shape) + " == " + str((len(indices), 1))
        assert G_ts.shape == (len(indices), 1), "G_ts.shape == (len(indices), 1): " + str(G_ts.shape) + " == " + str((len(indices), 1))
        assert fall.shape == (len(indices), 1), "fall.shape == (len(indices), 1): " + str(fall.shape) + " == " + str((len(indices), 1))
        assert exp_actions.shape == (len(indices), 1), "exp_actions.shape == (len(indices), 1): " + str(exp_actions.shape) + " == " + str((len(indices), 1))
        assert advantage.shape == (len(indices), 1), "advantage.shape == (len(indices), 1): " + str(advantage.shape) + " == " + str((len(indices), 1))
        # assert len(np.unique(indices)[0]) == batch_size, "np.unique(indices).shape[0] == batch_size: " + str(np.unique(indices).shape[0]) + " == " + str(batch_size)
        return (state, action, resultState, reward, fall, G_ts, exp_actions, advantage)
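
    # Typical use (sketch): once samples() >= batch_size,
    #   state, action, next_state, reward, fall, G_ts, exp_act, adv = exp.get_batch(32)
    # returns normalized arrays of shape (32, state_length), (32, action_length),
    # (32, result_state_length) and (32, 1) respectively, as checked by the
    # asserts above.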

    def setStateBounds(self, _state_bounds):
        self._state_bounds = _state_bounds
        self.setResultStateBounds(_state_bounds)

    def setRewardBounds(self, _reward_bounds):
        self._reward_bounds = _reward_bounds

    def setActionBounds(self, _action_bounds):
        self._action_bounds = _action_bounds

    def setResultStateBounds(self, _result_state_bounds):
        self._result_state_bounds = _result_state_bounds

    def getStateBounds(self):
        return self._state_bounds

    def getRewardBounds(self):
        return self._reward_bounds

    def getActionBounds(self):
        return self._action_bounds

    def getResultStateBounds(self):
        return self._result_state_bounds

    def setSettings(self, settings):
        self._settings = settings

    def getSettings(self):
        return self._settings

    def saveToFile(self, filename):
        hf = h5py.File(filename, "w")
        hf.create_dataset('_state_history', data=self._state_history)
        hf.create_dataset('_action_history', data=self._action_history)
        hf.create_dataset('_next_state_history', data=self._nextState_history)
        hf.create_dataset('_reward_history', data=self._reward_history)
        hf.create_dataset('_fall_history', data=self._fall_history)
        hf.create_dataset('_discounted_sum_history', data=self._discounted_sum_history)
        hf.create_dataset('_advantage_history', data=self._advantage_history)
        hf.create_dataset('_exp_action_history', data=self._exp_action_history)
        hf.create_dataset('_history_size', data=[self._history_size])
        hf.create_dataset('_history_update_index', data=[self._history_update_index])
        hf.create_dataset('_inserts', data=[self._inserts])
        hf.create_dataset('_samples', data=[self._samples])
        hf.create_dataset('_state_length', data=[self._state_length])
        hf.create_dataset('_action_length', data=[self._action_length])
        hf.create_dataset('_result_state_length', data=[self._result_state_length])
        hf.create_dataset('_state_bounds', data=self._state_bounds)
        hf.create_dataset('_reward_bounds', data=self._reward_bounds)
        hf.create_dataset('_action_bounds', data=self._action_bounds)
        hf.create_dataset('_result_state_bounds', data=self._result_state_bounds)
        ### Adaptive scaling values
        hf.create_dataset('_state_mean', data=self._state_mean)
        hf.create_dataset('_state_var', data=self._state_var)
        hf.create_dataset('_reward_mean', data=self._reward_mean)
        hf.create_dataset('_reward_var', data=self._reward_var)
        hf.create_dataset('_action_mean', data=self._action_mean)
        hf.create_dataset('_action_var', data=self._action_var)
        ### Save a variable length list of data
        # data = np.array(self._trajectory_history, dtype=object)
        if ((("train_LSTM_FD" in self._settings)
             and (self._settings["train_LSTM_FD"] == True))
            or
            (("train_LSTM_Reward" in self._settings)
             and (self._settings["train_LSTM_Reward"] == True))
            ):
            grp = hf.create_group('trajectories')
            if (self._settings["print_levels"][self._settings["print_level"]] >= self._settings["print_levels"]['train']):
                print ("Saving trajectory data")
            for i in range(min(self.history_size_Trajectory(), self.samplesTrajectory())):
                traj = self._trajectory_history[i]
                # print (i, traj)
                if (traj is not None):
                    grp_ = grp.create_group('traj' + str(i))
                    for it in range(len(traj)):
                        grp_.create_dataset(str(it), data=np.array(traj[it]))
                else:
                    break
            hf.create_dataset('_trajectory_size', data=[self._trajectory_size])
            hf.create_dataset('_trajectory_update_index', data=[self._trajectory_update_index])
            hf.create_dataset('_insertsTrajectory', data=[self._insertsTrajectory])
            hf.create_dataset('_samplesTrajectory', data=[self._samplesTrajectory])
        hf.flush()
        hf.close()
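
    # Round trip (sketch; "exp_mem.hdf5" is just an example filename):
    # exp.saveToFile("exp_mem.hdf5") followed by exp.loadFromFile("exp_mem.hdf5")
    # restores the histories, counters, bounds and running scaling statistics;
    # trajectory data is only written and read when train_LSTM_FD or
    # train_LSTM_Reward is enabled in the settings.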

    def loadFromFile(self, filename):
        hf = h5py.File(filename, 'r')
        self._state_history = np.array(hf.get('_state_history'))
        self._action_history = np.array(hf.get('_action_history'))
        self._nextState_history = np.array(hf.get('_next_state_history'))
        self._reward_history = np.array(hf.get('_reward_history'))
        self._fall_history = np.array(hf.get('_fall_history'))
        self._discounted_sum_history = np.array(hf.get('_discounted_sum_history'))
        self._advantage_history = np.array(hf.get('_advantage_history'))
        self._exp_action_history = np.array(hf.get('_exp_action_history'))
        self._history_size = int(hf.get('_history_size')[()])
        self._history_update_index = int(hf.get('_history_update_index')[()])
        self._inserts = int(hf.get('_inserts')[()])
        self._samples = int(hf.get('_samples')[()])
        self._state_length = int(hf.get('_state_length')[()])
        self._action_length = int(hf.get('_action_length')[()])
        self._result_state_length = int(hf.get('_result_state_length')[()])
        self._state_bounds = np.array(hf.get('_state_bounds'))
        self._reward_bounds = np.array(hf.get('_reward_bounds'))
        self._action_bounds = np.array(hf.get('_action_bounds'))
        self._result_state_bounds = np.array(hf.get('_result_state_bounds'))
        ### Adaptive scaling values
        self._state_mean = np.array(hf.get('_state_mean'))
        self._state_var = np.array(hf.get('_state_var'))
        self._reward_mean = np.array(hf.get('_reward_mean'))
        self._reward_var = np.array(hf.get('_reward_var'))
        self._action_mean = np.array(hf.get('_action_mean'))
        self._action_var = np.array(hf.get('_action_var'))
        if ((("train_LSTM_FD" in self._settings)
             and (self._settings["train_LSTM_FD"] == True))
            or
            (("train_LSTM_Reward" in self._settings)
             and (self._settings["train_LSTM_Reward"] == True))
            ):
            self._trajectory_size = int(hf.get('_trajectory_size')[()])
            self._trajectory_update_index = int(hf.get('_trajectory_update_index')[()])
            self._insertsTrajectory = int(hf.get('_insertsTrajectory')[()])
            self._samplesTrajectory = int(hf.get('_samplesTrajectory')[()])
            grp = hf.get('trajectories')
            if (self._settings["print_levels"][self._settings["print_level"]] >= self._settings["print_levels"]['train']):
                print ("Loading trajectory data")
            for i in range(min(self.history_size_Trajectory(), self.samplesTrajectory())):
                # print (i)
                traj = []
                grp_ = grp.get('traj' + str(i))
                for it in range(8):
                    traj.append(np.array(grp_.get(str(it))))
                self._trajectory_history[i] = traj
        hf.close()
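

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the original gist).
# It assumes the model.ModelUtil helpers imported above are available, that
# norm_state/norm_action take (value, bounds) with bounds shaped (2, dim) as
# [lows, highs], and that checkValidData() accepts finite numeric data.
# The 'sim_config_file' entry is a hypothetical placeholder; get_batch() only
# checks whether it is a list (the multi-task case).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    settings = {'discount_factor': 0.99,
                'float_type': 'float32',
                'sim_config_file': "dummy_sim.json"}
    exp = ExperienceMemory(state_length=2, action_length=1, memory_length=1000,
                           continuous_actions=True, settings=settings)
    # Bounds: first row is the per-dimension lower bound, second row the upper bound.
    exp.setStateBounds(np.array([[-1.0, -1.0], [1.0, 1.0]]))
    exp.setActionBounds(np.array([[-1.0], [1.0]]))
    exp.setRewardBounds(np.array([[0.0], [1.0]]))
    # Fill the buffer with random transitions, one batch-of-one tuple at a time.
    for t in range(64):
        s = np.random.uniform(-1.0, 1.0, (1, 2))
        a = np.random.uniform(-1.0, 1.0, (1, 1))
        s_next = np.clip(s + 0.01 * a, -1.0, 1.0)
        r = [[float(np.random.uniform(0.0, 1.0))]]
        exp.insert(s, a, s_next, r, fall=[[0]], G_t=[[0]], exp_action=[[1]], advantage=[[0]])
    # Sample a normalized training batch and inspect its shape.
    batch = exp.get_batch(batch_size=32)
    print("sampled state batch shape:", batch[0].shape)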