import random

import numpy as np


def act(self, data, t):
    # Epsilon-greedy action selection over the model's predicted action values.
    rate = self.get_exploration_rate(t)
    options = self.model.predict(data)
    options = np.squeeze(options)
    if random.random() < rate:
        # Explore: pick a random action, but still return the predicted options.
        action = random.randrange(self.action_size)
    else:
        # Exploit: pick the action with the highest predicted value.
        action = options.argmax()
    return action, options, rate
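
# `get_exploration_rate` is defined elsewhere in the agent class. Below is a
# minimal sketch of a typical exponentially-decaying epsilon schedule; the
# eps_start / eps_end / eps_decay constants are illustrative assumptions, not
# values taken from this gist.
def get_exploration_rate(self, t, eps_start=1.0, eps_end=0.05, eps_decay=200.0):
    # Decay from eps_start toward eps_end as the step count t grows.
    return eps_end + (eps_start - eps_end) * np.exp(-t / eps_decay)
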
def train(self):
    batch_size = 200
    t = 0  # time step within the current episode
    states, prob_actions, dlogps, drs, proj_data, reward_data = [], [], [], [], [], []
    tr_x, tr_y = [], []
    avg_reward = []
    reward_sum = 0
    ep_number = 0
    prev_state = None
    first_step = True
    new_state = self.value
    data_inp = self.data
    while ep_number < 3000000:
        prev_data = data_inp
        prev_state = new_state
        states.append(new_state)
        action, probs, rate = self.act(data_inp, t)
        prob_actions.append(probs)
        # One-hot encode the chosen action.
        y = np.zeros([self.action_size])
        y[action] = 1
        # Apply the chosen action; `command` maps action indices to expressions.
        new_state = eval(command[action])
        proj = projection(new_state, self.final_state)
        data_inp = [proj, action]
        data_inp = np.reshape(data_inp, (1, 1, len(data_inp)))
        tr_x.append(data_inp)
        # Reward shaping: intermediate steps use the projection/state reward,
        # the final step (t == 4) gets +1 on reaching the goal state, -1 otherwise.
        if t == 0:
            rw = reward(proj, 0)
            drs.append(rw)
            reward_sum += rw
        elif t < 4:
            rw = reward(new_state, self.final_state)
            drs.append(rw)
            print("present reward: ", rw)
            reward_sum += rw
        elif t == 4:
            if not np.allclose(new_state, self.final_state):
                rw = -1
            else:
                rw = 1
            drs.append(rw)
            reward_sum += rw
        print("reward till now: ", reward_sum)
        dlogps.append(np.array(y).astype('float32') * probs)
        print("dlogps before time step: ", len(dlogps))
        print("time step: ", t)
        del probs, action
        t += 1
        if t == 5 or np.allclose(new_state, self.final_state):  # episode done
            ep_number += 1
            ep_x = np.vstack(tr_x)  # states
            ep_dlogp = np.vstack(dlogps)
            ep_reward = np.vstack(drs)
            # Discount and normalise the episode rewards.
            disc_rw = discounted_reward(ep_reward, self.gamma)
            disc_rw = disc_rw.astype('float32')
            disc_rw -= np.mean(disc_rw)
            disc_rw /= np.std(disc_rw)
            tr_y_len = len(ep_dlogp)
            ep_dlogp *= disc_rw
            if ep_number % batch_size == 0:
                # Policy-gradient style update: nudge the predicted probabilities
                # in the direction of the reward-weighted action gradients.
                input_tr_y = prob_actions - self.learning_rate * ep_dlogp
                input_tr_y = np.reshape(input_tr_y, (tr_y_len, 1, 6))
                self.model.train_on_batch(ep_x, input_tr_y)
                tr_x, dlogps, drs, states, prob_actions, reward_data = [], [], [], [], [], []
            # Reset the environment and the input for the next episode.
            env = Environment()
            new_state = env.reset()
            proj = projection(new_state, self.final_state)
            data_inp = [proj, 5]
            data_inp = np.reshape(data_inp, (1, 1, len(data_inp)))
            print("State after resetting: ", new_state)
            t = 0
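
# `discounted_reward`, `projection`, `reward`, `Environment`, and `command` are
# defined elsewhere in the project. For reference, here is a minimal sketch of
# the usual discounted-return helper used in policy-gradient training; this is
# an assumption about its behaviour, not the gist's actual implementation.
def discounted_reward(rewards, gamma):
    # Walk backwards through the episode, accumulating gamma-discounted returns.
    discounted = np.zeros_like(rewards, dtype='float32')
    running = 0.0
    for i in reversed(range(len(rewards))):
        running = rewards[i] + gamma * running
        discounted[i] = running
    return discounted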