AurelianTactics

# critic loss
# self.critic_target is the TensorFlow placeholder that self.target_Q is fed into
normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                               self.return_range[0], self.return_range[1])
if self.td3_variant:
    # TD3 clipped double-Q: both critics regress toward the same target
    logger.info('using TD3 variant loss')
    self.critic_loss = tf.losses.mean_squared_error(normalized_critic_target_tf, self.normalized_critic_tf) \
        + tf.losses.mean_squared_error(normalized_critic_target_tf, self.normalized_critic_tf2)
else:
    # standard DDPG critic loss
    self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))

# delayed policy updates: step the actor only every td3_policy_freq training iterations
if train_iter % self.td3_policy_freq == 0:
    self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

# target networks are also soft-updated at the delayed frequency
if train_iter % self.td3_policy_freq == 0:
    self.sess.run(self.actor_target_soft_updates)
    self.sess.run(self.critic_target_soft_updates)

# target policy smoothing: clipped Gaussian noise added to the sampled actions
if self.td3_policy_noise > 0:
    noise = np.random.normal(loc=0.0, scale=self.td3_policy_noise, size=np.shape(batch['actions']))
    noise = np.clip(noise, -self.td3_noise_clip, self.td3_noise_clip)

# Get all gradients and perform a synced update.
ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
    self.obs0: batch['obs0'],
    self.actions: np.clip(batch['actions'] + noise, self.action_range[0], self.action_range[1]),
    self.critic_target: target_Q,
})
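
The snippet does not show how target_Q is formed before it is fed into self.critic_target. A minimal sketch of the usual TD3 target in this DDPG-style setup, where Q_obs1_1 and Q_obs1_2 stand in for the two target critics evaluated at the target actor's smoothed action (both names are assumptions, not from the gist):

# clipped double-Q: bootstrap from the smaller of the two target critic estimates
Q_obs1 = tf.minimum(Q_obs1_1, Q_obs1_2)
# baselines DDPG-style bootstrap target, zeroed at terminal transitions
self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1
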
AurelianTactics / multiple_env.py
Created December 9, 2018 02:55
Baselines Multiple Env
# create a vector of multiple environments
# cleaner make_env example here: https://github.com/openai/baselines/blob/6e607efa905a5d5aedd8260afaecb5ad981d713c/baselines/common/cmd_util.py
def make_vec_env(args, time_int, start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv
    """
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed + 10000 * mpi_rank if args.seed is not None else None
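
The preview cuts off before the environment constructor. A minimal sketch of how a vectorized env like this typically finishes, following the baselines cmd_util pattern linked above (make_vec_env_sketch, env_id, and num_env are illustrative names, not taken from the gist):

import os

import gym
from baselines import logger
from baselines.bench import Monitor
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_vec_env_sketch(env_id, num_env, seed=None, start_index=0):
    """Illustrative only: build a SubprocVecEnv of monitored copies of env_id."""
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            if seed is not None:
                env.seed(seed + rank)
            # Monitor records episode rewards/lengths to a per-worker log file
            env = Monitor(env, os.path.join(logger.get_dir() or '.', str(rank)))
            return env
        return _thunk

    # one subprocess per environment copy
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
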
AurelianTactics / custom_model.py
Last active December 9, 2018 03:29
Custom Model for Baselines
from baselines.common.models import register
from baselines.a2c import utils
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch

# define your network. this is the Nature CNN with tf.nn.leaky_relu instead of relu
def custom_cnn(unscaled_images, **conv_kwargs):
    scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
    activ = tf.nn.leaky_relu
    h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2),
                   **conv_kwargs))
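
A minimal sketch of how a network like this is usually hooked into baselines' registry so it can be selected by name; the custom_cnn_builder name and the command line below are assumptions, and the module defining the network has to be imported for the @register decorator to run:

@register("custom_cnn")
def custom_cnn_builder(**conv_kwargs):
    # baselines expects the builder to return a network_fn that maps the
    # observation tensor to a feature tensor (custom_cnn above does that)
    def network_fn(X):
        return custom_cnn(X, **conv_kwargs)
    return network_fn

# then, for example:
#   python -m baselines.run --alg=ppo2 --env=BreakoutNoFrameskip-v4 --network=custom_cnn
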
AurelianTactics / trfl_q_learning_tabular_example.py
Created December 31, 2018 04:35
trfl_q_learning_tabular_example.py
# simple tabular Q-learning example on CartPole using trfl.qlearning
num_actions = env.action_space.n
batch_size = 1
qt_tab = tf.placeholder(dtype=tf.float32, shape=[batch_size, num_actions], name="qt_tab")
qt_next_tab = tf.placeholder(dtype=tf.float32, shape=[batch_size, num_actions], name="qt_next_tab")
reward_tab = tf.placeholder(dtype=tf.float32, shape=[batch_size], name="reward_tab")
action_tab = tf.placeholder(dtype=tf.int32, shape=[batch_size], name="action_tab")
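
These placeholders presumably feed a trfl.qlearning op named q_learning_tab, since that op is run in the next gist. A minimal sketch, with the per-step discount placeholder pcont_tab added as an assumed extra input that trfl.qlearning requires:

import trfl

pcont_tab = tf.placeholder(dtype=tf.float32, shape=[batch_size], name="pcont_tab")

# trfl.qlearning(q_tm1, a_tm1, r_t, pcont_t, q_t) returns a (loss, extra) pair;
# extra.td_error is r_t + pcont_t * max_a q_t(a) - q_tm1[a_tm1]
q_learning_tab = trfl.qlearning(qt_tab, action_tab, reward_tab, pcont_tab, qt_next_tab)
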
AurelianTactics / trfl_tabular_q_learning.py
Created December 31, 2018 04:37
trfl_tabular_q_learning.py
# standard Q-learning update
# max_q_value = np.max(q_table[next_obs_vel_index, next_obs_angle_index, :])
# q_table[obs_vel_index, obs_angle_index, action] = q_table[obs_vel_index, obs_angle_index, action] \
#     + alpha * (reward + gamma * max_q_value - q_table[obs_vel_index, obs_angle_index, action])

# with trfl.qlearning
qlearning_output = sess.run([q_learning_tab],
                            feed_dict={qt_tab: np.expand_dims(q_table[obs_vel_index, obs_angle_index, :], axis=0),
                                       qt_next_tab: np.expand_dims(q_table[next_obs_vel_index, next_obs_angle_index, :], axis=0),
                                       reward_tab: np.expand_dims(reward, axis=0),
                                       action_tab: np.expand_dims(action, axis=0)})
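
Assuming q_learning_tab is the (loss, extra) pair returned by trfl.qlearning as sketched after the previous gist, the tabular update would then apply the returned TD error with learning rate alpha:

# equivalent to the commented-out manual update above
td_error = qlearning_output[0].extra.td_error[0]
q_table[obs_vel_index, obs_angle_index, action] += alpha * td_error
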
AurelianTactics / trfl_q_learning_fa.py
Last active December 31, 2018 04:47
trfl_q_learning_fa.py
# standard way from tutorial: https://github.com/udacity/deep-learning/blob/master/reinforcement/Q-learning-cart.ipynb
# self.Q = tf.reduce_sum(tf.multiply(self.output, one_hot_actions), axis=1)
# self.loss = tf.reduce_mean(tf.square(self.targetQs_ - self.Q))
# self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

# TRFL way
self.targetQs_ = tf.placeholder(tf.float32, [batch_size, action_size], name='target')
self.reward = tf.placeholder(tf.float32, [batch_size], name="reward")
self.discount = tf.constant(0.99, shape=[batch_size], dtype=tf.float32, name="discount")
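
A minimal sketch of how the TRFL loss typically completes this block; self.output (the network's Q-values) and self.actions_ are assumed to exist as in the linked tutorial's QNetwork:

# TRFL computes the TD error internally from Q(s), a, r, discount, Q(s')
qloss, q_learning = trfl.qlearning(self.output, self.actions_, self.reward,
                                   self.discount, self.targetQs_)
self.loss = tf.reduce_mean(qloss)
self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
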
AurelianTactics / trfl_q_learning_fa.py
Created December 31, 2018 04:49
trfl_q_learning_fa.py
# tutorial way
# targets = rewards + gamma * np.max(target_Qs, axis=1)
# loss, _ = sess.run([mainQN.loss, mainQN.opt],
#                    feed_dict={mainQN.inputs_: states,
#                               mainQN.targetQs_: targets,
#                               mainQN.actions_: actions})

# TRFL way, calculate td_error within TRFL
loss, _ = sess.run([mainQN.loss, mainQN.opt],
                   feed_dict={mainQN.inputs_: states,
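
The preview truncates the feed_dict above. Based on the placeholders defined in the previous gist, it plausibly finishes along these lines (target_Qs holding the target/next-state Q-values; the exact names are assumptions):

loss, _ = sess.run([mainQN.loss, mainQN.opt],
                   feed_dict={mainQN.inputs_: states,
                              mainQN.targetQs_: target_Qs,
                              mainQN.reward: rewards,
                              mainQN.actions_: actions})
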