'''
pendulum-ddpg: DDPG (Deep Deterministic Policy Gradient) on Pendulum-v0,
implemented in TensorFlow 1.x. Target networks are maintained with an
exponential moving average of the online weights (soft update, rate TAU).
'''
import tensorflow as tf
import numpy as np
import gym
import time, os, random
from collections import deque
##################### hyper parameters ####################
MAX_EP_STEPS = 200
LR_A = 0.0001 # learning rate for actor
LR_C = 0.001 # learning rate for critic
GAMMA = 0.99 # reward discount
TAU = 0.005 # soft replacement
MEMORY_CAPACITY = 100000
BATCH_SIZE = 64
RENDER = False
ENV_NAME = 'Pendulum-v0'
############################### DDPG ####################################
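# Note on the soft target update: rather than keeping explicit target-network
# copies, this implementation tracks every trainable variable with
# tf.train.ExponentialMovingAverage. The EMA rule
#     shadow = decay * shadow + (1 - decay) * var
# with decay = 1 - TAU is exactly DDPG's soft update
#     theta_target = (1 - TAU) * theta_target + TAU * theta_online
# so reading the shadow variables through a custom getter yields the target
# actor and critic.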
class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound):
        self.replay_buffer = deque(maxlen=MEMORY_CAPACITY)  # holds (state, action, reward, next_state)
        self.sess = tf.Session()
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_next = tf.placeholder(tf.float32, [None, s_dim], 's_next')  # next state
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')
        self.a = self._build_a(self.S)
        q = self._build_c(self.S, self.a)
        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor')
        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic')
        ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)  # soft replacement

        def ema_getter(getter, name, *args, **kwargs):
            return ema.average(getter(name, *args, **kwargs))  # look up the shadow (EMA) variable

        target_update = [ema.apply(a_params), ema.apply(c_params)]  # soft-update ops (these create the shadow variables)
        a_ = self._build_a(self.S_next, reuse=True, custom_getter=ema_getter)  # target actor, built from the shadow variables
        q_ = self._build_c(self.S_next, a_, reuse=True, custom_getter=ema_getter)  # target critic
        a_loss = -tf.reduce_mean(q)  # maximize q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)
        with tf.control_dependencies(target_update):  # soft replacement happens here
            q_target = self.R + GAMMA * q_
            td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
            self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)
        self.sess.run(tf.global_variables_initializer())
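    # The two training ops above implement the standard DDPG objectives:
    #   actor:  maximize Q(s, mu(s))                      -> a_loss = -mean(q)
    #   critic: minimize (r + GAMMA * Q'(s', mu'(s')) - Q(s, a))^2
    # where the primed networks are the EMA (target) copies. Because ctrain is
    # built under tf.control_dependencies(target_update), every critic step
    # also performs one soft target update.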

    def choose_action(self, s):
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def train(self):
        trainBatch = random.sample(self.replay_buffer, BATCH_SIZE)
        states = np.stack([x[0] for x in trainBatch])        # (N, s_dim)
        actions = np.array([x[1] for x in trainBatch])       # (N, a_dim)
        rewards = np.array([x[2] for x in trainBatch])       # (N, 1)
        next_states = np.stack([x[3] for x in trainBatch])   # (N, s_dim)
        self.sess.run(self.atrain, {self.S: states})
        # feeding self.a overrides the actor's output, so the critic is fit to
        # the actions that were actually executed in the environment
        self.sess.run(self.ctrain, {self.S: states, self.a: actions, self.R: rewards, self.S_next: next_states})

    def store_transition(self, s, a, r, s_):
        # r arrives as a numpy scalar; .flatten() makes it shape (1,), so a
        # batch of rewards stacks to (N, 1) and matches the self.R placeholder
        self.replay_buffer.append((s, a.flatten(), r.flatten(), s_))

    def _build_a(self, s, reuse=None, custom_getter=None):
        trainable = reuse is None  # the reused (target) copies are not trained directly
        with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter):
            net = tf.layers.dense(s, 100, activation=tf.nn.relu, name='l1', trainable=trainable)
            net = tf.layers.dense(net, 30, activation=tf.nn.relu, name='l2', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
            return tf.multiply(a, self.a_bound, name='scaled_a')  # scale the tanh output to the action bound

    def _build_c(self, s, a, reuse=None, custom_getter=None):
        trainable = reuse is None
        with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter):
            net_s = tf.layers.dense(s, 100, activation=tf.nn.relu, name='Ls1', trainable=trainable)
            net_s = tf.layers.dense(net_s, 30, activation=None, name='ls2', trainable=trainable)
            net_a = tf.layers.dense(a, 30, activation=None, use_bias=False, name='la1', trainable=trainable)
            net = tf.nn.relu(net_s + net_a)  # merge the state and action branches
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s, a)
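
def _smoke_test_ddpg():
    # Hedged example, not part of the original gist: build the agent with
    # Pendulum-v0's dimensions (s_dim=3, a_dim=1, a_bound=2.0, an assumption
    # matching ENV_NAME above) and run one forward pass to check that the
    # graph wires up. Resets the default graph, so do not call it while
    # another agent is alive.
    tf.reset_default_graph()
    agent = DDPG(a_dim=1, s_dim=3, a_bound=np.array([2.0]))
    action = agent.choose_action(np.zeros(3, dtype=np.float32))
    return action  # shape (1,), inside [-2, 2] thanks to the tanh scaling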

class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.3, theta=0.15, dt=1e-2, x0=None):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
            self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

    def __repr__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)
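
# __call__ above is an Euler-Maruyama step of the Ornstein-Uhlenbeck SDE,
#     x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1),
# which mean-reverts to mu and produces temporally correlated exploration
# noise. A hedged illustration (this helper is not in the original gist):
def _demo_ou_noise(n_steps=200):
    # Draw a short OU trajectory; plotting it shows smooth, correlated
    # wandering around 0 rather than white noise.
    noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(1))
    return np.array([noise() for _ in range(n_steps)])  # shape (n_steps, 1)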
############################### training ####################################
def train():
    env = gym.make(ENV_NAME)
    env = env.unwrapped  # lifts the default 200-step episode limit
    env.seed(1)
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_bound = env.action_space.high
    ddpg = DDPG(a_dim, s_dim, a_bound)
    s_time = time.time()
    episode_reward_history = deque(maxlen=100)
    saver = tf.train.Saver()
    checkpoint_dir = "./ddpg-model"
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    save_path = os.path.join(checkpoint_dir, "model.ckpt")
    if tf.train.latest_checkpoint(checkpoint_dir):
        # resume the episode counter from the checkpoint's global_step suffix
        n_episode = int(tf.train.latest_checkpoint(checkpoint_dir).split('-')[-1])
    else:
        n_episode = 0
    exploration_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(a_dim))
    noise_scaling = 0.1 * (a_bound * 2)
    best_avg_reward = -180
    while True:
        n_episode += 1
        s = env.reset()
        exploration_noise.reset()
        ep_reward = 0
        for j in range(MAX_EP_STEPS):
            if RENDER:
                env.render()
            # add exploration noise to the deterministic action
            a = ddpg.choose_action(s)
            # a = np.clip(np.random.normal(a, var), -2, 2)  # alternative: uncorrelated Gaussian exploration
            noise = exploration_noise() * noise_scaling
            a = np.clip(a + noise, -a_bound, a_bound)  # clip to the action bound (2 for Pendulum-v0)
            s_, r, done, info = env.step(a)
            ddpg.store_transition(s, a, r / 10, s_)  # scale the reward down for stability
            if len(ddpg.replay_buffer) > BATCH_SIZE * 10:
                ddpg.train()
            s = s_
            ep_reward += r
            if j == MAX_EP_STEPS - 1:
                episode_reward_history.append(ep_reward)
                if n_episode % 10 == 0:
                    print('Episode:', n_episode, ' 100 avg. Reward: %.2f' % np.mean(episode_reward_history),
                          'noise: %.2f' % noise[0])
                if best_avg_reward * 0.99 <= np.mean(episode_reward_history):
                    best_avg_reward = np.mean(episode_reward_history)
                    saver.save(ddpg.sess, save_path, global_step=n_episode)
                    print('Checkpoint saved to {}-{}. best_avg_reward = {:.2f}, elapsed = {:.1f}s'.format(
                        save_path, n_episode, best_avg_reward, time.time() - s_time))
                # if ep_reward > -300: RENDER = True
                break
    print('Running time: ', time.time() - s_time)
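
# Checkpoint convention: saver.save(..., global_step=n_episode) appends the
# episode number to the checkpoint name (e.g. model.ckpt-1230), which is why
# train() can resume its episode counter by parsing the suffix and test()
# below can simply restore the latest checkpoint.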
def test():
    env = gym.make(ENV_NAME)
    env = env.unwrapped
    env.seed(1)
    env._max_episode_steps = 1000
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_bound = env.action_space.high
    ddpg = DDPG(a_dim, s_dim, a_bound)
    saver = tf.train.Saver()
    checkpoint_dir = "./ddpg-model"
    save_path = os.path.join(checkpoint_dir, "model.ckpt")
    maybe_path = tf.train.latest_checkpoint(checkpoint_dir)
    if os.path.exists(checkpoint_dir) and maybe_path:
        print("Restored {}".format(maybe_path))
        saver.restore(ddpg.sess, maybe_path)
    else:
        print("No model is found")
        exit()
    env = gym.wrappers.Monitor(env, './movie/', force=True)  # records the rollout as an mp4
    episode_reward_history = deque(maxlen=100)
    random_episodes = 0
    total_reward = 0
    n_step = 0
    state = env.reset()
    while random_episodes < 1:
        env.render()
        action = ddpg.choose_action(state)
        state, reward, done, info = env.step(action)
        total_reward += reward
        n_step += 1
        if n_step % 50 == 0:
            print(n_step, total_reward, action)
        if n_step > 1000:
            done = True  # this raises an error, but stopping this way is what gets the mp4 file written
        if done:
            print("Reward for this episode {} was: {}. n_step = {}".format(random_episodes, total_reward, n_step))
            state = env.reset()
            random_episodes += 1
            total_reward = 0
            n_step = 0
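
# To evaluate a trained policy and record an mp4 under ./movie/, run test()
# instead of train() in the main guard below.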
if __name__ == '__main__':
    train()
    # test()