pendulum-ddpg

'''
ddpg
'''
import tensorflow as tf
import numpy as np
import gym
import time, os, random
from collections import deque

##################### hyper parameters ####################
MAX_EP_STEPS = 200
LR_A = 0.0001            # learning rate for actor
LR_C = 0.001             # learning rate for critic
GAMMA = 0.99             # reward discount
TAU = 0.005              # soft replacement
MEMORY_CAPACITY = 100000
BATCH_SIZE = 64
RENDER = False
ENV_NAME = 'Pendulum-v0'
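# For Pendulum-v0 the observation is [cos(theta), sin(theta), theta_dot] (s_dim = 3)
# and the action is a single torque clipped to [-2, 2] (a_dim = 1, a_bound = 2.0).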

###############################  DDPG  ####################################
class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound):
        self.replay_buffer = deque(maxlen=MEMORY_CAPACITY)  # holds (state, action, reward, next_state) tuples
        self.sess = tf.Session()
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_next = tf.placeholder(tf.float32, [None, s_dim], 's_next')  # next state
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        self.a = self._build_a(self.S)
        q = self._build_c(self.S, self.a)
        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor')
        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic')

        ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)  # soft replacement
        def ema_getter(getter, name, *args, **kwargs):
            return ema.average(getter(name, *args, **kwargs))  # return the shadow (EMA) copy of the variable
        target_update = [ema.apply(a_params), ema.apply(c_params)]  # soft-update ops (these also create the shadow variables)
        a_ = self._build_a(self.S_next, reuse=True, custom_getter=ema_getter)  # target actor built from the shadow variables
        q_ = self._build_c(self.S_next, a_, reuse=True, custom_getter=ema_getter)  # target critic built from the shadow variables
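
        # Each ema.apply(...) op updates its shadow copies as
        #     shadow <- decay * shadow + (1 - decay) * var,
        # so with decay = 1 - TAU this is exactly the DDPG soft target update
        #     theta_target <- (1 - TAU) * theta_target + TAU * theta.
        # Rebuilding the networks with custom_getter=ema_getter therefore gives the
        # slowly-moving target actor/critic used for the TD target below.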

        a_loss = - tf.reduce_mean(q)  # maximize q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)

        with tf.control_dependencies(target_update):  # soft replacement happens here, on every critic step
            q_target = self.R + GAMMA * q_
            td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
            self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def train(self):
        trainBatch = random.sample(self.replay_buffer, BATCH_SIZE)
        states = np.stack([x[0] for x in trainBatch])        # (N, 3)
        actions = np.array([x[1] for x in trainBatch])       # (N, 1)
        rewards = np.array([x[2] for x in trainBatch])       # (N, 1)
        next_states = np.stack([x[3] for x in trainBatch])   # (N, 3)
        self.sess.run(self.atrain, {self.S: states})
        self.sess.run(self.ctrain, {self.S: states, self.a: actions, self.R: rewards, self.S_next: next_states})

    def store_transition(self, s, a, r, s_):
        self.replay_buffer.append((s, a.flatten(), r.flatten(), s_))
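
    # Note on the two training ops:
    #  - atrain only needs `states`: inside the graph the actor output self.a feeds the critic,
    #    so the deterministic policy gradient flows from q back into a_params.
    #  - ctrain feeds the replay-buffer `actions` into self.a via feed_dict, overriding the
    #    actor output so the critic is fitted on the actions that were actually taken.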

    def _build_a(self, s, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter):
            net = tf.layers.dense(s, 100, activation=tf.nn.relu, name='l1', trainable=trainable)
            net = tf.layers.dense(net, 30, activation=tf.nn.relu, name='l2', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
            return tf.multiply(a, self.a_bound, name='scaled_a')  # scale tanh output to [-a_bound, a_bound]

    def _build_c(self, s, a, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter):
            net_s = tf.layers.dense(s, 100, activation=tf.nn.relu, name='Ls1', trainable=trainable)
            net_s = tf.layers.dense(net_s, 30, activation=None, name='ls2', trainable=trainable)
            net_a = tf.layers.dense(a, 30, activation=None, use_bias=False, name='la1', trainable=trainable)
            net = tf.nn.relu(net_s + net_a)  # merge the state and action branches
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s,a)


class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.3, theta=0.15, dt=1e-2, x0=None):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
            self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

    def __repr__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)
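
# __call__ above is the Euler-Maruyama discretization of the Ornstein-Uhlenbeck process,
#     x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1),
# i.e. temporally correlated noise that drifts back toward mu, which is the exploration
# noise suggested in the original DDPG paper.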


###############################  training  ####################################
def train():
    env = gym.make(ENV_NAME)
    env = env.unwrapped  # removes the 200-step TimeLimit wrapper
    env.seed(1)

    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_bound = env.action_space.high

    ddpg = DDPG(a_dim, s_dim, a_bound)

    s_time = time.time()
    episode_reward_history = deque(maxlen=100)

    saver = tf.train.Saver()
    checkpoint_dir = "./ddpg-model"
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    save_path = os.path.join(checkpoint_dir, "model.ckpt")
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    if latest:
        saver.restore(ddpg.sess, latest)  # resume from the latest checkpoint
        n_episode = int(latest.split('-')[-1])
    else:
        n_episode = 0

    exploration_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(a_dim))
    noise_scaling = 0.1 * (a_bound * 2)  # 10% of the action range
    best_avg_reward = -180
    while True:
        n_episode += 1
        s = env.reset()
        exploration_noise.reset()
        ep_reward = 0
        for j in range(MAX_EP_STEPS):
            if RENDER:
                env.render()

            # Add exploration noise
            a = ddpg.choose_action(s)
            #a = np.clip(np.random.normal(a, var), -2, 2)    # add randomness to action selection for exploration
            noise = exploration_noise() * noise_scaling
            a = np.clip(a + noise, -2, 2)
            s_, r, done, info = env.step(a)

            ddpg.store_transition(s, a, r / 10, s_)

            if len(ddpg.replay_buffer) > BATCH_SIZE * 10:
                ddpg.train()

            s = s_
            ep_reward += r
            if j == MAX_EP_STEPS - 1:
                episode_reward_history.append(ep_reward)
                if n_episode % 10 == 0:
                    print('Episode:', n_episode, ' 100 avg. Reward: %.2f' % np.mean(episode_reward_history), 'noise: %.2f' % float(noise))
                if best_avg_reward * 0.99 <= np.mean(episode_reward_history):
                    best_avg_reward = np.mean(episode_reward_history)
                    saver.save(ddpg.sess, save_path, global_step=n_episode)
                    print('Checkpoint saved to {}-{} (best 100-episode avg. reward {:.2f}). elapsed = {:.1f}s'.format(save_path, n_episode, best_avg_reward, time.time() - s_time))
                # if ep_reward > -300: RENDER = True
                break
    print('Running time: ', time.time() - s_time)


def test():
    env = gym.make(ENV_NAME)
    env = env.unwrapped
    env.seed(1)
    env._max_episode_steps = 1000

    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_bound = env.action_space.high

    ddpg = DDPG(a_dim, s_dim, a_bound)

    saver = tf.train.Saver()
    checkpoint_dir = "./ddpg-model"
    save_path = os.path.join(checkpoint_dir, "model.ckpt")
    maybe_path = tf.train.latest_checkpoint(checkpoint_dir)
    if os.path.exists(checkpoint_dir) and maybe_path:
        print("Restored {}".format(maybe_path))
        saver.restore(ddpg.sess, maybe_path)
    else:
        print("No model is found")
        exit()

    env = gym.wrappers.Monitor(env, './movie/', force=True)  # record an mp4 of the rollout

    episode_reward_history = deque(maxlen=100)
    random_episodes = 0
    total_reward = 0
    n_step = 0
    state = env.reset()
    while random_episodes < 1:
        env.render()
        action = ddpg.choose_action(state)
        #action = np.clip(np.random.normal(action, var), -2, 2)    # add randomness to action selection for exploration
        state, reward, done, info = env.step(action)
        total_reward += reward
        n_step += 1
        if n_step % 50 == 0:
            print(n_step, total_reward, action)
        if n_step > 1000:
            done = True  # raises an error, but the episode has to be stopped this way for the mp4 file to be written
        if done:
            print("Reward for this episode {} was: {}. n_step = {}".format(random_episodes, total_reward, n_step))
            random_episodes += 1
            total_reward = 0
            n_step = 0
            state = env.reset()


if __name__ == '__main__':
    train()
    #test()