@BartKeulen
Last active March 14, 2017 08:47
DDPG algorithm as described in https://arxiv.org/abs/1509.02971. Implementation based on https://pemami4911.github.io/blog/2016/08/21/ddpg-rl.html, with added Ornstein-Uhlenbeck process noise and exponential and tanh noise decay.
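For reference, the updates the code below implements, as given in the DDPG paper: the critic is trained on targets

    y_i = r_i + \gamma Q'(s_{i+1}, \mu'(s_{i+1} | \theta^{\mu'}) | \theta^{Q'})

by minimizing the mean squared error L = (1/N) \sum_i (y_i - Q(s_i, a_i | \theta^Q))^2; the actor is updated with the sampled deterministic policy gradient

    \nabla_{\theta^\mu} J \approx (1/N) \sum_i \nabla_a Q(s, a | \theta^Q)|_{s=s_i, a=\mu(s_i)} \, \nabla_{\theta^\mu} \mu(s | \theta^\mu)|_{s_i}

and both target networks track the online networks with the soft update \theta' \leftarrow \tau \theta + (1 - \tau) \theta'.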
"""
Data structure for implementing actor network for DDPG algorithm
Algorithm and hyperparameter details can be found here:
http://arxiv.org/pdf/1509.02971v2.pdf
Original author: Patrick Emami
Author: Bart Keulen
"""
import tensorflow as tf
import tflearn
class ActorNetwork(object):
    def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau):
        self.sess = sess
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau

        # Actor network
        self.inputs, self.outputs, self.scaled_outputs = self.create_actor_network()
        self.net_params = tf.trainable_variables()

        # Target network
        self.target_inputs, self.target_outputs, self.target_scaled_outputs = self.create_actor_network()
        self.target_net_params = tf.trainable_variables()[len(self.net_params):]

        # Op for periodically updating target network with online network weights
        self.update_target_net_params = \
            [self.target_net_params[i].assign(tf.multiply(self.net_params[i], self.tau) +
                                              tf.multiply(self.target_net_params[i], 1. - self.tau))
             for i in range(len(self.target_net_params))]

        # Placeholder for the action gradient dQ/da provided by the critic
        self.action_gradients = tf.placeholder(tf.float32, [None, self.action_dim])

        # Combine dnetScaledOut/dnetParams with criticToActionGradient to get actorGradient.
        # Passing -action_gradients as grad_ys weights the policy Jacobian with -dQ/da,
        # so applying these gradients with a minimizer performs gradient ascent on Q.
        self.actor_gradients = tf.gradients(self.scaled_outputs, self.net_params, -self.action_gradients)

        # Optimization Op
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).\
            apply_gradients(zip(self.actor_gradients, self.net_params))

        self.num_trainable_vars = len(self.net_params) + len(self.target_net_params)

    def create_actor_network(self):
        inputs = tflearn.input_data(shape=[None, self.state_dim])
        net = tflearn.fully_connected(inputs, 400, activation='relu')
        net = tflearn.fully_connected(net, 300, activation='relu')
        # Final layer weights are initialized to Uniform[-3e-3, 3e-3]
        weight_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
        outputs = tflearn.fully_connected(net, self.action_dim, activation='tanh', weights_init=weight_init)
        scaled_outputs = tf.multiply(outputs, self.action_bound)  # Scale outputs to [-action_bound, action_bound]
        return inputs, outputs, scaled_outputs

    def train(self, inputs, action_gradients):
        return self.sess.run(self.optimize, feed_dict={
            self.inputs: inputs,
            self.action_gradients: action_gradients
        })

    def predict(self, inputs):
        return self.sess.run(self.scaled_outputs, feed_dict={
            self.inputs: inputs
        })

    def predict_target(self, inputs):
        return self.sess.run(self.target_scaled_outputs, feed_dict={
            self.target_inputs: inputs
        })

    def update_target_network(self):
        self.sess.run(self.update_target_net_params)

    def get_num_trainable_vars(self):
        return self.num_trainable_vars

"""
Data structure for implementing critic network for DDPG algorithm
Algorithm and hyperparameter details can be found here:
http://arxiv.org/pdf/1509.02971v2.pdf
Original author: Patrick Emami
Author: Bart Keulen
"""
import tensorflow as tf
import tflearn
class CriticNetwork(object):
    def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau, num_actor_vars):
        self.sess = sess
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau

        # Critic network
        self.inputs, self.action, self.outputs = self.create_critic_network()
        self.net_params = tf.trainable_variables()[num_actor_vars:]

        # Target network
        self.target_inputs, self.target_action, self.target_outputs = self.create_critic_network()
        self.target_net_params = tf.trainable_variables()[len(self.net_params) + num_actor_vars:]

        # Op for periodically updating target network with online network weights
        self.update_target_net_params = \
            [self.target_net_params[i].assign(tf.multiply(self.net_params[i], self.tau) +
                                              tf.multiply(self.target_net_params[i], 1. - self.tau))
             for i in range(len(self.target_net_params))]

        # Network target (y_i), obtained from the target networks
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss and optimization Op
        self.loss = tflearn.mean_square(self.predicted_q_value, self.outputs)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

        # Get the gradient of the critic w.r.t. the action
        self.action_grads = tf.gradients(self.outputs, self.action)

    def create_critic_network(self):
        inputs = tflearn.input_data(shape=[None, self.state_dim])
        action = tflearn.input_data(shape=[None, self.action_dim])
        net = tflearn.fully_connected(inputs, 400, activation='relu')

        # Add the action tensor in the 2nd hidden layer
        # Use two temp layers to get the corresponding weights and biases
        t1 = tflearn.fully_connected(net, 300)
        t2 = tflearn.fully_connected(action, 300)
        net = tflearn.activation(tf.matmul(net, t1.W) + tf.matmul(action, t2.W) + t2.b, activation='relu')

        # Linear layer connected to 1 output representing Q(s,a)
        # Weights are initialized to Uniform[-3e-3, 3e-3]
        weight_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
        outputs = tflearn.fully_connected(net, 1, weights_init=weight_init)
        return inputs, action, outputs

    def train(self, inputs, action, predicted_q_value):
        return self.sess.run([self.outputs, self.optimize], feed_dict={
            self.inputs: inputs,
            self.action: action,
            self.predicted_q_value: predicted_q_value
        })

    def predict(self, inputs, action):
        return self.sess.run(self.outputs, feed_dict={
            self.inputs: inputs,
            self.action: action
        })

    def predict_target(self, inputs, action):
        return self.sess.run(self.target_outputs, feed_dict={
            self.target_inputs: inputs,
            self.target_action: action
        })

    def action_gradients(self, inputs, action):
        return self.sess.run(self.action_grads, feed_dict={
            self.inputs: inputs,
            self.action: action
        })

    def update_target_network(self):
        self.sess.run(self.update_target_net_params)

"""
Implementation of DDPG - Deep Deterministic Policy Gradient
Algorithm and hyperparameter details can be found here:
http://arxiv.org/pdf/1509.02971v2.pdf
The algorithm is tested on the Pendulum-v0 OpenAI gym task
and developed with tflearn + Tensorflow
Original author: Patrick Emami
Author: Bart Keulen
"""
import numpy as np
import datetime
import gym
from gym.wrappers import Monitor
import tensorflow as tf
from actor import ActorNetwork
from critic import CriticNetwork
from replaybuffer import ReplayBuffer
from explorationnoise import ExplorationNoise
# ================================
# TRAINING PARAMETERS
# ================================
# Learning rates actor and critic
ACTOR_LEARNING_RATE = 0.0001
CRITIC_LEARNING_RATE = 0.001
# Maximum number of episodes
MAX_EPISODES = 1000
# Maximum number of steps per episode
MAX_STEPS_EPISODE = 500
# Discount factor
GAMMA = 0.99
# Soft target update parameter
TAU = 0.001
# Size of replay buffer
BUFFER_SIZE = 1000000
MINIBATCH_SIZE = 64
# Exploration noise variables
NOISE_MEAN = 0
NOISE_VAR = 1
# Ornstein-Uhlenbeck variables
OU_THETA = 0.15
OU_MU = 0.
OU_SIGMA = 0.3
# Exploration duration
EXPLORATION_TIME = 200
# ================================
# UTILITY PARAMETERS
# ================================
# Gym environment name
ENV_NAME = 'Pendulum-v0'
# ENV_NAME = 'MountainCarContinuous-v0'
# Render gym env during training
RENDER_ENV = False
# Use Gym Monitor
GYM_MONITOR_EN = True
# Upload results to openAI
UPLOAD_GYM_RESULTS = False
GYM_API_KEY = '..............'
# Directory for storing gym results
DATETIME = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
MONITOR_DIR = './results/{}/{}/gym_ddpg'.format(ENV_NAME, DATETIME)
# Directory for storing tensorboard summary results
SUMMARY_DIR = './results/{}/{}/tf_ddpg'.format(ENV_NAME, DATETIME)
RANDOM_SEED = 1234
# ================================
# TENSORFLOW SUMMARY OPS
# ================================
def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar('Reward', episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar('Qmax Value', episode_ave_max_q)
    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars
# ================================
# TRAIN AGENT
# ================================
def train(sess, env, actor, critic):
    # Set up summary ops
    summary_ops, summary_vars = build_summaries()

    # Initialize Tensorflow variables
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in xrange(MAX_EPISODES):
        s = env.reset()
        episode_reward = 0
        episode_ave_max_q = 0

        # Pre-generate an Ornstein-Uhlenbeck noise sequence for this episode and
        # decay it exponentially to zero over the first EXPLORATION_TIME steps
        noise = ExplorationNoise.ou_noise(OU_THETA, OU_MU, OU_SIGMA, MAX_STEPS_EPISODE)
        noise = ExplorationNoise.exp_decay(noise, EXPLORATION_TIME)

        for j in xrange(MAX_STEPS_EPISODE):
            if RENDER_ENV:
                env.render()

            # Add Ornstein-Uhlenbeck exploration noise to the action; noise is only
            # added during the first EXPLORATION_TIME episodes
            if i < EXPLORATION_TIME:
                a = actor.predict(np.reshape(s, (1, env.observation_space.shape[0]))) + noise[j]
            else:
                a = actor.predict(np.reshape(s, (1, env.observation_space.shape[0])))

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, actor.state_dim),
                              np.reshape(a, actor.action_dim), r, terminal,
                              np.reshape(s2, actor.state_dim))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    # If state is terminal assign reward only
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    # Else assign reward + discounted target Q
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = \
                    critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                episode_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                a_grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, a_grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            episode_reward += r

            if terminal or j == MAX_STEPS_EPISODE - 1:
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: episode_reward,
                    summary_vars[1]: episode_ave_max_q
                })
                writer.add_summary(summary_str, i)
                writer.flush()

                print 'Reward: %.2i' % int(episode_reward), ' | Episode', i, \
                    '| Qmax: %.4f' % (episode_ave_max_q / float(j))
                break
# ================================
# MAIN
# ================================
def main(_):
    with tf.Session() as sess:
        env = gym.make(ENV_NAME)
        # np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high
        # Ensure action bound is symmetric
        assert (env.action_space.high == -env.action_space.low)

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)
        critic = CriticNetwork(sess, state_dim, action_dim, action_bound,
                               CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars())

        if GYM_MONITOR_EN:
            if not RENDER_ENV:
                env = Monitor(env, MONITOR_DIR, video_callable=False, force=True)
            else:
                env = Monitor(env, MONITOR_DIR, force=True)

        train(sess, env, actor, critic)

        if UPLOAD_GYM_RESULTS:
            gym.upload(MONITOR_DIR, api_key=GYM_API_KEY)


if __name__ == '__main__':
    tf.app.run()

"""
Package containing different types of exploration noise:
- White noise
- Ornstein-Uhlenbeck process
- Noise decay
Author: Bart Keulen
"""
import numpy as np
class ExplorationNoise(object):
    # ================================
    # WHITE NOISE PROCESS
    # ================================
    @staticmethod
    def white_noise(mu, sigma, num_steps):
        # Generate Gaussian noise with mean mu and standard deviation sigma
        return np.random.normal(mu, sigma, num_steps)

    # ================================
    # ORNSTEIN-UHLENBECK PROCESS
    # ================================
    @staticmethod
    def ou_noise(theta, mu, sigma, num_steps, dt=1.):
        noise = np.zeros(num_steps)

        # Generate standard normal increments (mean 0, variance 1)
        white_noise = np.random.normal(0, 1, num_steps)

        # Integrate the OU process using the Euler-Maruyama method
        for i in xrange(1, num_steps):
            noise[i] = noise[i - 1] + theta * (mu - noise[i - 1]) * dt + \
                       sigma * np.sqrt(dt) * white_noise[i]

        return noise

    # ================================
    # EXPONENTIAL NOISE DECAY
    # ================================
    @staticmethod
    def exp_decay(noise, decay_end):
        num_steps = noise.shape[0]
        # Check that decay ends before end of noise sequence
        assert decay_end <= num_steps

        # Scaling factor decreases from ~1 at step 0 to 0 at decay_end and stays zero afterwards
        scaling = np.zeros(num_steps)
        scaling[:decay_end] = 2. - np.exp(np.divide(np.linspace(1., decay_end, num=decay_end) * np.log(2.), decay_end))

        return np.multiply(noise, scaling)

    # ================================
    # TANH NOISE DECAY
    # ================================
    @staticmethod
    def tanh_decay(noise, decay_start, decay_length):
        num_steps = noise.shape[0]
        # Check that decay ends before end of noise sequence
        assert decay_start + decay_length <= num_steps

        # Scaling factor decreases smoothly from ~1 to ~0, centred at decay_start + decay_length/2
        scaling = 0.5 * (1. - np.tanh(4. / decay_length * np.subtract(np.linspace(1., num_steps, num_steps),
                                                                      decay_start + decay_length / 2.)))

        return np.multiply(noise, scaling)

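A minimal usage sketch of the noise utilities above (parameter values are illustrative only; note that tanh_decay is provided but not used by the training script):

from explorationnoise import ExplorationNoise

# 500-step Ornstein-Uhlenbeck noise sequence for one episode
noise = ExplorationNoise.ou_noise(theta=0.15, mu=0., sigma=0.3, num_steps=500)

# Scale the sequence down to zero over its first 200 steps ...
noise_exp = ExplorationNoise.exp_decay(noise, decay_end=200)

# ... or fade it out with a tanh ramp starting at step 100 and lasting 200 steps
noise_tanh = ExplorationNoise.tanh_decay(noise, decay_start=100, decay_length=200)
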
"""
Data structure for implementing experience replay
Author: Patrick Emami
"""
from collections import deque
import random
import numpy as np
class ReplayBuffer(object):
    def __init__(self, buffer_size, random_seed=1234):
        self.buffer_size = buffer_size
        self.count = 0
        # Right side of deque contains newest experience
        self.buffer = deque()
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        experience = (s, a, r, t, s2)
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            # Buffer is full: discard the oldest experience before appending
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        return self.count

    def sample_batch(self, batch_size):
        # Return at most batch_size experiences, sampled uniformly at random
        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        s_batch = np.array([_[0] for _ in batch])
        a_batch = np.array([_[1] for _ in batch])
        r_batch = np.array([_[2] for _ in batch])
        t_batch = np.array([_[3] for _ in batch])
        s2_batch = np.array([_[4] for _ in batch])

        return s_batch, a_batch, r_batch, t_batch, s2_batch

    def clear(self):
        self.buffer.clear()
        self.count = 0

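A minimal usage sketch of the replay buffer (hypothetical shapes matching Pendulum-v0, which has a 3-dimensional observation and a 1-dimensional action):

import numpy as np
from replaybuffer import ReplayBuffer

buffer = ReplayBuffer(buffer_size=1000000, random_seed=1234)

# Store one (s, a, r, terminal, s2) transition
s, s2 = np.zeros(3), np.zeros(3)
a = np.zeros(1)
buffer.add(s, a, 0.0, False, s2)

# Sample a minibatch once enough transitions have been collected
if buffer.size() > 64:
    s_batch, a_batch, r_batch, t_batch, s2_batch = buffer.sample_batch(64)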