# Soft Actor-Critic (SAC) implementation in TensorFlow 1.x
import tensorflow as tf
import tensorflow_probability as tfp
import gym
from collections import deque
from random import sample
import numpy as np
from datetime import datetime
COMMENT = 'comment'  # tag appended to the TensorBoard run name
GRADIENT_NORM = 5    # global-norm threshold for gradient clipping
SIGMA = 0.1          # fixed stddev used for the policy's action distribution
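
# Update rules, following the SAC paper (Haarnoja et al., 2018, arXiv:1801.01290),
# which this script implements with some deviations (fixed action stddev, policy
# gradients assembled manually from dQ/da and dlogpi/da):
#
#   V(s)    -> E_{a~pi}[ min(Q1, Q2)(s, a) - alpha * log pi(a|s) ]
#   Q(s, a) -> r + gamma * (1 - done) * V_target(s')
#   pi      -> maximize E_{a~pi}[ Q(s, a) - alpha * log pi(a|s) ]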

class Policy:
    """Gaussian policy. The network emits [log_sigma, mean] per action
    dimension; the mean is squashed with tanh, while the stddev is currently
    overridden by the fixed constant SIGMA."""

    def __init__(self, sess, state_size, action_size, lr, batch_size, alpha_entropy, policy_reg_coeff, kernel_reg):
        self.action_size = action_size
        self.state_size = state_size
        self.sess = sess
        self.kernel_reg = kernel_reg

        with tf.variable_scope("Policy"):
            with tf.variable_scope("model"):
                self.state_input, self.model_output = self._create_model()

            # Second half of the output is the (pre-tanh) mean, first half the log-stddev.
            mean_action_lin = self.model_output[..., action_size:]
            self.mean_action = tf.tanh(mean_action_lin)
            self.log_sigma = self.model_output[..., :action_size]
            self.sigma = tf.exp(self.log_sigma)

            # The learned stddev is disabled in favour of a fixed one.
            newsigma = tf.constant([SIGMA] * action_size)
            dist = tfp.distributions.MultivariateNormalDiag(loc=self.mean_action, scale_diag=newsigma)
            #dist = tfp.distributions.MultivariateNormalDiag(loc=self.mean_action, scale_diag=self.sigma)

            self.variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "Policy/model")
            self.action = dist.sample()

            with tf.variable_scope("probs"):
                self.probs = dist.prob(self.action)
                self.log_prob_op = tf.reshape(tf.log(self.probs), [-1, 1])

            with tf.variable_scope("saver"):
                self.saver = tf.train.Saver(var_list=self.variables)

            with tf.variable_scope("training"):
                self.q_grad_a = tf.placeholder(tf.float32, [None, action_size])
                self.training_action = tf.placeholder(tf.float32, [None, action_size])
                self.training_probs = dist.prob(self.training_action)
                self.training_log_prob_op = tf.reshape(tf.log(self.training_probs), [-1, 1])
                # tf.gradients returns a list; keep the single gradient tensor.
                self.log_prob_grad_a = tf.gradients(self.training_log_prob_op, self.training_action)[0]

                # Penalizes pre-tanh outputs outside [-3, 3] (tanh(+-3) ~ +-0.995),
                # so the mean can still move freely across the action range.
                #self.pol_reg_loss = tf.reduce_sum(tf.maximum(self.sigma - SIGMA, 0)**2)  # gravitates toward stddev of SIGMA
                self.pol_reg_loss = tf.reduce_sum(tf.maximum(tf.abs(self.model_output) - 3, 0)**2)

                self.entropy_summary = tf.summary.scalar("entropy", -tf.reduce_mean(self.training_log_prob_op * self.training_probs))
                self.log_prob_summary = tf.summary.scalar("mean_log_prob", -tf.reduce_mean(self.training_log_prob_op))
                self.regularization_loss_summary = tf.summary.scalar("reg_loss", self.pol_reg_loss)
                self.training_summaries = tf.summary.merge([self.entropy_summary, self.log_prob_summary, self.regularization_loss_summary])

                # Output regularization.
                grad_reg = tf.gradients(self.pol_reg_loss, self.variables, name="grad_reg")
                # Maximizes reward (dQ/da is fed in from the critic).
                grad_s1 = tf.gradients(self.action, self.variables, (self.log_prob_grad_a - self.q_grad_a), name="grad_s1")
                # Maximizes entropy.
                grad_s2 = tf.gradients(self.training_log_prob_op, self.variables, name="grad_s2")

                with tf.variable_scope("gradient"):
                    self.grad = [(grad_s1[i] + alpha_entropy * grad_s2[i] + policy_reg_coeff * grad_reg[i]) / batch_size
                                 for i in range(len(grad_s1))]
                    self.grad = [tf.clip_by_value(g, -1, 1) for g in self.grad]
                    self.clipped_grad, _ = tf.clip_by_global_norm(self.grad, GRADIENT_NORM)
                    self.c_grad_summaries = [tf.summary.histogram("pol_grads_{}".format(i), self.clipped_grad[i])
                                             for i in range(len(self.clipped_grad))]
                    grads = zip(self.clipped_grad, self.variables)
                    optimizer = tf.train.AdamOptimizer(lr)
                    self.optimize = optimizer.apply_gradients(grads)

        self.summary_op = tf.summary.merge_all(scope="Policy/model")

    def get_summaries(self):
        return self.sess.run(self.summary_op)

    def predict(self, state):
        return self.sess.run([self.action, self.log_prob_op, self.mean_action, self.sigma], feed_dict={
            self.state_input: state
        })

    def train(self, states, actions, q_grad_a):
        es, gs, _ = self.sess.run([self.training_summaries, self.c_grad_summaries, self.optimize], feed_dict={
            self.q_grad_a: q_grad_a,
            self.state_input: states,
            self.training_action: actions
        })
        return es, gs

    def _create_model(self):
        layer_names = ["layer-1", "layer-2", "layer-3"]
        state_input = tf.placeholder(tf.float32, [None, self.state_size])
        d1 = tf.layers.Dense(64, activation="relu", name=layer_names[0], kernel_initializer=tf.initializers.he_normal(), kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=self.kernel_reg))(state_input)
        d2 = tf.layers.Dense(32, activation="relu", name=layer_names[1], kernel_initializer=tf.initializers.he_normal(), kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=self.kernel_reg))(d1)
        output = tf.layers.Dense(2 * self.action_size, name=layer_names[2], kernel_initializer=tf.initializers.he_normal(), kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=self.kernel_reg))(d2)
        for name in layer_names:
            with tf.variable_scope(name, reuse=True):
                tf.summary.histogram("kernel", tf.get_variable("kernel"))
                tf.summary.histogram("bias", tf.get_variable("bias"))
        return state_input, output

    def save_variables(self, path, steps):
        self.saver.save(self.sess, path, steps)
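
# Note: canonical SAC squashes the *sampled* action with tanh and corrects the
# log-probability by the tanh change-of-variables term. The Policy above instead
# samples from a Gaussian centred on tanh(mean), so samples can leave [-1, 1].
# A squashed-Gaussian sketch (hypothetical drop-in, same tfp API as above):
#
#   dist = tfp.distributions.MultivariateNormalDiag(loc=mean_lin, scale_diag=sigma)
#   u = dist.sample()                      # pre-squash sample
#   a = tf.tanh(u)                         # action in (-1, 1)
#   log_prob = dist.log_prob(u) - tf.reduce_sum(tf.log(1. - tf.tanh(u) ** 2 + 1e-6), axis=-1)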

class StateValueApproximator:
    """State-value network V(s) with a slowly-tracking target copy."""

    def __init__(self, sess, state_size, lr, tau, alpha, kernel_reg):
        self.sess = sess
        self.state_size = state_size
        self.kernel_reg = kernel_reg

        with tf.variable_scope("V_s"):
            with tf.variable_scope("model"):
                self.state_input, self.value_output = self._create_model()
            self.variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="V_s/model")

            with tf.variable_scope("target_model"):
                self.target_state_input, self.target_value_output = self._create_model()
            self.target_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="V_s/target_model")

            with tf.variable_scope("saver"):
                self.saver = tf.train.Saver(var_list=self.variables)

            with tf.variable_scope("target_copy"):
                # Polyak averaging: target <- (1 - tau) * target + tau * online.
                self.copy_ops = [self.target_variables[i].assign(self.target_variables[i] * (1. - tau) + self.variables[i] * tau)
                                 for i in range(len(self.variables))]

            with tf.variable_scope("loss"):
                self.q_value_ph = tf.placeholder(tf.float32, [None, 1])
                self.log_prob_ph = tf.placeholder(tf.float32, [None, 1])
                # V(s) regresses toward Q(s, a) - alpha * log pi(a|s).
                self.loss = tf.reduce_mean(tf.square(self.value_output - self.q_value_ph + alpha * self.log_prob_ph))
                self.loss_summary = tf.summary.scalar("loss", self.loss)

            with tf.variable_scope("training"):
                self.optimizer = tf.train.AdamOptimizer(lr)
                self.grads = tf.gradients(self.loss, self.variables)
                self.grads = [tf.clip_by_value(g, -1, 1) for g in self.grads]
                self.clipped_grads, _ = tf.clip_by_global_norm(self.grads, GRADIENT_NORM)
                grad_var_pairs = zip(self.clipped_grads, self.variables)
                self.optimize = self.optimizer.apply_gradients(grad_var_pairs)

        self.summaries = tf.summary.merge_all(scope="V_s/model")

    def _create_model(self):
        layer_names = ["layer-1", "layer-2", "layer-3"]
        state_input = tf.placeholder(tf.float32, [None, self.state_size])
        d1 = tf.layers.Dense(64, activation="relu", name=layer_names[0], kernel_initializer=tf.initializers.he_normal(), kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=self.kernel_reg))(state_input)
        d2 = tf.layers.Dense(64, activation="relu", name=layer_names[1], kernel_initializer=tf.initializers.he_normal(), kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=self.kernel_reg))(d1)
        output = tf.layers.Dense(1, name=layer_names[2], kernel_initializer=tf.initializers.he_normal(), kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=self.kernel_reg))(d2)
        for name in layer_names:
            with tf.variable_scope(name, reuse=True):
                tf.summary.histogram("kernel", tf.get_variable("kernel"))
                tf.summary.histogram("bias", tf.get_variable("bias"))
        return state_input, output

    def get_summaries(self):
        return self.sess.run(self.summaries)

    def update_target(self):
        self.sess.run(self.copy_ops)

    def predict(self, states):
        return self.sess.run(self.value_output, feed_dict={
            self.state_input: states
        })

    def predict_target(self, states):
        return self.sess.run(self.target_value_output, feed_dict={
            self.target_state_input: states
        })

    def train(self, states, q_values, log_probs):
        loss_summary, _ = self.sess.run([self.loss_summary, self.optimize], feed_dict={
            self.state_input: states,
            self.q_value_ph: q_values,
            self.log_prob_ph: log_probs
        })
        return loss_summary

    def save_variables(self, path, steps):
        self.saver.save(self.sess, path, steps)
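
# With TAU = 0.005 and one update_target() call per environment step (see the
# training loop below), the target value network tracks the online network with
# a time constant of roughly 1/TAU = 200 steps.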

class QValueApproximator:
    """Action-value network Q(s, a); state and action branches are merged by addition."""

    def __init__(self, sess, name, state_size, action_size, lr, kernel_reg):
        self.sess = sess
        self.state_size = state_size
        self.action_size = action_size
        self.kernel_reg = kernel_reg

        with tf.variable_scope(name):
            with tf.variable_scope("model"):
                self.state_input, self.action_input, self.output = self._create_model()
            self.variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="{}/model".format(name))

            with tf.variable_scope("saver"):
                self.saver = tf.train.Saver(var_list=self.variables)

            with tf.variable_scope("loss"):
                self.q_target_ph = tf.placeholder(tf.float32, [None, 1])
                self.loss = tf.reduce_mean(tf.square(self.output - self.q_target_ph))
                self.loss_summary = tf.summary.scalar("loss", self.loss)

            with tf.variable_scope("training"):
                optimizer = tf.train.AdamOptimizer(lr)
                self.grads = tf.gradients(self.loss, self.variables)
                self.grads = [tf.clip_by_value(g, -1, 1) for g in self.grads]
                self.clipped_grads, _ = tf.clip_by_global_norm(self.grads, GRADIENT_NORM)
                grad_var_pairs = zip(self.clipped_grads, self.variables)
                self.optimize = optimizer.apply_gradients(grad_var_pairs)

            with tf.variable_scope("grad_a"):
                self.q_summary = tf.summary.scalar("q_value", tf.reduce_mean(self.output))
                # dQ/da, consumed by the policy update.
                self.grad_a_op = tf.gradients(self.output, self.action_input)[0]

        self.summary_op = tf.summary.merge_all(scope=name + "/model")

    def _create_model(self):
        layer_names = ["layer-s1", "layer-s2", "layer-a1", "layer-m1", "layer-m2"]
        state_input = tf.placeholder(tf.float32, [None, self.state_size])
        ds1 = tf.layers.Dense(64, activation="relu", name=layer_names[0], kernel_initializer=tf.initializers.he_normal(), kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=self.kernel_reg))(state_input)
        merge_s = tf.layers.Dense(64, name=layer_names[1], kernel_initializer=tf.initializers.he_normal(), kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=self.kernel_reg))(ds1)
        action_input = tf.placeholder(tf.float32, [None, self.action_size])
        merge_a = tf.layers.Dense(64, name=layer_names[2], kernel_initializer=tf.initializers.he_normal(), kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=self.kernel_reg))(action_input)
        merged = tf.add(merge_a, merge_s)
        dm1 = tf.layers.Dense(64, activation="relu", name=layer_names[3], kernel_initializer=tf.initializers.he_normal(), kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=self.kernel_reg))(merged)
        output = tf.layers.Dense(1, name=layer_names[4], kernel_initializer=tf.initializers.he_normal(), kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=self.kernel_reg))(dm1)
        for name in layer_names:
            with tf.variable_scope(name, reuse=True):
                tf.summary.histogram("kernel", tf.get_variable("kernel"))
                tf.summary.histogram("bias", tf.get_variable("bias"))
        return state_input, action_input, output

    def predict(self, states, actions):
        return self.sess.run(self.output, feed_dict={
            self.state_input: states,
            self.action_input: actions
        })

    def get_summaries(self):
        return self.sess.run(self.summary_op)

    def grad_a(self, states, actions):
        return self.sess.run([self.grad_a_op, self.q_summary], feed_dict={
            self.state_input: states,
            self.action_input: actions
        })

    def train(self, states, actions, q_targets):
        loss_summary, _ = self.sess.run([self.loss_summary, self.optimize], feed_dict={
            self.state_input: states,
            self.action_input: actions,
            self.q_target_ph: q_targets
        })
        return loss_summary

    def save_variables(self, path, steps):
        self.saver.save(self.sess, path, steps)
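
# Two of these critics ("Q1", "Q2") are instantiated below; the training loop
# takes the element-wise minimum of their predictions (clipped double-Q, as in
# SAC/TD3) to counteract the overestimation bias of a single Q function.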

# HYPERPARAMETERS
BATCH_SIZE = 256
ALPHA = 0.1  # has no effect
LR = 3e-5

# These hyperparameters can be left as they are
TAU = 0.005
GAMMA = 0.99
POL_REG = 1
KERNEL_REG = 0.1
ACTION_BOUND = 1
#ACTION_SIZE = 6
#STATE_SIZE = 17

tb_verbose = False
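
# Note: the "-v1" MuJoCo environment IDs below belong to older gym releases
# (with the early mujoco-py bindings); newer gym versions expose "-v2"/"-v3"
# variants of the same environments instead.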

with tf.Session() as sess:
    #env = gym.make("Humanoid-v1")
    env = gym.make("HalfCheetah-v1")

    ACTION_SIZE = env.action_space.shape[0]
    STATE_SIZE = env.observation_space.shape[0]

    vs = StateValueApproximator(sess, STATE_SIZE, LR, TAU, ALPHA, KERNEL_REG)
    pol = Policy(sess, STATE_SIZE, ACTION_SIZE, LR, BATCH_SIZE, ALPHA, POL_REG, KERNEL_REG)
    q1 = QValueApproximator(sess, "Q1", STATE_SIZE, ACTION_SIZE, LR, KERNEL_REG)
    q2 = QValueApproximator(sess, "Q2", STATE_SIZE, ACTION_SIZE, LR, KERNEL_REG)

    now = datetime.now()
    # SET PATH
    writer = tf.summary.FileWriter("./tb/c:{}-a:{}-lr:{}#{}".format(COMMENT, ALPHA, LR, now.strftime("%H:%M:%S")), sess.graph)

    buffer = deque(maxlen=200000)

    summary_val = tf.placeholder(tf.float32, [])
    reward_summary = tf.summary.scalar("reward", summary_val)

    init = tf.global_variables_initializer()
    sess.run(init)

    i_episode = 0
    steps = 0
    while True:
        state = env.reset()
        done = False
        episode_reward = 0
        i_episode += 1

        while not done:
            steps += 1
            env.render()

            action_array, _, _, _ = pol.predict(np.reshape(state, [1, -1]))
            action = action_array[0] * ACTION_BOUND
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            buffer.append((state, action, reward, next_state, done))
            # Advance the state before the warm-up check below; otherwise the
            # agent would keep acting on a stale state until the buffer fills.
            state = next_state

            if len(buffer) < BATCH_SIZE:
                continue

            batch = sample(buffer, BATCH_SIZE)
            states = np.array([s[0] for s in batch])
            actions = np.array([s[1] for s in batch])
            rewards = np.reshape([s[2] for s in batch], [-1, 1])
            next_states = np.array([s[3] for s in batch])
            dones = np.reshape([s[4] for s in batch], [-1, 1])

            var_summaries = []
            loss_summaries = []

            # Value update: V(s) regresses toward min(Q1, Q2) - ALPHA * log pi.
            on_policy_actions, log_probs, _, _ = pol.predict(states)
            qval1 = q1.predict(states, on_policy_actions)
            qval2 = q2.predict(states, on_policy_actions)
            qvalmin = np.minimum(qval1, qval2)
            v_loss_summary = vs.train(states, qvalmin, log_probs)
            loss_summaries.append(v_loss_summary)

            # Q update: one-step Bellman targets from the target value network.
            v_targets = vs.predict_target(next_states)
            q_targets = rewards + GAMMA * v_targets * np.logical_not(dones)
            q1loss = q1.train(states, actions, q_targets)
            q2loss = q2.train(states, actions, q_targets)
            loss_summaries.extend([q1loss, q2loss])

            # Policy update: feed dQ1/da at the freshly sampled actions.
            grad_as, q_summary = q1.grad_a(states, on_policy_actions)
            loss_summaries.append(q_summary)
            entropy_summary, pol_grad_summaries = pol.train(states, on_policy_actions, grad_as)
            loss_summaries.append(entropy_summary)

            vs.update_target()

            for ls in loss_summaries:
                writer.add_summary(ls, steps)

            if tb_verbose:
                var_summaries.extend(pol_grad_summaries)
                var_summaries.extend([
                    pol.get_summaries(),
                    q1.get_summaries(),
                    q2.get_summaries(),
                    vs.get_summaries()
                ])

            for summary in var_summaries:
                writer.add_summary(summary, steps)

        rew_sum = sess.run(reward_summary, feed_dict={
            summary_val: episode_reward
        })
        writer.add_summary(rew_sum, steps)
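
        # Optional sketch (hypothetical, not wired in above): checkpoint the
        # policy every 100 episodes via the save_variables() helper defined on
        # each network class; the path below is a placeholder.
        # if i_episode % 100 == 0:
        #     pol.save_variables("./checkpoints/policy", steps)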