@domluna
Last active February 28, 2018 12:08
Vanilla policy gradient, no baseline

Run with defaults

python vpg.py
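
To override the defaults, pass command-line flags (defined via argparse at the bottom of the script), for example:

python vpg.py --env CartPole-v0 --n_iter 200 --n_episode 100 --learning_rate 0.01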

"""
Policy Gradients
1. Sample paths.
2. Process paths (compute advantage, baseline, rewards, etc)
3. Run the paths through the policy (function approximator)
4. Compute gradients/update policy model weights
5. Profit?!?!
How we optimize the policy
--------------------------
L(theta) = sum t=0 to T-1 log policy(action_t | state_t, theta) * A_t
R_t = sum u=t to T gamma^(u-t) * reward_u
B_t = E [ sum u=t to T gamma^(u-t) * reward_u | state_t ]
A_t = R_t - B_t
R_t = discounted return (reward-to-go)
A_t = advantage
B_t = baseline
theta = parameters of our policy, most likely neural network weights.
The baseline can be thought of as the value function (V). When we evaluate the baseline
of a state we're predicting how good our future returns will be given our current state.
So, intuitively, if A_t > 0 the path we sampled is better than the expected return of
paths from the current state. Likewise, if A_t < 0, it's worse. Concretely, if A_t > 0 we want
more paths like that, and if A_t < 0 we want fewer paths like that. Theta is updated during training
to reflect this.
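For example, with rewards [1.0, 1.0, 1.0], gamma = 0.99 and a zero baseline:
R_0 = 1 + 0.99 + 0.99^2 = 2.9701, R_1 = 1.99, R_2 = 1.0, and A_t = R_t since B_t = 0,
so every step of that path gets reinforced.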
Types of parameterized policies
-------------------------------
Map s (state) to an output vector u
1. If the action is from a discrete set, the network maps s to a vector of probabilities (softmax)
2. If the action is continuous, then we map s to the mean/variance of a Gaussian distribution
(diagonal covariance that does not depend on s); a sketch of this case appears after
CategoricalPolicy below.
3. If a is binary valued, we use a single output, the probability of outputting 1 (although
we could also just use approach 1.)
TODO: implement baseline
TODO: implement generalized advantage estimation
"""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from six.moves import range
from gym.spaces import Box, Discrete
from scipy.signal import lfilter
import gym
import tensorflow as tf
import numpy as np
import argparse


def flatten_space(space):
    """Return the flat dimension of a gym space (Box or Discrete)."""
    if isinstance(space, Box):
        return np.prod(space.shape)
    elif isinstance(space, Discrete):
        return space.n
    else:
        raise ValueError("Env must be either Box or Discrete.")


def discount_cumsum(x, gamma):
    # Discounted cumulative sum: out[t] = sum_{u >= t} gamma**(u - t) * x[u]
    return lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
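

# For clarity, an equivalent pure-NumPy version of discount_cumsum. This helper is a
# reference sketch only (not part of the original gist) and is not called anywhere below.
def discount_cumsum_reference(x, gamma):
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        # accumulate from the end of the trajectory backwards
        running = x[t] + gamma * running
        out[t] = running
    return out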


class CategoricalPolicy(object):
    def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
        # Placeholder inputs
        self._observations = tf.placeholder(tf.float32, shape=[None, in_dim], name="observations")
        self._actions = tf.placeholder(tf.int32, name="actions")
        self._advantages = tf.placeholder(tf.float32, name="advantages")

        self._opt = optimizer
        self._sess = session

        # Two-layer network: tanh hidden layer, softmax over actions
        h1 = tf.contrib.layers.fully_connected(self._observations, hidden_dim, activation_fn=tf.tanh)
        probs = tf.contrib.layers.fully_connected(h1, out_dim, activation_fn=tf.nn.softmax)

        # I believe this is faster if on the CPU
        with tf.device("/cpu:0"):
            # NOTE: Doesn't currently work because the gather_nd gradient is not yet implemented
            # inds = tf.transpose(tf.pack([tf.range(tf.shape(probs)[0]), self._actions]))
            # log_lik = tf.log(tf.gather_nd(probs, inds))
            # Instead, flatten probs and index it to pick out the probability of each taken action.
            idxs_flattened = tf.range(0, tf.shape(probs)[0]) * tf.shape(probs)[1] + self._actions
            probs_vec = tf.gather(tf.reshape(probs, [-1]), idxs_flattened)
            log_lik = tf.log(probs_vec + 1e-8)

        act_op = probs[0, :]
        # Surrogate loss: negative mean of advantage-weighted log-likelihood
        surr_loss = -tf.reduce_mean(log_lik * self._advantages, name="loss_op")
        grads_and_vars = self._opt.compute_gradients(surr_loss)
        train_op = self._opt.apply_gradients(grads_and_vars, name="train_op")

        self._act_op = act_op
        self._loss_op = surr_loss
        self._train_op = train_op

    def act(self, observation):
        # expect observation to be shape (1, in_dim)
        probs = self._sess.run(self._act_op, feed_dict={self._observations: observation})
        # Sample an action index via the inverse CDF: draw one uniform random number
        # and count how many cumulative probabilities fall below it.
        cs = np.cumsum(probs)
        idx = int(np.sum(cs < np.random.rand()))
        return idx

    def train(self, observations, actions, advantages):
        loss, _ = self._sess.run(
            [self._loss_op, self._train_op],
            feed_dict={
                self._observations: observations,
                self._actions: actions,
                self._advantages: advantages,
            })
        return loss
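

# A minimal, illustrative sketch of the continuous-action case described in the module
# docstring (item 2 under "Types of parameterized policies"): the network maps the state
# to the mean of a Gaussian, and a separate state-independent variable holds the log
# standard deviations (diagonal covariance). The name GaussianPolicySketch and its
# attributes are hypothetical; this class is not used anywhere in this script.
class GaussianPolicySketch(object):
    def __init__(self, in_dim, out_dim, hidden_dim):
        self._observations = tf.placeholder(tf.float32, shape=[None, in_dim], name="observations")
        h1 = tf.contrib.layers.fully_connected(self._observations, hidden_dim, activation_fn=tf.tanh)
        # Mean of the action distribution, one value per action dimension (linear output)
        self._mean = tf.contrib.layers.fully_connected(h1, out_dim, activation_fn=None)
        # Log standard deviation shared across all states ("does not depend on s")
        self._log_std = tf.Variable(tf.zeros([out_dim]), name="log_std")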


class PolicyOptimizer(object):
    def __init__(self, env, policy, baseline, n_iter, n_episode, path_length,
                 gamma=.99):
        self.policy = policy
        self.baseline = baseline
        self.env = env
        self.n_iter = n_iter
        self.n_episode = n_episode
        self.path_length = path_length
        self.gamma = gamma

    def sample_path(self):
        obs = []
        actions = []
        rewards = []
        ob = self.env.reset()
        for _ in range(self.path_length):
            a = self.policy.act(ob.reshape(1, -1))
            next_ob, r, done, _ = self.env.step(a)
            obs.append(ob)
            actions.append(a)
            rewards.append(r)
            ob = next_ob
            if done:
                break
        return dict(
            observations=np.array(obs),
            actions=np.array(actions),
            rewards=np.array(rewards),
        )

    def process_paths(self, paths):
        # Compute discounted returns and advantages for each sampled path,
        # then concatenate everything into flat training arrays.
        for p in paths:
            # TODO: compute baseline
            # b = self.baseline.predict(p)
            b = 0
            r = discount_cumsum(p["rewards"], self.gamma)
            a = r - b

            p["returns"] = r
            # p["advantages"] = (a - a.mean()) / (a.std() + 1e-8)  # normalize
            p["advantages"] = a
            p["baselines"] = b

        obs = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        rewards = np.concatenate([p["rewards"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        # TODO: fit baseline
        # self.baseline.fit(paths)

        return dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
        )

    def train(self):
        for i in range(1, self.n_iter + 1):
            paths = []
            for _ in range(self.n_episode):
                paths.append(self.sample_path())
            data = self.process_paths(paths)
            loss = self.policy.train(data["observations"], data["actions"], data["advantages"])
            avg_return = np.mean([sum(p["rewards"]) for p in paths])
            print("Iteration {}: Loss = {}, Average Return = {}".format(i, loss, avg_return))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_iter', default=100, type=int, help='number of iterations')
    parser.add_argument('--n_episode', default=100, type=int, help='number of episodes per iteration')
    parser.add_argument('--path_length', default=200, type=int, help='maximum number of steps per episode')
    parser.add_argument('--learning_rate', default=0.01, type=float, help='learning rate for the Adam optimizer')
    parser.add_argument('--env', default='CartPole-v0', help='gym environment for training')
    parser.add_argument('--algorithm', default='VPG', help='algorithm identifier')
    parser.add_argument('--outdir', default='vpg', type=str, help='output directory where results are saved (/tmp/{outdir}-{env})')
    parser.add_argument('--upload', action='store_true', help='upload results via the OpenAI Gym API')
    parser.add_argument('--seed', default=0, type=int, help='random seed')
    args = parser.parse_args()

    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    env = gym.make(args.env)
    outdir = '/tmp/' + args.outdir + '-' + args.env
    env.monitor.start(outdir, force=True)
    print("******* WILL SAVE RESULTS TO", outdir, " *******")

    sess = tf.Session()

    in_dim = flatten_space(env.observation_space)
    out_dim = flatten_space(env.action_space)
    hidden_dim = 8

    opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    policy = CategoricalPolicy(in_dim, out_dim, hidden_dim, opt, sess)
    po = PolicyOptimizer(env, policy, 0, args.n_iter, args.n_episode, args.path_length)

    sess.run(tf.initialize_all_variables())

    # train the policy optimizer
    po.train()
    env.monitor.close()

    # make sure to set up your OPENAI_GYM_API_KEY environment variable
    if args.upload:
        gym.upload(outdir, algorithm_id=args.algorithm)