@domluna
Last active February 28, 2018 12:08
Vanilla policy gradient, no baseline

Run with defaults

python vpg.py
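
To override the defaults, pass command-line flags (defined via argparse at the bottom of the script), for example:

python vpg.py --env CartPole-v0 --n_iter 200 --n_episode 100 --learning_rate 0.01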

"""
Policy Gradients
1. Sample paths.
2. Process paths (compute advantage, baseline, rewards, etc)
3. Run the paths through the policy (function approximator)
4. Compute gradients/update policy model weights
5. Profit?!?!
How we optimize the policy
--------------------------
L(theta) = sum t=0 to T-1 log policy(action_t | state_t, theta) * A_t
R_t = sum u=t to T gamma^(u-t) * reward_u
B_t = E [ sum u=t to T gamma^(u-t) * reward_u | state_t ]
A_t = R_t - B_t
R_t = discounted return (reward-to-go)
A_t = advantage
B_t = baseline
theta = parameters of our policy, most likely neural network weights.
The baseline can be thought of as the value function (V). When we evaluate the baseline
of a state we're predicting how good our future returns will be given our current state.
So, intuitively, if A_t > 0 the path we sampled is better than the expected return of
paths from the current state. Likewise, if A_t < 0, it's worse. Concretely, if A_t > 0 we want
more paths like that, and if A_t < 0 we want fewer paths like that. Theta is updated during training
to reflect this.
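For example, with rewards [1.0, 1.0, 1.0], gamma = 0.99 and a zero baseline:
R_0 = 1 + 0.99 + 0.99^2 = 2.9701, R_1 = 1.99, R_2 = 1.0, and A_t = R_t since B_t = 0,
so every step of that path gets reinforced.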
Types of parameterized policies
-------------------------------
Map s (state) to an output vector u
1. If the action is from a discrete set, the network maps s to a vector of probabilities (softmax)
2. If the action is continuous, then we map s to the mean/variance of a Gaussian distribution
(diagonal covariance that does not depend on s); a sketch of this case appears after
CategoricalPolicy below.
3. If a is binary valued, we use a single output, the probability of outputting 1 (although
we could also just use approach 1.)
TODO: implement baseline
TODO: implement generalized advantage estimation
"""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from six.moves import range
from gym.spaces import Box, Discrete
from scipy.signal import lfilter
import gym
import tensorflow as tf
import numpy as np
import argparse


def flatten_space(space):
    """Return the flat dimension of a gym space (Box or Discrete)."""
    if isinstance(space, Box):
        return np.prod(space.shape)
    elif isinstance(space, Discrete):
        return space.n
    else:
        raise ValueError("Env must be either Box or Discrete.")


def discount_cumsum(x, gamma):
    # Discounted cumulative sum: out[t] = sum_{u >= t} gamma**(u - t) * x[u]
    return lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
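

# For clarity, an equivalent pure-NumPy version of discount_cumsum. This helper is a
# reference sketch only (not part of the original gist) and is not called anywhere below.
def discount_cumsum_reference(x, gamma):
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        # accumulate from the end of the trajectory backwards
        running = x[t] + gamma * running
        out[t] = running
    return out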


class CategoricalPolicy(object):
    def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
        # Placeholder inputs
        self._observations = tf.placeholder(tf.float32, shape=[None, in_dim], name="observations")
        self._actions = tf.placeholder(tf.int32, name="actions")
        self._advantages = tf.placeholder(tf.float32, name="advantages")

        self._opt = optimizer
        self._sess = session

        # Two-layer network: tanh hidden layer, softmax over actions
        h1 = tf.contrib.layers.fully_connected(self._observations, hidden_dim, activation_fn=tf.tanh)
        probs = tf.contrib.layers.fully_connected(h1, out_dim, activation_fn=tf.nn.softmax)

        # I believe this is faster if on the CPU
        with tf.device("/cpu:0"):
            # NOTE: Doesn't currently work because the gather_nd gradient is not yet implemented
            # inds = tf.transpose(tf.pack([tf.range(tf.shape(probs)[0]), self._actions]))
            # log_lik = tf.log(tf.gather_nd(probs, inds))
            # Instead, flatten probs and index it to pick out the probability of each taken action.
            idxs_flattened = tf.range(0, tf.shape(probs)[0]) * tf.shape(probs)[1] + self._actions
            probs_vec = tf.gather(tf.reshape(probs, [-1]), idxs_flattened)
            log_lik = tf.log(probs_vec + 1e-8)

        act_op = probs[0, :]
        # Surrogate loss: negative mean of advantage-weighted log-likelihood
        surr_loss = -tf.reduce_mean(log_lik * self._advantages, name="loss_op")
        grads_and_vars = self._opt.compute_gradients(surr_loss)
        train_op = self._opt.apply_gradients(grads_and_vars, name="train_op")

        self._act_op = act_op
        self._loss_op = surr_loss
        self._train_op = train_op

    def act(self, observation):
        # expect observation to be shape (1, in_dim)
        probs = self._sess.run(self._act_op, feed_dict={self._observations: observation})
        # Sample an action index via the inverse CDF: draw one uniform random number
        # and count how many cumulative probabilities fall below it.
        cs = np.cumsum(probs)
        idx = int(np.sum(cs < np.random.rand()))
        return idx

    def train(self, observations, actions, advantages):
        loss, _ = self._sess.run(
            [self._loss_op, self._train_op],
            feed_dict={
                self._observations: observations,
                self._actions: actions,
                self._advantages: advantages,
            })
        return loss
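

# A minimal, illustrative sketch of the continuous-action case described in the module
# docstring (item 2 under "Types of parameterized policies"): the network maps the state
# to the mean of a Gaussian, and a separate state-independent variable holds the log
# standard deviations (diagonal covariance). The name GaussianPolicySketch and its
# attributes are hypothetical; this class is not used anywhere in this script.
class GaussianPolicySketch(object):
    def __init__(self, in_dim, out_dim, hidden_dim):
        self._observations = tf.placeholder(tf.float32, shape=[None, in_dim], name="observations")
        h1 = tf.contrib.layers.fully_connected(self._observations, hidden_dim, activation_fn=tf.tanh)
        # Mean of the action distribution, one value per action dimension (linear output)
        self._mean = tf.contrib.layers.fully_connected(h1, out_dim, activation_fn=None)
        # Log standard deviation shared across all states ("does not depend on s")
        self._log_std = tf.Variable(tf.zeros([out_dim]), name="log_std")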


class PolicyOptimizer(object):
    def __init__(self, env, policy, baseline, n_iter, n_episode, path_length,
                 gamma=.99):
        self.policy = policy
        self.baseline = baseline
        self.env = env
        self.n_iter = n_iter
        self.n_episode = n_episode
        self.path_length = path_length
        self.gamma = gamma

    def sample_path(self):
        obs = []
        actions = []
        rewards = []
        ob = self.env.reset()
        for _ in range(self.path_length):
            a = self.policy.act(ob.reshape(1, -1))
            next_ob, r, done, _ = self.env.step(a)
            obs.append(ob)
            actions.append(a)
            rewards.append(r)
            ob = next_ob
            if done:
                break
        return dict(
            observations=np.array(obs),
            actions=np.array(actions),
            rewards=np.array(rewards),
        )

    def process_paths(self, paths):
        # Compute discounted returns and advantages for each sampled path,
        # then concatenate everything into flat training arrays.
        for p in paths:
            # TODO: compute baseline
            # b = self.baseline.predict(p)
            b = 0
            r = discount_cumsum(p["rewards"], self.gamma)
            a = r - b

            p["returns"] = r
            # p["advantages"] = (a - a.mean()) / (a.std() + 1e-8)  # normalize
            p["advantages"] = a
            p["baselines"] = b

        obs = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        rewards = np.concatenate([p["rewards"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        # TODO: fit baseline
        # self.baseline.fit(paths)

        return dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
        )

    def train(self):
        for i in range(1, self.n_iter + 1):
            paths = []
            for _ in range(self.n_episode):
                paths.append(self.sample_path())
            data = self.process_paths(paths)
            loss = self.policy.train(data["observations"], data["actions"], data["advantages"])
            avg_return = np.mean([sum(p["rewards"]) for p in paths])
            print("Iteration {}: Loss = {}, Average Return = {}".format(i, loss, avg_return))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_iter', default=100, type=int, help='number of iterations')
    parser.add_argument('--n_episode', default=100, type=int, help='number of episodes per iteration')
    parser.add_argument('--path_length', default=200, type=int, help='maximum number of steps per episode')
    parser.add_argument('--learning_rate', default=0.01, type=float, help='learning rate for the Adam optimizer')
    parser.add_argument('--env', default='CartPole-v0', help='gym environment for training')
    parser.add_argument('--algorithm', default='VPG', help='algorithm identifier')
    parser.add_argument('--outdir', default='vpg', type=str, help='output directory where results are saved (/tmp/{outdir}-{env})')
    parser.add_argument('--upload', action='store_true', help='upload results via the OpenAI Gym API')
    parser.add_argument('--seed', default=0, type=int, help='random seed')
    args = parser.parse_args()

    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    env = gym.make(args.env)
    outdir = '/tmp/' + args.outdir + '-' + args.env
    env.monitor.start(outdir, force=True)
    print("******* WILL SAVE RESULTS TO", outdir, " *******")

    sess = tf.Session()

    in_dim = flatten_space(env.observation_space)
    out_dim = flatten_space(env.action_space)
    hidden_dim = 8

    opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    policy = CategoricalPolicy(in_dim, out_dim, hidden_dim, opt, sess)
    po = PolicyOptimizer(env, policy, 0, args.n_iter, args.n_episode, args.path_length)

    sess.run(tf.initialize_all_variables())

    # train the policy optimizer
    po.train()
    env.monitor.close()

    # make sure to set up your OPENAI_GYM_API_KEY environment variable
    if args.upload:
        gym.upload(outdir, algorithm_id=args.algorithm)