
@greydanus
Last active January 29, 2019 23:37
Solution to the Cartpole problem using Policy Gradients in TensorFlow
'''Solution to the Cartpole problem using Policy Gradients in Tensorflow.'''
# written October 2016 by Sam Greydanus
# inspired by gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
import numpy as np
import gym
import tensorflow as tf
# hyperparameters
n_obs = 4 # dimensionality of observations
h = 128 # hidden layer neurons
n_actions = 2 # number of available actions
learning_rate = 1e-2 # how rapidly to update parameters
gamma = .9 # reward discount factor
decay = 0.9 # decay rate for RMSProp gradients
# gamespace
env = gym.make("CartPole-v0")
observation = env.reset()
xs,rs,ys = [],[],[] # environment info
running_reward = 10 # worst case is ~10 for cartpole
reward_sum = 0
episode_number = 0
max_steps = 1000 # should converge around 300
# initialize model
tf_model = {}
with tf.variable_scope('layer_one', reuse=False):
    xavier_l1 = tf.truncated_normal_initializer(mean=0, stddev=1./np.sqrt(n_obs), dtype=tf.float32)
    tf_model['W1'] = tf.get_variable("W1", [n_obs, h], initializer=xavier_l1)
with tf.variable_scope('layer_two', reuse=False):
    xavier_l2 = tf.truncated_normal_initializer(mean=0, stddev=1./np.sqrt(h), dtype=tf.float32)
    tf_model['W2'] = tf.get_variable("W2", [h, n_actions], initializer=xavier_l2)
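# W1 maps the 4-d observation to 128 hidden units; W2 maps those hidden units to 2 action logits.
# The 1/sqrt(fan_in) stddev is a Xavier-style scale for the truncated-normal initialization.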
# tensorflow operations
def tf_discount_rewards(tf_r):  # tf_r ~ [game_steps, 1]
    discount_f = lambda a, v: a*gamma + v
    tf_r_reverse = tf.scan(discount_f, tf.reverse(tf_r, [True, False]))
    tf_discounted_r = tf.reverse(tf_r_reverse, [True, False])
    return tf_discounted_r
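# example: with gamma = 0.9, per-step rewards [1, 1, 1] become discounted returns
# [1 + 0.9 + 0.81, 1 + 0.9, 1] = [2.71, 1.9, 1.0]; earlier steps get credit for later reward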
def tf_policy_forward(x):  # x ~ [1, D]
    h = tf.matmul(x, tf_model['W1'])
    h = tf.nn.relu(h)
    logp = tf.matmul(h, tf_model['W2'])
    p = tf.nn.softmax(logp)
    return p
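# shapes: x [N, n_obs] -> hidden [N, h] after ReLU -> logits [N, n_actions] -> softmax action probabilities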
# tf placeholders
tf_x = tf.placeholder(dtype=tf.float32, shape=[None, n_obs],name="tf_x")
tf_y = tf.placeholder(dtype=tf.float32, shape=[None, n_actions],name="tf_y")
tf_epr = tf.placeholder(dtype=tf.float32, shape=[None,1], name="tf_epr")
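# tf_x holds an episode's stacked observations, tf_y the one-hot actions that were taken,
# and tf_epr the raw per-step rewards (discounted and normalized below)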
# tf reward processing (need tf_discounted_epr for policy gradient wizardry)
tf_discounted_epr = tf_discount_rewards(tf_epr)
tf_mean, tf_variance = tf.nn.moments(tf_discounted_epr, [0], shift=None, name="reward_moments")
tf_discounted_epr -= tf_mean
tf_discounted_epr /= tf.sqrt(tf_variance + 1e-6)
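# standardizing the discounted returns (zero mean, unit variance) keeps the update scale
# roughly constant from episode to episode, which reduces the variance of the gradient estimate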
# tf optimizer op
tf_aprob = tf_policy_forward(tf_x)
loss = tf.nn.l2_loss(tf_y-tf_aprob) # this gradient encourages the actions taken
optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=decay)
tf_grads = optimizer.compute_gradients(loss, var_list=tf.trainable_variables(), grad_loss=tf_discounted_epr)
train_op = optimizer.apply_gradients(tf_grads)
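# note: the intent of grad_loss=tf_discounted_epr is to weight each time step's gradient by its
# (normalized) discounted return, so actions followed by high reward are reinforced more strongly;
# this is what turns the surrogate L2 "loss" above into an approximate policy-gradient update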
# tf graph initialization
sess = tf.InteractiveSession()
tf.initialize_all_variables().run()
# training loop
# stop when running reward exceeds 200 (task is considered solved)
while episode_number <= max_steps and running_reward < 200:
    # if episode_number%50==0: env.render()

    # stochastically sample a policy from the network
    x = observation
    feed = {tf_x: np.reshape(x, (1, -1))}
    aprob = sess.run(tf_aprob, feed)
    aprob = aprob[0, :]  # we live in a batched world :/
    action = np.random.choice(n_actions, p=aprob)
    label = np.zeros_like(aprob) ; label[action] = 1  # make a training 'label'
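    # the one-hot 'label' records which action was actually sampled; roughly speaking, the
    # difference (label - aprob) points toward making that action more likely, and the
    # normalized discounted return later scales (and can flip the sign of) that push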
    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    # record game history
    xs.append(x)
    ys.append(label)
    rs.append(reward)

    if done:
        running_reward = running_reward * 0.99 + reward_sum * 0.01
        epx = np.vstack(xs)
        epr = np.vstack(rs)
        epy = np.vstack(ys)
        xs, rs, ys = [], [], []  # reset game history
        feed = {tf_x: epx, tf_epr: epr, tf_y: epy}
        _ = sess.run(train_op, feed)  # parameter update

        # print some updates
        if episode_number % 25 == 0:
            print('ep: {}, reward: {}, mean reward: {:.3f}'.format(
                episode_number, reward_sum, running_reward))

        # book-keeping
        episode_number += 1
        observation = env.reset()  # reset env
        reward_sum = 0

if running_reward > 200:
    print("ep: {}: SOLVED! (running reward hit {} which is greater than 200)".format(
        episode_number, running_reward))
else:
    print("ep: {}: model did not converge. Try changing the hyperparameters.".format(episode_number))
@bfrasure

For some reason I am getting "model did not converge. Try changing the hyperparameters" with the default script (Python 3.5.2, Windows). Could it be a difference in the MKL?

@parajain

I don't quite follow what is happening in line 82:
label = np.zeros_like(aprob) ; label[action] = 1 # make a training 'label'
Could you please add some explanation? That would help. I have gone through the policy-gradient derivation but cannot relate it to this line.

Thanks

@yutingsz

Thanks for the code, very interesting.
I have a question about line 63:
tf_grads = optimizer.compute_gradients(loss, var_list=tf.trainable_variables(), grad_loss=tf_discounted_epr)
What is the purpose of the parameter grad_loss=tf_discounted_epr? The TF documentation says: "A Tensor holding the gradient computed for loss". May I ask what the difference is between the gradient for the loss and the gradient for the variables?
