CartPole-v0
import numpy as np
import tensorflow as tf
import gym
from gym import wrappers
np.random.seed(1)
tf.set_random_seed(1)
env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, '/tmp/cartpole-experiment-1', force=True)
# hyperparameters
H = 20 # number of hidden layer neurons
batch_size = 1 # every how many episodes to do a param update?
learning_rate = 5e-2
gamma = 0.999 # discount factor for reward
D = 4 # input dimensionality
tf.reset_default_graph()
# This defines the policy network: it maps an observation of the environment to a
# probability of choosing the action of moving left or right.
observations = tf.placeholder(tf.float32, [None,D] , name="input_x")
W1 = tf.get_variable("W1", shape=[D, H],
                     initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations,W1))
W2 = tf.get_variable("W2", shape=[H, 1],
                     initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1,W2)
probability = tf.nn.sigmoid(score)
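# probability is the network's estimate of P(action = 1 | observation); the other
# action (0) implicitly gets probability 1 - probability.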
#From here we define the parts of the network needed for learning a good policy.
variables = tf.trainable_variables()
input_y = tf.placeholder(tf.float32,[None,1], name="input_y")
advantages = tf.placeholder(tf.float32,name="reward")
# The loss function. This sends the weights in the direction of making actions that gave
# good advantage (reward over time) more likely, and actions that didn't less likely.
# Log-likelihood of the chosen action for the two-class (Bernoulli) policy.
loglik = input_y * tf.log(probability) + (1 - input_y) * tf.log(1 - probability)
# Advantage-weighted log-likelihood, see http://karpathy.github.io/2016/05/31/rl/
loss = -tf.reduce_mean(loglik * advantages)
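# Written out, this is the score-function (REINFORCE) objective for a Bernoulli policy:
#   loss = -mean_t[ A_t * ( y_t*log(p_t) + (1 - y_t)*log(1 - p_t) ) ],
# so minimizing it performs approximate gradient ascent on expected discounted reward.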
newGrads = tf.gradients(loss,variables)
#Once we have collected a series of gradients from multiple episodes, we apply them.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.99, beta2=0.99)
# Adam seems to be superior to RMSProp here.
# rmsprop = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=0.99, momentum=0.9, epsilon=1e-6)
# Placeholders to send the final gradients through when we update.
W1Grad = tf.placeholder(tf.float32, name="batch_grad1")
W2Grad = tf.placeholder(tf.float32, name="batch_grad2")
batchGrad = [W1Grad, W2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad,variables))
#updateGrads = rmsprop.apply_gradients(zip(batchGrad,variables))
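# The update pattern used below: newGrads computes the gradient of the loss for a
# single episode, those gradients are summed into gradBuffer, and every batch_size
# episodes the accumulated gradients are fed back in through W1Grad/W2Grad and
# applied in a single Adam step.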
def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
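# For example, with gamma = 0.999, discount_rewards(np.array([1., 1., 1.])) returns
# approximately [2.997, 1.999, 1.0]: each step is credited with the discounted sum of
# the rewards that follow it.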
xs,hs,dlogps,drs,ys,tfps = [],[],[],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 1
total_episodes = 10000
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    observation = env.reset()  # Obtain an initial observation of the environment
    # Reset the gradient buffer. We will collect gradients in gradBuffer until we
    # are ready to update our policy network.
    gradBuffer = sess.run(variables)
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
    while episode_number <= total_episodes:
        # Make sure the observation is in a shape the network can handle.
        x = np.reshape(observation, [1, D])
        # Run the policy network and get an action to take.
        tfprob = sess.run(probability, feed_dict={observations: x})
        # Sample the action from a Bernoulli distribution with parameter tfprob.
        action = 1 if np.random.uniform() < tfprob else 0
        xs.append(x)  # observation
        y = 1 if action == 1 else 0  # a "fake label"
        ys.append(y)
        # Step the environment and get new measurements.
        observation, reward, done, info = env.step(action)
        reward_sum += reward
        # Record the reward (has to be done after we call step() to get the reward for the previous action).
        drs.append(reward)
        if done:
            episode_number += 1
            # Stack together all inputs, fake labels, and rewards for this episode.
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            tfp = tfps
            xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []  # reset array memory
            # Compute the discounted reward backwards through time.
            discounted_epr = discount_rewards(epr)
            # Center and scale the rewards to be unit normal (helps control the gradient estimator variance).
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= np.std(discounted_epr)
            # Get the gradient for this episode, and save it in gradBuffer.
            tGrad = sess.run(newGrads, feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
            for ix, grad in enumerate(tGrad):
                gradBuffer[ix] += grad
            # If we have completed enough episodes, update the policy network with our accumulated gradients.
            if episode_number % batch_size == 0:
                sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1]})
                for ix, grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0
                # Give a summary of how well the network is doing for each batch of episodes.
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                print('Average reward for episode %f. Total average reward %f.' % (reward_sum / batch_size, running_reward / batch_size))
                if running_reward / batch_size > 192:
                    print("Task solved in", episode_number, 'episodes!')
                    break
                reward_sum = 0
            observation = env.reset()

print(episode_number, 'Episodes completed.')
env.close()