Solve CartPole-v0 with TensorFlow (vanilla policy gradient)
# Solve CartPole-v0 with a vanilla policy gradient (REINFORCE-style).
# Written against the TF 1.x API (tf.placeholder / sessions) and the
# classic gym interface where env.step() returns (obs, reward, done, info).
import tensorflow as tf
import numpy as np
import gym
import matplotlib.pyplot as plt
# hyperparameters
H = 10  # number of hidden-layer neurons
learning_rate = 1e-3
gamma = 0.99  # discount factor for rewards

# model dimensions
D = 4  # input dimensionality (size of a CartPole observation)
C = 2  # number of actions (classes)
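# Quick sanity check (a sketch, not part of the original script): D and C
# mirror CartPole-v0's spaces and can be verified once an env exists:
#   env = gym.make("CartPole-v0")
#   assert env.observation_space.shape == (4,)  # cart pos, cart vel, pole angle, pole velocity
#   assert env.action_space.n == 2              # push cart left / push cart right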
def policy_gradient():
    """Build a two-layer softmax policy network and its training op."""
    with tf.variable_scope("policy"):
        state = tf.placeholder(tf.float32, [None, D])
        actions = tf.placeholder(tf.int32, [None, 1])
        advantages = tf.placeholder(tf.float32, [None, 1])
        params_w1 = tf.get_variable("policy_parameters_w1", [D, H])
        params_b1 = tf.get_variable("policy_parameters_b1", [H])
        params_w2 = tf.get_variable("policy_parameters_w2", [H, C])
        params_b2 = tf.get_variable("policy_parameters_b2", [C])
        hidden = tf.nn.relu(tf.matmul(state, params_w1) + params_b1)
        probabilities = tf.nn.softmax(tf.matmul(hidden, params_w2) + params_b2)
        # -log pi(a|s) for each taken action, shape [None, 1]. actions is
        # [None, 1], so flatten it before one-hot encoding and reduce over
        # axis=1 (the class axis) to keep one value per time step.
        neg_log_prob = tf.reduce_sum(
            -tf.log(probabilities) * tf.one_hot(tf.reshape(actions, [-1]), C),
            axis=1, keep_dims=True)
        # minimizing E[-log pi(a|s) * advantage] ascends the policy objective
        loss = tf.reduce_mean(neg_log_prob * advantages)
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        return probabilities, state, actions, advantages, optimizer
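# In equation form, the training op above minimizes
#   loss = -(1/N) * sum_i log(pi(a_i | s_i)) * A_i,
# whose gradient is the REINFORCE estimator E[grad log pi * A].
# A minimal shape trace (sketch) for a batch of N transitions:
#   state      (N, 4) -> hidden  (N, 10) -> probabilities (N, 2)
#   actions    (N, 1) -> one_hot (N, 2)  -> neg_log_prob  (N, 1)
#   advantages (N, 1) -> loss    scalar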
def discount_rewards(r):
    """Take a 1D float array of rewards and compute discounted returns."""
    discounted_r = np.zeros_like(r, dtype=np.float64)
    running_add = 0
    for t in reversed(range(len(r))):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    # standardize so the advantages have zero mean and unit variance
    discounted_r -= np.mean(discounted_r)
    discounted_r /= np.std(discounted_r)
    return discounted_r
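# Worked example (sketch): with gamma = 0.99 and r = [1.0, 1.0, 1.0], the
# backward pass yields returns [2.9701, 1.99, 1.0] before standardization:
#   t=2: 1.0,  t=1: 1.0*0.99 + 1 = 1.99,  t=0: 1.99*0.99 + 1 = 2.9701.
# Subtracting the mean and dividing by the std then centers and scales them.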
def choose_action(prob):
    # sample an action according to the policy's action probabilities
    action = np.random.choice(len(prob), p=prob)
    return action
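# e.g. choose_action([0.7, 0.3]) returns 0 with probability 0.7 and 1 with
# probability 0.3, so exploration falls out of the softmax policy itself.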
env = gym.make("CartPole-v0")
policy_grad = policy_gradient()
sess = tf.InteractiveSession()
init = tf.global_variables_initializer()
sess.run(init)
reward_trend = []
pl_calculated, pl_state, pl_actions, pl_advantages, pl_optimizer = policy_grad
for episode_number in range(3000):
    observation = env.reset()
    feed_states, feed_actions, feed_rewards = [], [], []
    reward_sum = 0
    for _ in range(300):
        state = observation  # shape (D,)
        # forward pass: action probabilities, shape (1, C)
        aprob = sess.run(pl_calculated, feed_dict={pl_state: np.reshape(state, (1, D))})
        action = choose_action(aprob[0])  # sample from the current policy
        feed_states.append(state)
        feed_actions.append(action)
        # step the environment and get new measurements
        observation, reward, done, info = env.step(action)
        reward_sum += reward
        feed_rewards.append(reward)
        if done:  # episode finished
            if episode_number % 10 == 0:
                print("episode {} done, reward_sum: {}".format(episode_number, reward_sum))
            reward_trend.append(reward_sum)
            # discounted, standardized returns serve as the advantages
            feed_advantages = discount_rewards(feed_rewards)
            sess.run(pl_optimizer, feed_dict={pl_state: np.vstack(feed_states),
                                              pl_advantages: np.vstack(feed_advantages),
                                              pl_actions: np.vstack(feed_actions)})
            break
plt.plot(reward_trend)
plt.show()
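# Optional follow-up (a sketch, not in the original gist): after training,
# roll out the greedy policy once to see how long the learned controller
# balances the pole. Reuses pl_calculated / pl_state unpacked above.
observation = env.reset()
eval_reward = 0
for _ in range(300):
    aprob = sess.run(pl_calculated, feed_dict={pl_state: np.reshape(observation, (1, D))})
    observation, reward, done, _ = env.step(int(np.argmax(aprob[0])))  # greedy action
    eval_reward += reward
    if done:
        break
print("greedy evaluation reward: {}".format(eval_reward))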