@breeko
Created May 4, 2017 10:03
# Policy based reinforcement learning agent used to solve openai's CartPole challenge
# https://gym.openai.com/evaluations/eval_dMY1xQiST7GXe4Br5n31w
import numpy as np
import tensorflow as tf
import gym
ENVIRONMENT = "CartPole-v0"
SEED = 0
LEARNING_RATE = 1e-2
GAMMA = 0.99
DECAY_RATE = 0.99
BATCH_SIZE = 3
NUM_HIDDEN = 10
NUM_EPISODES = 5000
MAX_LEN_EPISODE = 500
PRINT_EVERY = 100
GOAL_REWARD = 195
GOAL_NUM_EPISODES = 100
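# CartPole-v0 counts as "solved" when the average reward over 100 consecutive
# episodes reaches 195, which is what GOAL_REWARD and GOAL_NUM_EPISODES encode.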
RENDER = True # Whether to render after completing
UPLOAD = True # Whether to upload to openai
if UPLOAD:
    from key import api_key
np.random.seed(SEED)
tf.set_random_seed(SEED)
env = gym.make(ENVIRONMENT)
env = gym.wrappers.Monitor(env,directory="videos",force=True)
dimen = env.observation_space.shape[0]
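# For CartPole-v0 the observation is 4-dimensional
# (cart position, cart velocity, pole angle, pole angular velocity), so dimen == 4.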
def discount(r, gamma=0.99, standardize=False):
    """ Takes a 1-d float array of rewards and computes the discounted reward,
        e.g. f([1,1,1], 0.99) -> [1, 0.99, 0.9801]
    """
    discounted = np.array([val * (gamma ** i) for i, val in enumerate(r)])
    if standardize:
        discounted -= np.mean(discounted)
        discounted /= np.std(discounted)
    return discounted
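# Example (not executed): with standardize=True the discounted values are shifted
# to zero mean and unit variance before being used as advantages, e.g.
#   discount(np.array([1.0, 1.0, 1.0]), 0.99, standardize=True)
# returns roughly [1.23, -0.004, -1.22].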
# Define neural network
tf.reset_default_graph()
input_x = tf.placeholder(tf.float32, [None, dimen], name="input_x")
# First layer
W1 = tf.get_variable("W1", shape=[dimen,NUM_HIDDEN], initializer=tf.contrib.layers.xavier_initializer())
layer_1 = tf.nn.relu(tf.matmul(input_x, W1))
# Second layer
W2 = tf.get_variable("W2", shape=[NUM_HIDDEN, 1], initializer=tf.contrib.layers.xavier_initializer())
output = tf.nn.sigmoid(tf.matmul(layer_1, W2))
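# The single sigmoid unit is used below as the probability of choosing action 0;
# action 1 is chosen with probability 1 - output.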
# Placeholders for inputs used in training
input_y = tf.placeholder(tf.float32, shape=[None,1], name="input_y")
advantages = tf.placeholder(tf.float32, shape=[None,1], name="reward_signal")
# Log-likelihood of the action actually taken:
#   input_y == 1 -> log(1 - output); input_y == 0 -> log(output)
log_lik = tf.log(input_y * (input_y - output) + (1 - input_y) * (input_y + output))
loss = -tf.reduce_mean(log_lik * advantages)
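# Weighting each action's log-likelihood by its (standardized, discounted) reward
# and negating the mean gives a REINFORCE-style policy-gradient loss: minimizing
# it pushes the policy toward actions that preceded higher rewards.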
# Gradients
W1_grad = tf.placeholder(tf.float32, name="W1_grad")
W2_grad = tf.placeholder(tf.float32, name="W2_grad")
batch_grad = [W1_grad, W2_grad]
trainable_vars = [W1, W2]
grads = tf.gradients(loss,trainable_vars)
# Optimizer
adam_p = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
update_grads = adam_p.apply_gradients(zip(batch_grad,trainable_vars))
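# The gradients are not applied directly: tf.gradients evaluates them per episode,
# they are summed in numpy (cum_grads below) over BATCH_SIZE episodes, and the
# accumulated values are then fed back in through the W1_grad/W2_grad placeholders
# and applied with Adam.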
# Initialize and test to see if model is setup correctly
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
random_obs = np.random.random(size=[1,dimen])
random_action = env.action_space.sample()
print("obs: {}\naction: {}\noutput policy: {}".format(
random_obs,
random_action,
sess.run(output,feed_dict={input_x: random_obs})))
cum_rewards = []
# Setup arrays used to track episode performance
observations = np.empty(0).reshape(0,dimen)
rewards = np.empty(0).reshape(0,1)
actions = np.empty(0).reshape(0,1)
# Setup array used to track gradients
cum_grads = np.array([np.zeros(var.get_shape().as_list()) for var in trainable_vars])
num_episode = 0
observation = env.reset()
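# Training loop: roll the environment forward one step at a time; at the end of
# each episode convert its rewards into standardized discounted advantages and
# accumulate the policy gradients, applying them every BATCH_SIZE episodes, until
# the mean reward over the last GOAL_NUM_EPISODES episodes reaches GOAL_REWARD.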
while num_episode < NUM_EPISODES:
    observation = observation.reshape(1, -1)

    # Determine policy
    policy = sess.run(output, feed_dict={input_x: observation})

    # Decide on an action based on policy, allowing for some randomness
    action = 0 if policy > np.random.uniform() else 1

    # Keep track of observations and actions
    observations = np.vstack([observations, observation])
    actions = np.vstack([actions, action])

    observation, reward, done, _ = env.step(action)
    rewards = np.vstack([rewards, reward])

    if done or len(observations) > MAX_LEN_EPISODE:
        cum_rewards.append(np.sum(rewards))

        # Discount rewards
        disc_rewards = discount(rewards, standardize=True)

        # Add gradients to running batch
        cum_grads += sess.run(grads, feed_dict={input_x: observations, input_y: actions, advantages: disc_rewards})

        num_episode += 1
        observation = env.reset()

        # Reset everything
        observations = np.empty(0).reshape(0, dimen)
        rewards = np.empty(0).reshape(0, 1)
        actions = np.empty(0).reshape(0, 1)

        if num_episode % BATCH_SIZE == 0:
            # Update gradients
            sess.run(update_grads, feed_dict={W1_grad: cum_grads[0], W2_grad: cum_grads[1]})

            # Reset gradients
            cum_grads = np.array([np.zeros(var.get_shape().as_list()) for var in trainable_vars])

            mean_rewards = np.mean(cum_rewards[-GOAL_NUM_EPISODES:])

            # Print periodically
            if (num_episode % (BATCH_SIZE * PRINT_EVERY)) == 0:
                print("Episode: {} mean reward over last {} episodes: {:0.2f}".format(
                    num_episode, GOAL_NUM_EPISODES, mean_rewards))

            # If our score is good enough, stop
            if mean_rewards >= GOAL_REWARD and num_episode >= GOAL_NUM_EPISODES:
                print("Episode: {} training complete with total mean score of: {}".format(
                    num_episode, mean_rewards))
                break
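# After training, play one final episode greedily (thresholding the policy at 0.5
# instead of sampling) and render it if requested.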
observation = env.reset()
reward_sum = 0
num_step = 0
while num_step < MAX_LEN_EPISODE:
    if RENDER:
        env.render()
    observation = np.reshape(observation, [1, -1])
    policy = sess.run(output, feed_dict={input_x: observation})
    action = 0 if policy > 0.5 else 1
    observation, reward, done, _ = env.step(action)
    reward_sum += reward
    num_step += 1
    if done:
        print("Total score: {}".format(reward_sum))
        break
env.render(close=True)
env.close()
env = env.env.env
if UPLOAD:
    gym.upload("./videos/", api_key=api_key)  # you'll need me later