# Policy based reinforcement learning agent used to solve OpenAI's CartPole challenge
# https://gym.openai.com/evaluations/eval_dMY1xQiST7GXe4Br5n31w
import numpy as np
import tensorflow as tf
import gym

ENVIRONMENT = "CartPole-v0"
SEED = 0
LEARNING_RATE = 1e-2
GAMMA = 0.99
DECAY_RATE = 0.99
BATCH_SIZE = 3
NUM_HIDDEN = 10
NUM_EPISODES = 5000
MAX_LEN_EPISODE = 500
PRINT_EVERY = 100
GOAL_REWARD = 195
GOAL_NUM_EPISODES = 100
RENDER = True   # Whether to render after training completes
UPLOAD = True   # Whether to upload the results to OpenAI

if UPLOAD:
    from key import api_key

np.random.seed(SEED)
tf.set_random_seed(SEED)

env = gym.make(ENVIRONMENT)
env = gym.wrappers.Monitor(env, directory="videos", force=True)

dimen = env.observation_space.shape[0]
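
# For CartPole-v0 the observation is 4-dimensional:
# cart position, cart velocity, pole angle and pole angular velocity.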
def discount(r, gamma=0.99, standardize=False):
    """ Takes a 1-d float array of rewards and computes the discounted reward,
    e.g. f([1, 1, 1], 0.99) -> [1, 0.99, 0.9801]
    """
    discounted = np.array([val * (gamma ** i) for i, val in enumerate(r)])
    if standardize:
        discounted -= np.mean(discounted)
        discounted /= np.std(discounted)
    return discounted
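
# Note: standardizing the discounted rewards to zero mean / unit variance is a
# common variance-reduction trick for policy gradients; it is what
# discount(..., standardize=True) does in the training loop below.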
# Define neural network
tf.reset_default_graph()

input_x = tf.placeholder(tf.float32, [None, dimen], name="input_x")

# First layer
W1 = tf.get_variable("W1", shape=[dimen, NUM_HIDDEN], initializer=tf.contrib.layers.xavier_initializer())
layer_1 = tf.nn.relu(tf.matmul(input_x, W1))

# Second layer
W2 = tf.get_variable("W2", shape=[NUM_HIDDEN, 1], initializer=tf.contrib.layers.xavier_initializer())
output = tf.nn.sigmoid(tf.matmul(layer_1, W2))
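
# The single sigmoid output is used below as the probability of choosing
# action 0; action 1 is therefore chosen with probability 1 - output.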
# Placeholders for inputs used in training
input_y = tf.placeholder(tf.float32, shape=[None, 1], name="input_y")
advantages = tf.placeholder(tf.float32, shape=[None, 1], name="reward_signal")

# Log-likelihood of the action actually taken (input_y) under the current policy
log_lik = tf.log(input_y * (input_y - output) + (1 - input_y) * (input_y + output))
loss = -tf.reduce_mean(log_lik * advantages)
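
# Breakdown of the surrogate loss above, following directly from the expression:
#   input_y == 0 (action 0 taken): log_lik = log(output)     = log P(action 0)
#   input_y == 1 (action 1 taken): log_lik = log(1 - output) = log P(action 1)
# Weighting the log-probability of each taken action by its (standardized)
# discounted reward and negating gives the REINFORCE objective minimized below.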
# Gradients
W1_grad = tf.placeholder(tf.float32, name="W1_grad")
W2_grad = tf.placeholder(tf.float32, name="W2_grad")
batch_grad = [W1_grad, W2_grad]
trainable_vars = [W1, W2]
grads = tf.gradients(loss, trainable_vars)

# Optimizer
adam_p = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
update_grads = adam_p.apply_gradients(zip(batch_grad, trainable_vars))
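
# Gradients are computed per episode via `grads`, accumulated in `cum_grads`
# in the training loop, and only applied through the W1_grad/W2_grad
# placeholders every BATCH_SIZE episodes, so Adam updates on a small batch of
# episodes rather than after every single one.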
# Initialize and test to see if the model is set up correctly
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

random_obs = np.random.random(size=[1, dimen])
random_action = env.action_space.sample()
print("obs: {}\naction: {}\noutput policy: {}".format(
    random_obs,
    random_action,
    sess.run(output, feed_dict={input_x: random_obs})))

cum_rewards = []

# Set up arrays used to track episode performance
observations = np.empty(0).reshape(0, dimen)
rewards = np.empty(0).reshape(0, 1)
actions = np.empty(0).reshape(0, 1)

# Set up array used to track gradients
cum_grads = np.array([np.zeros(var.get_shape().as_list()) for var in trainable_vars])
num_episode = 0
observation = env.reset()

while num_episode < NUM_EPISODES:
    observation = observation.reshape(1, -1)

    # Determine policy
    policy = sess.run(output, feed_dict={input_x: observation})

    # Sample an action from the policy: action 0 with probability `policy`, else action 1
    action = 0 if policy > np.random.uniform() else 1

    # Keep track of observations and actions
    observations = np.vstack([observations, observation])
    actions = np.vstack([actions, action])

    observation, reward, done, _ = env.step(action)
    rewards = np.vstack([rewards, reward])

    if done or len(observations) > MAX_LEN_EPISODE:
        cum_rewards.append(np.sum(rewards))

        # Discount rewards
        disc_rewards = discount(rewards, standardize=True)

        # Add gradients to running batch
        cum_grads += sess.run(grads, feed_dict={input_x: observations, input_y: actions, advantages: disc_rewards})

        num_episode += 1
        observation = env.reset()

        # Reset everything
        observations = np.empty(0).reshape(0, dimen)
        rewards = np.empty(0).reshape(0, 1)
        actions = np.empty(0).reshape(0, 1)

        if num_episode % BATCH_SIZE == 0:
            # Update gradients
            sess.run(update_grads, feed_dict={W1_grad: cum_grads[0], W2_grad: cum_grads[1]})

            # Reset gradients
            cum_grads = np.array([np.zeros(var.get_shape().as_list()) for var in trainable_vars])

            mean_rewards = np.mean(cum_rewards[-GOAL_NUM_EPISODES:])

            # Print periodically
            if (num_episode % (BATCH_SIZE * PRINT_EVERY)) == 0:
                print("Episode: {} mean reward over last {} episodes: {:0.2f}".format(
                    num_episode, GOAL_NUM_EPISODES, mean_rewards))

            # If our score is good enough, stop
            if mean_rewards >= GOAL_REWARD and num_episode >= GOAL_NUM_EPISODES:
                print("Episode: {} training complete with total mean score of: {}".format(
                    num_episode, mean_rewards))
                break
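
# After training, run one final episode with the greedy policy
# (action 0 whenever output > 0.5), optionally rendering it.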
observation = env.reset()
reward_sum = 0
num_step = 0

while num_step < MAX_LEN_EPISODE:
    if RENDER:
        env.render()
    observation = np.reshape(observation, [1, -1])
    policy = sess.run(output, feed_dict={input_x: observation})
    action = 0 if policy > 0.5 else 1
    observation, reward, done, _ = env.step(action)
    reward_sum += reward
    num_step += 1
    if done:
        print("Total score: {}".format(reward_sum))
        break

env.render(close=True)
env.close()
env = env.env.env

if UPLOAD:
    gym.upload("./videos/", api_key=api_key)  # api_key is imported from key.py above