@ricrogz
Created August 10, 2016 12:53
OpenAI -- CartPole-v0
#!/usr/bin/env python2
"""
I am new both to OpenAI and AI itself.
I have to (shamelessly) admit that I have just taken the code uploaded by 'shanest'
from his write-up (https://gym.openai.com/evaluations/eval_RE77KlTNTvCGzTgtm0Qqg),
studied it, and lightly tweaked it with my own ideas.
You know, the first step to learning is imitation. Next time I'll upload a solution of my own.
"""
import gym
import numpy as np
import tensorflow as tf

goalruns = 500  # Successful runs after which to exit.
runlimit = 200  # Number of steps after which the simulation will be cut.
                # Also used for statistics.
thr = 195. * 2  # 'Success threshold': points are awarded not only over time,
                # but also from position and from 'surviving' for over 'runlimit' steps.

hidden_layers = [16]  # Hidden layer structure (yes, I tried more than one hidden layer).

# Whether to render images and/or monitor execution.
RENDER = False
MONITOR = True
# Monitor & model saving paths and filenames.
filepath = './my/'
modelfile = 'model-rev1.ckpt'
ckptfile = 'ckpt-rev1'
monitorfile = 'monitor-cartpole-v0'
######################################
class PolicyGradientAgent(object):

    def __init__(self, hparams, sess):
        # initialization
        self._s = sess

        # build the graph
        self._input = tf.placeholder(tf.float32,
                                     shape=[None, hparams['input_size']])

        hidden = []
        for i in xrange(len(hparams['hidden_size'])):
            if i == 0:
                layer_inputs = self._input
            else:
                layer_inputs = hidden[-1]
            hidden.append(tf.contrib.layers.fully_connected(
                inputs=layer_inputs,
                num_outputs=hparams['hidden_size'][i],
                activation_fn=tf.nn.relu,
                weights_initializer=tf.random_normal)
            )

        logits = tf.contrib.layers.fully_connected(
            inputs=hidden[-1],
            num_outputs=hparams['num_actions'],
            activation_fn=None)

        # Predict ###################
        # get log probabilities
        log_prob = tf.log(tf.nn.softmax(logits))

        # op to sample an action
        self._sample = tf.reshape(tf.multinomial(log_prob, 1), [])
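        # Note: tf.multinomial interprets each row of its input as unnormalized
        # log-probabilities, so feeding log(softmax(logits)) samples from the same
        # distribution as the logits themselves; tf.reshape(..., []) collapses the
        # [1, 1] sample into a scalar action index.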

        # Train #####################
        # training part of graph
        self._acts = tf.placeholder(tf.int32)
        self._advantages = tf.placeholder(tf.float32)

        # get log probs of actions from episode
        indices = tf.range(0, tf.shape(log_prob)[0]) * tf.shape(log_prob)[1] + self._acts
        act_prob = tf.gather(tf.reshape(log_prob, [-1]), indices)

        # surrogate loss
        loss = -tf.reduce_sum(tf.mul(act_prob, self._advantages))
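        # The two lines above flatten log_prob and index it as
        # (row * num_actions + action) to pick out log pi(a_t | s_t) for each
        # step; the surrogate loss -sum_t log pi(a_t | s_t) * A_t is the
        # REINFORCE objective, so minimizing it follows the policy gradient.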

        # update
        optimizer = tf.train.RMSPropOptimizer(hparams['learning_rate'])
        self._train = optimizer.minimize(loss)

        # prepare saver ####
        self.saver = tf.train.Saver()

    def act(self, observation):
        # get one action, by sampling
        return self._s.run(self._sample, feed_dict={self._input: [observation]})

    def train_step(self, obs, acts, advantages):
        batch_feed = {self._input: obs,
                      self._acts: acts,
                      self._advantages: advantages}
        self._s.run(self._train, feed_dict=batch_feed)


def policy_rollout(env, agent, limit):
    """Run one episode."""

    observation, reward, done = env.reset(), 0, False
    obs, acts, rews = [], [], []

    c = 0
    while not done and c < limit:
        c += 1

        if RENDER:
            env.render()

        obs.append(observation)

        action = agent.act(observation)
        observation, reward, done, _ = env.step(action)

        acts.append(action)
        rews.append(reward)

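    # Bonus scheme (the CartPole-v0 observation is [x, x_dot, theta, theta_dot]):
    # if the episode ended because the step limit was hit rather than by failure,
    # add 1, and if the final pole angular velocity is small, also add the
    # fraction of steps the cart spent within 0.5 units of the centre.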
    # give prize if we exited due to boredom
    bonus = 0.
    if not done:
        bonus += 1
        if abs(observation[-1]) < 1.:
            bonus += sum([1. for o in obs if abs(o[0]) <= 0.5]) / len(obs)

    return obs, acts, rews, bonus


def process_rewards(rews, obs, env, bonus):
"""Rewards -> Advantages for one episode. """
    i = 2  # 0 == x ; 2 == angle
    dev2 = sum([abs(o[i]) for o in obs]) / env.observation_space.high[i]
    i = 0
    dev0 = sum([abs(o[i]) for o in obs]) / env.observation_space.high[i]

    # total reward: length of episode
    return [len(rews) * (1. + bonus) - dev0] * len(rews)
    # return [len(rews)] * len(rews)


def main():
    env = gym.make('CartPole-v0')
    if MONITOR:
        env.monitor.start(filepath + monitorfile, force=True)
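    # Note: env.monitor.start()/close() is the 2016-era gym monitor API; in
    # later gym releases it was replaced by the gym.wrappers.Monitor wrapper.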

    # hyper parameters
    hparams = {
        'input_size': env.observation_space.shape[0],
        'hidden_size': hidden_layers,
        'num_actions': env.action_space.n,
        'learning_rate': 0.1
    }

    # environment params
    eparams = {
        'num_batches': 100,
        'ep_per_batch': 10
    }

    streak = 0
    avg_score = []

    with tf.Graph().as_default(), tf.Session() as sess:

        agent = PolicyGradientAgent(hparams, sess)

        sess.run(tf.initialize_all_variables())
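        # tf.initialize_all_variables() is the pre-1.0 TensorFlow name for this
        # op; it was later deprecated in favour of tf.global_variables_initializer().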

        ckpt = tf.train.get_checkpoint_state(filepath, latest_filename=ckptfile)
        if ckpt and ckpt.model_checkpoint_path:
            agent.saver.restore(sess, ckpt.model_checkpoint_path)

        counter = 0
        n_obs, n_acts, n_rews = [], [], []
        for batch in xrange(eparams['num_batches']):
            print '=====\nBATCH {}\n===='.format(batch)

            b_obs, b_acts, b_rews = n_obs, n_acts, n_rews
            n_obs, n_acts, n_rews = [], [], []

            for _ in xrange(eparams['ep_per_batch']):

                obs, acts, rews, bonus = policy_rollout(env, agent, 200)
                counter += 1

                b_obs.extend(obs)
                n_obs.extend(obs)
                b_acts.extend(acts)
                n_acts.extend(acts)

                advantages = process_rewards(rews, obs, env, bonus)
                b_rews.extend(advantages)
                n_rews.extend(advantages)

                if len(obs) >= runlimit and bonus >= 1.:
                    streak += 1
                else:
                    streak = 0

                if len(avg_score) >= goalruns:
                    avg_score.pop(0)
                avg_score.append(advantages[0])
                average = float(sum(avg_score)) / len(avg_score)

                print '{4:6d} Episode steps: {0:4d}; score = {1:7.2f}; in_row = {2:3d}; avg_score = {3:7.2f}; bonus = {5:4.2f}'\
                    .format(len(obs), advantages[0], streak, average, counter, bonus)

                # success!
                if streak >= goalruns:
                    print "Success!! at expt {0}".format(counter - streak)
                    agent.saver.save(sess, filepath + modelfile, latest_filename=ckptfile)
                    return

            # update policy
            # normalize rewards; don't divide by 0
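            # Centring and scaling the advantages to zero mean and unit variance
            # acts as a simple baseline and keeps gradient magnitudes comparable
            # across batches; the 1e-10 guards against a zero standard deviation.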
            b_rews_n = (b_rews - np.mean(b_rews)) / (np.std(b_rews) + 1e-10)
            agent.train_step(b_obs, b_acts, b_rews_n)

            agent.saver.save(sess, filepath + modelfile, latest_filename=ckptfile)

    if MONITOR:
        env.monitor.close()


if __name__ == "__main__":
    main()