OpenAI -- CartPole-v0
#!/usr/bin/env python2
"""
I am new both to OpenAI and to AI itself.
I have to (shamelessly) admit that I have just taken the code uploaded by 'shanest'
from his write-up (https://gym.openai.com/evaluations/eval_RE77KlTNTvCGzTgtm0Qqg),
studied it, and lightly tweaked it with my own ideas.
You know, the first step to learning is imitation. Next time I'll upload a solution of my own.
"""
import gym
import numpy as np
import tensorflow as tf
goalruns = 500   # Successful runs after which to exit.
runlimit = 200   # Number of steps after which the simulation will be cut.
                 # Also used for statistics.
thr = 195. * 2   # 'Success threshold': points are awarded not only over time,
                 # but also from position and from 'surviving' for over 'runlimit' steps.

hidden_layers = [16]  # Hidden layer structure (yes, I tried more than one hidden layer).

# Whether to render images and/or monitor execution.
RENDER = False
MONITOR = True

# Monitor & model saving paths and filenames.
filepath = './my/'
modelfile = 'model-rev1.ckpt'
ckptfile = 'ckpt-rev1'
monitorfile = 'monitor-cartpole-v0'
######################################


class PolicyGradientAgent(object):

    def __init__(self, hparams, sess):

        # initialization
        self._s = sess

        # build the graph
        self._input = tf.placeholder(tf.float32,
                                     shape=[None, hparams['input_size']])

        hidden = []
        for i in xrange(len(hparams['hidden_size'])):
            if i == 0:
                layer_inputs = self._input
            else:
                layer_inputs = hidden[-1]
            hidden.append(tf.contrib.layers.fully_connected(
                inputs=layer_inputs,
                num_outputs=hparams['hidden_size'][i],
                activation_fn=tf.nn.relu,
                weights_initializer=tf.random_normal_initializer())
            )
        logits = tf.contrib.layers.fully_connected(
            inputs=hidden[-1],
            num_outputs=hparams['num_actions'],
            activation_fn=None)
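        # The graph maps a CartPole observation through the ReLU hidden
        # layer(s) to one unnormalized score (logit) per available action.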
        # Predict ###################
        # get log probabilities
        log_prob = tf.log(tf.nn.softmax(logits))

        # op to sample an action
        self._sample = tf.reshape(tf.multinomial(log_prob, 1), [])
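        # tf.multinomial draws one action index per row of log_prob; the
        # reshape to [] turns the [1, 1] result into a scalar action.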
        # Train #####################
        # training part of graph
        self._acts = tf.placeholder(tf.int32)
        self._advantages = tf.placeholder(tf.float32)

        # get log probs of actions from episode
        indices = tf.range(0, tf.shape(log_prob)[0]) * tf.shape(log_prob)[1] + self._acts
        act_prob = tf.gather(tf.reshape(log_prob, [-1]), indices)
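        # The flatten-and-gather above selects log_prob[t, acts[t]] for every
        # timestep t, i.e. the log probability of the action actually taken.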
        # surrogate loss
        loss = -tf.reduce_sum(tf.mul(act_prob, self._advantages))

        # update
        optimizer = tf.train.RMSPropOptimizer(hparams['learning_rate'])
        self._train = optimizer.minimize(loss)
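        # Minimizing -sum(log pi(a_t | s_t) * advantage_t) is the REINFORCE
        # surrogate loss: its gradient is the advantage-weighted policy gradient.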
        # prepare saver ####
        self.saver = tf.train.Saver()

    def act(self, observation):
        # get one action, by sampling
        return self._s.run(self._sample, feed_dict={self._input: [observation]})

    def train_step(self, obs, acts, advantages):
        batch_feed = {self._input: obs,
                      self._acts: acts,
                      self._advantages: advantages}
        self._s.run(self._train, feed_dict=batch_feed)

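# The agent is used in two ways below: act() samples a single action during a
# rollout, and train_step() applies one policy-gradient update to a batch of
# (observation, action, advantage) triples.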
def policy_rollout(env, agent, limit):
    """Run one episode."""

    observation, reward, done = env.reset(), 0, False
    obs, acts, rews = [], [], []

    c = 0
    while not done and c < limit:
        c += 1
        if RENDER:
            env.render()
        obs.append(observation)
        action = agent.act(observation)
        observation, reward, done, _ = env.step(action)
        acts.append(action)
        rews.append(reward)
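    # CartPole-v0 observations are [cart position, cart velocity, pole angle,
    # pole angular velocity]; the bonus below is only granted when the episode
    # reaches the step limit instead of ending in a failure.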
    # give a prize if we exited due to boredom
    bonus = 0.
    if not done:
        bonus += 1
        if abs(observation[-1]) < 1.:
            bonus += sum([1. for o in obs if abs(o[0]) <= 0.5]) / len(obs)

    return obs, acts, rews, bonus

def process_rewards(rews, obs, env, bonus):
    """Rewards -> Advantages for one episode."""

    i = 2  # 0 == x ; 2 == angle
    dev2 = sum([abs(o[i]) for o in obs]) / env.observation_space.high[i]
    i = 0
    dev0 = sum([abs(o[i]) for o in obs]) / env.observation_space.high[i]

    # total reward: length of episode
    return [len(rews) * (1. + bonus) - dev0] * len(rews)
    # return [len(rews)] * len(rews)
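    # Note: every timestep of the episode receives the same advantage value,
    # and dev2 (the accumulated pole-angle deviation) is computed but unused.
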
def main():

    env = gym.make('CartPole-v0')
    if MONITOR:
        env.monitor.start(filepath + monitorfile, force=True)
    # hyper parameters
    hparams = {
        'input_size': env.observation_space.shape[0],
        'hidden_size': hidden_layers,
        'num_actions': env.action_space.n,
        'learning_rate': 0.1
    }

    # environment params
    eparams = {
        'num_batches': 100,
        'ep_per_batch': 10
    }
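    # 100 batches of 10 episodes each -> at most 1000 rollouts per run.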
    streak = 0
    avg_score = []

    with tf.Graph().as_default(), tf.Session() as sess:

        agent = PolicyGradientAgent(hparams, sess)
        sess.run(tf.initialize_all_variables())

        ckpt = tf.train.get_checkpoint_state(filepath, latest_filename=ckptfile)
        if ckpt and ckpt.model_checkpoint_path:
            agent.saver.restore(sess, ckpt.model_checkpoint_path)
        counter = 0
        n_obs, n_acts, n_rews = [], [], []
        for batch in xrange(eparams['num_batches']):

            print '=====\nBATCH {}\n===='.format(batch)

            b_obs, b_acts, b_rews = n_obs, n_acts, n_rews
            n_obs, n_acts, n_rews = [], [], []
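            # b_* alias the previous batch's buffers and are extended below with
            # the current episodes, so each policy update is computed over a
            # sliding window of the last two batches of rollouts.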
            for _ in xrange(eparams['ep_per_batch']):

                obs, acts, rews, bonus = policy_rollout(env, agent, 200)
                counter += 1

                b_obs.extend(obs)
                n_obs.extend(obs)
                b_acts.extend(acts)
                n_acts.extend(acts)

                advantages = process_rewards(rews, obs, env, bonus)
                b_rews.extend(advantages)
                n_rews.extend(advantages)
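                # 'streak' counts consecutive episodes that hit the step limit
                # with a full survival bonus; 'goalruns' such episodes in a row
                # are treated as success and stop training.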
                if len(obs) >= runlimit and bonus >= 1.:
                    streak += 1
                else:
                    streak = 0

                if len(avg_score) >= goalruns:
                    avg_score.pop(0)
                avg_score.append(advantages[0])
                average = float(sum(avg_score)) / len(avg_score)

                print '{4:6d} Episode steps: {0:4d}; score = {1:7.2f}; in_row = {2:3d}; avg_score = {3:7.2f}; bonus = {5:4.2f}'\
                    .format(len(obs), advantages[0], streak, average, counter, bonus)
                # success!
                if streak >= goalruns:
                    print "Success!! at expt {0}".format(counter - streak)
                    agent.saver.save(sess, filepath + modelfile, latest_filename=ckptfile)
                    return
            # update policy
            # normalize rewards; don't divide by 0
            b_rews_n = (b_rews - np.mean(b_rews)) / (np.std(b_rews) + 1e-10)
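            # Normalizing the advantages (zero mean, unit variance) acts as a
            # simple baseline and keeps gradient magnitudes comparable between
            # batches; the 1e-10 term guards against division by zero.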
            agent.train_step(b_obs, b_acts, b_rews_n)
            agent.saver.save(sess, filepath + modelfile, latest_filename=ckptfile)

    if MONITOR:
        env.monitor.close()

if __name__ == "__main__":
    main()