# REINFORCE policy gradient on CartPole-v1, using a self-normalizing
# network (Keras with the TensorFlow backend).
import math
import numpy as np
import tensorflow as tf
import keras
import keras.backend as K
from keras.models import Sequential
from keras.initializers import lecun_normal
from keras.layers import Dense, AlphaDropout

import gym

env = gym.make('CartPole-v1')
# Self-normalizing neural net:
# dense layers use the SELU activation function and LeCun-normal
# initialization, plus AlphaDropout at rate=0.1.
output_dim = 2  # number of discrete actions in CartPole

# Keep dropout active during both rollouts and training updates:
K.set_learning_phase(True)
model = Sequential([
    Dense(128, input_shape=env.observation_space.shape,
          activation='selu', kernel_initializer=lecun_normal()),
    AlphaDropout(0.1),
    Dense(128, activation='selu', kernel_initializer=lecun_normal()),
    AlphaDropout(0.1),
    Dense(128, activation='selu', kernel_initializer=lecun_normal()),
    AlphaDropout(0.1),
    Dense(output_dim, activation='softmax'),
])
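
# Optional sanity check (not in the original gist): a single forward pass on a
# zeroed observation should yield a valid probability distribution over the
# two actions, since the output layer is a softmax.
_probs = model.predict(np.zeros((1,) + env.observation_space.shape))
assert _probs.shape == (1, output_dim)
assert abs(_probs.sum() - 1.0) < 1e-3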
# Loss function: the policy gradient needs the gradient of
# log(action_prob) * advantage. We minimize the *negative* of that quantity,
# so that Adam's descent step is an ascent step on expected return.
action_onehot = K.placeholder(
    shape=(None, output_dim),
    name='action_onehot'
)
discounted_reward = K.placeholder(
    shape=(None,),
    name='discounted_reward'
)
# select the probability the policy assigned to the action actually taken:
action_prob = K.sum(model.output * action_onehot, axis=1)
log_action_prob = K.log(action_prob)
loss = -K.mean(log_action_prob * discounted_reward)
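
# (Note) This is the score-function (REINFORCE) estimator: for a trajectory,
#   grad J(theta) ~ mean_t [ grad log pi_theta(a_t | s_t) * G_t ]
# where G_t is the return fed in through the discounted_reward placeholder.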
optimizer = keras.optimizers.Adam()
updates = optimizer.get_updates(
    params=model.trainable_weights,
    loss=loss
)
train_fn = K.function(
    inputs=[model.input, action_onehot, discounted_reward],
    outputs=[],
    updates=updates
)
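
# Usage sketch (shapes are illustrative, not part of the original gist): for a
# batch of N timesteps, a single call performs one gradient step:
#   train_fn([states_batch,    # float array, shape (N,) + observation_space.shape
#             onehot_batch,    # float array, shape (N, output_dim)
#             returns_batch])  # float array, shape (N,)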
# begin training:
i_episode = 0
ep_rewards = []

while True:
    i_episode += 1

    # run one episode:
    r_discounted = 0
    r_raw = 0
    obs = env.reset()
    t = 0

    states = []
    actions = []
    rewards = []

    while True:
        if i_episode % 50 == 0:
            env.render()

        # get action: sample from the policy's output distribution
        p_action = np.squeeze(model.predict(np.expand_dims(obs, axis=0)))
        action = np.random.choice(output_dim, p=p_action)

        a_onehot = np.zeros(output_dim)
        a_onehot[action] = 1

        states.append(obs)
        actions.append(a_onehot)

        obs, r, done, info = env.step(action)

        t += 1
        r_discounted += r * (0.999 ** t)
        r_raw += r
        # each timestep is credited with the running discounted return so far:
        rewards.append(r_discounted)

        if done:
            break
    # standardize the per-timestep returns (zero mean, unit variance):
    rewards = np.array(rewards)
    rewards -= np.mean(rewards)
    rewards /= (np.std(rewards) + 1e-8)  # epsilon guards against division by zero
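
    # Standardizing the returns acts as a crude baseline: subtracting the mean
    # reduces the variance of the gradient estimate, and dividing by the
    # standard deviation keeps the step size roughly consistent across episodes.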
    # track raw reward over the last 100 episodes:
    ep_rewards.append(r_raw)
    if len(ep_rewards) > 100:
        ep_rewards = ep_rewards[-100:]

    print("i={} t={} R={:.3f} Ravg={:.3f}".format(
        i_episode, t, r_discounted, np.mean(ep_rewards)
    ))

    # CartPole-v1 is considered solved at an average reward of 475 over 100 episodes:
    if np.mean(ep_rewards) >= 475.0:
        print("== ENVIRONMENT SOLVED ==")
        break

    # train on the completed episode:
    train_fn([
        np.array(states),
        np.array(actions),
        rewards
    ])