@stmobo
Created December 15, 2017 18:16
import math
import numpy as np
import tensorflow as tf
import keras
import keras.backend as K
from keras.models import Sequential
from keras.initializers import lecun_normal
from keras.layers import Dense, AlphaDropout
import gym
env = gym.make('CartPole-v1')
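# CartPole-v1: 4-dimensional observation, 2 discrete actions;
# gym considers it solved at an average reward of 475 over the
# last 100 episodes (the threshold checked below).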
# self-normalizing neural net:
# dense layers use SELU activation function and LeCun-normal initialization
# also use AlphaDropout at rate=0.1
output_dim = 2
K.set_learning_phase(True)
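# Fixing the learning phase to 'training' keeps AlphaDropout active even
# inside the model.predict() calls made during rollouts (presumably to add
# a little noise to the sampled action probabilities).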
model = Sequential([
    Dense(128, input_shape=env.observation_space.shape,
          activation='selu', kernel_initializer=lecun_normal()),
    AlphaDropout(0.1),
    Dense(128, activation='selu', kernel_initializer=lecun_normal()),
    AlphaDropout(0.1),
    Dense(128, activation='selu', kernel_initializer=lecun_normal()),
    AlphaDropout(0.1),
    Dense(output_dim, activation='softmax'),
])
# Loss function needs to calculate gradient of
# log(action_prob) * advantage
action_onehot = K.placeholder(
    shape=(None, output_dim),
    name='action_onehot'
)
discounted_reward = K.placeholder(
    shape=(None,),
    name='discounted_reward'
)
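# REINFORCE: the gradient of the expected return with respect to the policy
# parameters is E[grad(log pi(a|s)) * G], where G is the (discounted) return.
# Keras optimizers minimize their loss, so minimizing
# -mean(log(action_prob) * discounted_reward) performs stochastic gradient
# ascent on the expected return.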
action_prob = K.sum(model.output * action_onehot, axis=1)
log_action_prob = K.log(action_prob)
loss = -K.mean(log_action_prob * discounted_reward)
optimizer = keras.optimizers.Adam()
updates = optimizer.get_updates(
    params=model.trainable_weights,
    loss=loss
)
train_fn = K.function(
    inputs=[model.input, action_onehot, discounted_reward],
    outputs=[],
    updates=updates
)
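# train_fn([states, one-hot actions, normalized returns]) applies a single
# Adam update to the policy weights over a full episode's batch of samples.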
# begin training:
i_episode = 0
ep_rewards = []

while True:
    i_episode += 1

    # run episode:
    r_discounted = 0
    r_raw = 0
    obs = env.reset()
    t = 0

    states = []
    actions = []
    rewards = []

    while True:
        if i_episode % 50 == 0:
            env.render()

        # get action: sample from the policy's softmax output
        p_action = np.squeeze(model.predict(np.expand_dims(obs, axis=0)))
        action = np.random.choice(2, p=p_action)

        a_onehot = np.zeros(2)
        a_onehot[action] = 1

        states.append(obs)
        actions.append(a_onehot)

        obs, r, done, info = env.step(action)

        t += 1
        r_discounted += (r * (0.999 ** t))
        r_raw += r
        # store the running discounted return accumulated up to step t
        rewards.append(r_discounted)

        if done:
            break
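    # Normalizing the stored returns to zero mean and unit variance acts as a
    # simple baseline: steps with above-average return get a positive advantage
    # and below-average steps a negative one, reducing gradient variance.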
    rewards = np.array(rewards)
    rewards -= np.mean(rewards)
    rewards /= np.std(rewards)

    ep_rewards.append(r_raw)
    if len(ep_rewards) > 100:
        ep_rewards = ep_rewards[-100:]

    print("i={} t={:.3f} R={:.3f} Ravg={:.3f}".format(
        i_episode, t, r_discounted, np.mean(ep_rewards)
    ))

    if np.mean(ep_rewards) >= 475.0:
        print("== ENVIRONMENT SOLVED ==")
        break

    # train:
    train_fn([
        np.array(states),
        np.array(actions),
        rewards
    ])