# This is a higher-level Keras implementation of mountain_car_v2_tf_modernized.
#
# Algorithm: TD Advantage Actor-Critic.
#
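# Per-step quantities (standard TD actor-critic, for reference):
#   TD target:   target    = r + gamma * V(s')
#   Advantage:   advantage = target - V(s)
#   Critic loss: MSE(V(s), target)
#   Actor loss:  -log pi(a | s) * advantage
#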
from sklearn import preprocessing
from tensorflow import keras
from tensorflow.keras import layers
import gym # requires OpenAI gym installed
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import time
tf.keras.backend.clear_session() # This would be mainly useful in notebooks.

def value_model(inputs: keras.Input) -> keras.Model:
    """Critic: estimates the state value V(s)."""
    n_hidden1 = 400
    n_hidden2 = 400
    n_outputs = 1
    init_xavier = tf.initializers.glorot_uniform()

    hidden1 = layers.Dense(n_hidden1, activation='elu', kernel_initializer=init_xavier, name='hidden1')(inputs)
    hidden2 = layers.Dense(n_hidden2, activation='elu', kernel_initializer=init_xavier, name='hidden2')(hidden1)
    value_output = layers.Dense(n_outputs, activation=None, kernel_initializer=init_xavier, name='value_output')(hidden2)

    model = keras.Model(name='value_model', inputs=inputs, outputs=value_output)
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.summary()
    return model


def policy_model(inputs: keras.Input, env: gym.Env) -> keras.Model:
    """Actor: samples an action from a Normal(mu, sigma) policy."""
    n_hidden1 = 40
    n_hidden2 = 40
    n_outputs = 1
    init_xavier = tf.initializers.glorot_uniform()

    hidden1 = layers.Dense(n_hidden1, activation='elu', kernel_initializer=init_xavier)(inputs)
    hidden2 = layers.Dense(n_hidden2, activation='elu', kernel_initializer=init_xavier)(hidden1)
    mu = layers.Dense(n_outputs, activation=None, kernel_initializer=init_xavier)(hidden2)
    sigma = layers.Dense(n_outputs, activation=None, kernel_initializer=init_xavier)(hidden2)
    sigma = tf.nn.softplus(sigma) + 1e-5  # Keep the standard deviation strictly positive.
    # Use https://www.tensorflow.org/probability/api_docs/python/tfp/layers/IndependentNormal
    norm_dist = tfp.distributions.Normal(mu, sigma)
    action_tf_var = tf.squeeze(norm_dist.sample(1), axis=0)
    action_tf_var = tf.clip_by_value(
        action_tf_var, env.action_space.low[0], env.action_space.high[0])

    def _loss(y_true, y_pred):
        # The original left this unimplemented ("Not sure what's here...").
        # For TD Advantage Actor-Critic the actor loss is typically
        # -log pi(a | s) * advantage; one way to compute it is shown in the
        # GradientTape sketch below.
        pass

    model = keras.Model(name='policy_model', inputs=inputs, outputs=[action_tf_var])
    model.summary()
    return model

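
# A minimal sketch (not part of the original gist) of how the actor could be
# updated with a custom gradient step. It assumes a variant of policy_model
# whose outputs are [mu, sigma] (with sigma already made positive); the names
# `actor_optimizer` and `actor_update` and the learning rate are assumptions.
actor_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)


def actor_update(actor_mu_sigma: keras.Model, scaled_state, action, advantage):
    with tf.GradientTape() as tape:
        mu, sigma = actor_mu_sigma(scaled_state, training=True)
        dist = tfp.distributions.Normal(mu, sigma)
        # Policy gradient: increase the log-probability of actions with
        # positive advantage, decrease it otherwise.
        loss = -tf.reduce_mean(dist.log_prob(action) * advantage)
    grads = tape.gradient(loss, actor_mu_sigma.trainable_variables)
    actor_optimizer.apply_gradients(zip(grads, actor_mu_sigma.trainable_variables))
    return loss
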
gamma = 0.99  # Discount factor for the TD target.


def play_episode(env: gym.Env, scaler: callable, actor: keras.Model, critic: keras.Model):
    state = env.reset()
    done = False
    for t in range(0, 20):
        action = actor.predict(scaler(state))
        next_state, reward, done, _ = env.step(np.squeeze(action, axis=0))
        env.render()
        print(state, next_state, reward, done)

        value_of_next_state = critic.predict(scaler(next_state))
        value_of_current_state = critic.predict(scaler(state))

        # TD(0) target and advantage estimate; currently computed but not yet
        # used to update the networks (see the sketches above and below).
        target = reward + gamma * np.squeeze(value_of_next_state)
        advantage = target - np.squeeze(value_of_current_state)

        state = next_state
        if done:
            break
    time.sleep(3)

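
# A minimal sketch (not part of the original gist) of how the critic could be
# pulled toward the TD target computed inside play_episode. train_on_batch is
# the standard Keras API; the helper name `critic_update` and where it would
# be called from are assumptions.
def critic_update(critic: keras.Model, scaled_state, target):
    # One gradient step of the compiled mean-squared-error loss, moving
    # V(state) toward the bootstrapped target r + gamma * V(next_state).
    return critic.train_on_batch(scaled_state, np.array([[target]]))
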

def main():
    env = gym.envs.make("MountainCarContinuous-v0")

    state_space_samples = np.array(
        [env.observation_space.sample() for _ in range(10000)])
    scaler = preprocessing.StandardScaler()
    scaler.fit(state_space_samples)

    # Function to normalize states.
    def scale_state(state):  # requires input shape=(2,)
        scaled = scaler.transform([state])
        return scaled  # returns shape=(1, 2)

    inputs = keras.Input(shape=env.observation_space.shape)
    critic = value_model(inputs)
    actor = policy_model(inputs, env)
    play_episode(env, scale_state, actor, critic)


if __name__ == '__main__':
    main()