@JakubMifek
Created May 22, 2019 11:09

#!/usr/bin/env python3
#
# All team solutions **must** list **all** members of the team.
# The members must be listed using their ReCodEx IDs anywhere
# in a comment block in the source file (on a line beginning with `#`).
#
# You can find out ReCodEx ID in the URL bar after navigating
# to your User profile page. The ID has the following format:
# 310a5c89-3ea1-11e9-b0fd-00505601122b
# 90257956-3ea2-11e9-b0fd-00505601122b
# 69bef76d-1ebb-11e8-9de3-00505601122b
import numpy as np
import time
import tensorflow as tf
import cart_pole_pixels_evaluator


class Network:
    def __init__(self, env, args):
        # TODO: Define suitable model, similarly to `reinforce` or `reinforce_with_baseline`.
        #
        # Use Adam optimizer with given `args.learning_rate`.
        #
        # Policy network: downsample the 80x80x3 frame with pooling and a strided
        # convolution, then a dense layer feeding a softmax head over the actions.
        input = tf.keras.layers.Input(shape=(80, 80, 3))
        conv = tf.keras.layers.MaxPool2D(4, 2)(input)
        conv = tf.keras.layers.Conv2D(16, 3, 2, padding='same')(conv)
        conv = tf.keras.layers.MaxPool2D(4, 2)(conv)
        conv = tf.keras.layers.Dropout(0.7)(conv)
        flatten = tf.keras.layers.Flatten()(conv)
        dense1 = tf.keras.layers.Dense(64, activation='relu')(flatten)
        output1 = tf.keras.layers.Dense(env.actions, activation='softmax')(dense1)

        self.model = tf.keras.Model(inputs=[input], outputs=[output1])
        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
        )

    def train(self, states, actions, returns):
        states, actions, returns = np.array(states), np.array(actions), np.array(returns)
        # TODO: Train the model using the states, actions and observed returns.
        # - train the policy model, using `returns - predicted_baseline` as weights
        #   in the sparse crossentropy loss
        # - train the `baseline` model to predict `returns`
        #
        # No baseline is used here: the raw returns weight the crossentropy loss
        # directly (see the illustrative baseline sketch after this class).
        self.model.train_on_batch(states, actions, sample_weight=returns)

    def predict(self, states):
        states = np.array(states)
        # TODO: Predict distribution over actions for the given input states. Return
        # only the probabilities (if using a baseline).
        return self.model.predict_on_batch(states)
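

# A minimal sketch of the baseline variant that the TODO comments above refer to,
# assuming the same `env`/`args` interface. It is illustrative only and is NOT used
# by the script below; the class name `NetworkWithBaseline` is made up for this sketch.
# A second value model is trained to predict the observed returns (MSE), and
# `returns - predicted_baseline` replaces the raw returns as sample weights.
class NetworkWithBaseline(Network):
    def __init__(self, env, args):
        super().__init__(env, args)
        # Baseline (state-value) model over the same 80x80x3 pixel input.
        inputs = tf.keras.layers.Input(shape=(80, 80, 3))
        hidden = tf.keras.layers.MaxPool2D(4, 2)(inputs)
        hidden = tf.keras.layers.Flatten()(hidden)
        hidden = tf.keras.layers.Dense(64, activation='relu')(hidden)
        value = tf.keras.layers.Dense(1)(hidden)
        self.baseline = tf.keras.Model(inputs=[inputs], outputs=[value])
        self.baseline.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate),
            loss=tf.keras.losses.MeanSquaredError(),
        )

    def train(self, states, actions, returns):
        states, actions, returns = np.array(states), np.array(actions), np.array(returns)
        predicted_baseline = np.asarray(self.baseline.predict_on_batch(states)).ravel()
        # Policy gradient step weighted by the advantage estimate.
        self.model.train_on_batch(states, actions, sample_weight=returns - predicted_baseline)
        # Fit the baseline towards the observed returns.
        self.baseline.train_on_batch(states, returns)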


if __name__ == "__main__":
    # Parse arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=3, type=int, help="Number of episodes to train on.")
    parser.add_argument("--episodes", default=2000, type=int, help="Training episodes.")
    parser.add_argument("--learning_rate", default=0.002, type=float, help="Learning rate.")
    parser.add_argument("--render_each", default=0, type=int, help="Render some episodes.")
    parser.add_argument("--threads", default=4, type=int, help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Fix random seed
    seed = 12345
    np.random.seed(seed)
    tf.random.set_seed(seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create the environment
    env = cart_pole_pixels_evaluator.environment()

    # Construct the network
    network = Network(env, args)
    print(time.ctime(time.time()))

    # Training
    for _ in range(args.episodes // args.batch_size):
        # Stop once the mean return over the last 100 episodes reaches 250.
        if len(env._episode_returns) > 0 and np.mean(env._episode_returns[-100:]) >= 250:
            break

        batch_states, batch_actions, batch_returns = [], [], []
        for _ in range(args.batch_size):
            # Perform episode
            states, actions, rewards = [], [], []
            state, done = env.reset(), False
            while not done:
                if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                    env.render()

                probabilities = network.predict([state])[0]
                # TODO(reinforce): Compute `action` according to the distribution returned by the network.
                # The `np.random.choice` method comes handy.
                action = np.random.choice(env.actions, p=probabilities)
                next_state, reward, done, _ = env.step(action)

                states.append(state)
                actions.append(action)
                rewards.append(reward)
                state = next_state

            # TODO(reinforce): Compute `returns` from the observed `rewards`.
            # Undiscounted returns: the cumulative sum of rewards from each step
            # to the end of the episode (e.g. rewards [1, 1, 1] give returns [3, 2, 1]).
            returns = [0]
            for reward in reversed(rewards):
                returns.append(reward + returns[-1])
            returns = list(reversed(returns))
            returns.pop()

            batch_states += states
            batch_actions += actions
            batch_returns += returns

        network.train(batch_states, batch_actions, batch_returns)
    print(time.ctime(time.time()))

    # Final evaluation
    while True:
        print(time.ctime(time.time()))
        state, done = env.reset(True), False
        while not done:
            # Act greedily with respect to the predicted action distribution.
            probabilities = network.predict([state])[0]
            action = np.argmax(probabilities)
            state, reward, done, _ = env.step(action)
        print(time.ctime(time.time()))
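
# Example invocation (file name is hypothetical; assumes the course-provided
# `cart_pole_pixels_evaluator` module is importable from the working directory):
#
#   python3 cart_pole_pixels.py --episodes 2000 --batch_size 3 --learning_rate 0.002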