#!/usr/bin/env python3
#
# All team solutions **must** list **all** members of the team.
# The members must be listed using their ReCodEx IDs anywhere
# in a comment block in the source file (on a line beginning with `#`).
#
# You can find your ReCodEx ID in the URL bar after navigating
# to your User profile page. The ID has the following format:
# 310a5c89-3ea1-11e9-b0fd-00505601122b
# 90257956-3ea2-11e9-b0fd-00505601122b
# 69bef76d-1ebb-11e8-9de3-00505601122b
import time

import numpy as np
import tensorflow as tf

import cart_pole_pixels_evaluator  # course-supplied evaluator for the pixel CartPole environment
class Network:
    def __init__(self, env, args):
        # TODO: Define suitable model, similarly to `reinforce` or `reinforce_with_baseline`.
        #
        # Use Adam optimizer with given `args.learning_rate`.
        inputs = tf.keras.layers.Input(shape=(80, 80, 3))                # 80x80 RGB frames
        conv = tf.keras.layers.MaxPool2D(4, 2)(inputs)                   # -> 39x39x3
        conv = tf.keras.layers.Conv2D(16, 3, 2, padding='same')(conv)    # -> 20x20x16
        conv = tf.keras.layers.MaxPool2D(4, 2)(conv)                     # -> 9x9x16
        conv = tf.keras.layers.Dropout(0.7)(conv)
        flatten = tf.keras.layers.Flatten()(conv)
        dense1 = tf.keras.layers.Dense(64, activation='relu')(flatten)
        output1 = tf.keras.layers.Dense(env.actions, activation='softmax')(dense1)

        self.model = tf.keras.Model(inputs=[inputs], outputs=[output1])
        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
        )
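        # A minimal sketch of the baseline head that `reinforce_with_baseline`
        # refers to; hypothetical and unused in this submission. It would share
        # the trunk via `dense1` and be trained with MSE to predict the returns:
        #
        #     baseline = tf.keras.layers.Dense(1)(dense1)
        #     self.baseline_model = tf.keras.Model(inputs=[inputs], outputs=[baseline])
        #     self.baseline_model.compile(
        #         optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate),
        #         loss=tf.keras.losses.MeanSquaredError())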
    def train(self, states, actions, returns):
        states, actions, returns = np.array(states), np.array(actions), np.array(returns)
        # TODO: Train the model using the states, actions and observed returns.
        # - train the policy model, using `returns - predicted_baseline` as weights
        #   in the sparse crossentropy loss
        # - train the `baseline` model to predict `returns`
        #
        # No baseline is used here, so the raw returns weight the loss directly.
        self.model.train_on_batch(states, actions, sample_weight=returns)
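        # Weighting each step's sparse crossentropy by its return G_t turns the
        # minimized loss into -G_t * log pi(a_t | s_t) summed over the batch,
        # i.e. the standard REINFORCE policy-gradient objective.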
    def predict(self, states):
        states = np.array(states)
        # TODO: Predict distribution over actions for the given input states. Return
        # only the probabilities (if using a baseline).
        return self.model.predict_on_batch(states)
if __name__ == "__main__":
    # Parse arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=3, type=int, help="Number of episodes to train on.")
    parser.add_argument("--episodes", default=2000, type=int, help="Training episodes.")
    parser.add_argument("--learning_rate", default=0.002, type=float, help="Learning rate.")
    parser.add_argument("--render_each", default=0, type=int, help="Render some episodes.")
    parser.add_argument("--threads", default=4, type=int, help="Maximum number of threads to use.")
    args = parser.parse_args()
    # Fix random seed
    seed = 12345
    np.random.seed(seed)
    tf.random.set_seed(seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)
    # Create the environment
    env = cart_pole_pixels_evaluator.environment()

    # Construct the network
    network = Network(env, args)

    print(time.ctime(time.time()))
    # Training
    for _ in range(args.episodes // args.batch_size):
        # Stop early once the mean return over the last 100 episodes reaches 250.
        if len(env._episode_returns) > 0 and np.mean(env._episode_returns[-100:]) >= 250:
            break

        batch_states, batch_actions, batch_returns = [], [], []
        for _ in range(args.batch_size):
            # Perform episode
            states, actions, rewards = [], [], []
            state, done = env.reset(), False
            while not done:
                if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                    env.render()

                probabilities = network.predict([state])[0]
                # TODO(reinforce): Compute `action` according to the distribution returned by the network.
                # The `np.random.choice` method comes handy.
                action = np.random.choice(env.actions, p=probabilities)
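                # `np.random.choice(n, p=probs)` draws an index from the categorical
                # distribution `probs`; e.g. with probabilities [0.3, 0.7] it picks
                # action 1 roughly 70% of the time, keeping exploration stochastic.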
                next_state, reward, done, _ = env.step(action)

                states.append(state)
                actions.append(action)
                rewards.append(reward)
                state = next_state

            # TODO(reinforce): Compute `returns` from the observed `rewards`.
            # Undiscounted returns (gamma = 1): suffix sums of the rewards.
            returns = [0]
            for reward in reversed(rewards):
                returns.append(reward + returns[-1])
            returns = list(reversed(returns))
            returns.pop()  # drop the trailing 0 seeded above
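            # Equivalent vectorized form of the suffix sums (assuming gamma = 1):
            #     returns = list(np.cumsum(rewards[::-1])[::-1])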
            batch_states += states
            batch_actions += actions
            batch_returns += returns

        network.train(batch_states, batch_actions, batch_returns)

    print(time.ctime(time.time()))
    # Final evaluation
    while True:
        print(time.ctime(time.time()))
        state, done = env.reset(True), False  # True switches the evaluator to evaluation mode
        while not done:
            probabilities = network.predict([state])[0]
            # Act greedily during evaluation instead of sampling.
            action = np.argmax(probabilities)
            state, reward, done, _ = env.step(action)
        print(time.ctime(time.time()))