@warchildmd
Created February 16, 2017 17:04

import gym
import random
import time
import numpy as np
import tensorflow as tf
from tensorflow.core.framework import summary_pb2
# Summary
run_params = 'truncated_normal_8x92x64x4_012_'
writer = tf.summary.FileWriter('logs/run_' + run_params + str(time.time()))
# Action choosing policy
epsilon = .54
epsilon_decay = .984
# Learning parameters: despite the name, lr is used as the discount factor (gamma)
# in the Bellman update below; the Adam learning rate (0.012) is set in build_graph()
lr = .99
num_episodes = 1000
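
# The network built below maps LunarLander-v2's 8-dimensional observation through the
# hidden layers passed to build_graph() (92 and 64 units here, matching run_params)
# to 4 outputs, one estimated return per discrete action.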
def build_graph(input_size=8,
                output_size=4,
                hidden_layers=None,
                learning_rate=0.012):
    x = tf.placeholder(tf.float32, [None, input_size], name='input_placeholder')
    y = tf.placeholder(tf.float32, [None, output_size], name='output_placeholder')

    if hidden_layers is None:
        hidden_layers = []
    sizes = [input_size] + hidden_layers

    hidden = x
    layer_idx = 0
    for input, output in zip(sizes[:-1], sizes[1:]):
        layer_idx += 1
        with tf.variable_scope('hidden' + str(layer_idx)):
            W = tf.get_variable('W', [input, output], initializer=tf.truncated_normal_initializer())
            b = tf.get_variable('b', [output], initializer=tf.constant_initializer(0.0))
            hidden = tf.matmul(hidden, W) + b
            hidden = tf.nn.relu(hidden)
            tf.summary.histogram("weights", W)
            tf.summary.histogram("biases", b)
            tf.summary.histogram("activations", hidden)

    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [sizes[-1], output_size], initializer=tf.truncated_normal_initializer())
        b = tf.get_variable('b', [output_size], initializer=tf.constant_initializer(0.0))
        logits = tf.matmul(hidden, W) + b
        tf.summary.histogram("weights", W)
        tf.summary.histogram("biases", b)

    predictions = tf.nn.softmax(logits)

    with tf.name_scope('loss'):
        total_loss = tf.reduce_mean(tf.square(y - logits))

    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

    tf.summary.scalar("total_loss", total_loss)
    summary = tf.summary.merge_all()

    return dict(
        x=x,
        y=y,
        total_loss=total_loss,
        train_step=train_step,
        rewards=logits,
        preds=predictions,
        saver=tf.train.Saver(),
        summary=summary
    )
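
# Note on build_graph(): the raw logits are returned under the key 'rewards' and are used
# below as per-action Q-value estimates, while their softmax ('preds') is only used to
# sample actions from the current policy.
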
def multinomial(chances):
    # Sample an index in proportion to the given list of probabilities.
    x = random.random()
    for idx, y in enumerate(chances):
        if x <= y:
            # print(chances, idx)
            return idx
        else:
            x -= y
    # Fallback for floating-point rounding when the probabilities sum to just under 1
    return len(chances) - 1
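
# For example, multinomial([0.1, 0.2, 0.7]) returns index 2 about 70% of the time:
# x is reduced by each probability until it falls within one action's bucket.
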
env = gym.make('LunarLander-v2')
# Replay Memory
replay_memory = []

with tf.Session() as session:
    g = build_graph(hidden_layers=[92, 64])
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    for i in range(num_episodes):
        epsilon *= epsilon_decay
        # Reset environment and get first new observation
        state = env.reset()
        done = False
        total_score = 0

        # Play
        step = 0
        while step < 999 and not done:
            step += 1
            old_state = state
            # Select an action a by sampling from the network's softmax output
            predictions = session.run([g['preds']], feed_dict={g['x']: [state]})
            action = multinomial(predictions[0][0])
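            # Exploration: with probability epsilon the sampled action is replaced by a
            # uniformly random one; epsilon decays by epsilon_decay every episode.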
            if random.random() < epsilon:
                action = random.randint(0, 3)
            # Carry out action a and observe the reward and new state
            state, reward, done, _ = env.step(action)
            env.render()
            total_score += reward
            if done:
                state = None
            # Store the transition in replay memory
            replay_memory.append((old_state, action, reward, state))

        # Learn
        step = 0
        relevant_memory = random.sample(replay_memory, min(len(replay_memory), 100))
        batch_inputs = []
        batch_outputs = []
        for memory in relevant_memory:
            old_state = memory[0]
            action = memory[1]
            reward = memory[2]
            state = memory[3]
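            # Q-learning target: Q(s, a) <- r + gamma * max_a' Q(s', a'), where the discount
            # gamma is the variable named lr above; terminal transitions (state is None)
            # keep the raw reward as the target.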
            outputs = session.run([g['rewards']], feed_dict={g['x']: [old_state]})[0][0]
            observed_reward = reward
            if state is not None:
                future_rewards = session.run([g['rewards']], feed_dict={g['x']: [state]})
                expected_reward = np.amax(future_rewards[0][0])
                observed_reward = reward + lr * expected_reward
            outputs[action] = observed_reward
            batch_inputs.append(old_state)
            batch_outputs.append(outputs)
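
        # Crude cap on replay-memory growth: the oldest 10% of stored transitions are
        # discarded after every episode.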
        replay_memory = replay_memory[int(len(replay_memory) / 10):]

        train_loss, summary, _ = session.run([g['total_loss'], g['summary'], g['train_step']],
                                             feed_dict={g['x']: batch_inputs, g['y']: batch_outputs})
        writer.add_summary(summary, i)
        value = summary_pb2.Summary.Value(tag="Score", simple_value=total_score)
        score_summary = summary_pb2.Summary(value=[value])
        writer.add_summary(score_summary, i)
        print('Score / loss at episode %d: %f / %f' % (i, total_score, train_loss))
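
# The logged summaries (loss, weight histograms, and per-episode score) can be viewed
# with TensorBoard, e.g. `tensorboard --logdir logs`.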