@marekgalovic
Last active October 26, 2017 23:44
RL: a REINFORCE-style policy gradient agent for CartPole-v0 (TensorFlow) and an epsilon-greedy linear Q-learning agent for FrozenLake-v0 (NumPy).
import tensorflow as tf
import numpy as np
import gym
import random
env = gym.make('CartPole-v0')
print('State space:', env.observation_space)
print('Action space:', env.action_space)
N_EPISODES = 2000
STATE_SIZE = len(env.observation_space.low)
ACTION_SIZE = env.action_space.n
UPDATE_FREQUENCY = 3
BATCH_SIZE = 128
LEARNING_RATE = 0.15
GAMMA = 0.98
def discounted_rewards(rewards):
    result = np.zeros(len(rewards))
    running_sum = 0
    for t in reversed(range(len(rewards))):
        running_sum = GAMMA*running_sum + rewards[t]
        result[t] = running_sum
    return result
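# Sanity check (a minimal sketch): with GAMMA = 0.98, an episode with rewards
# [1, 1, 1] gives discounted returns-to-go [2.9404, 1.98, 1.0], i.e.
# result[t] = sum_k GAMMA**k * rewards[t + k].
# assert np.allclose(discounted_rewards([1., 1., 1.]), [2.9404, 1.98, 1.0])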
with tf.Graph().as_default():
    # Placeholders
    state_ph = tf.placeholder(tf.float32, [None, STATE_SIZE])
    # Weights
    W = tf.Variable(
        tf.random_uniform([STATE_SIZE, ACTION_SIZE], minval=.5, maxval=1.5),
        trainable=True
    )
    # Predict
    action_p = tf.nn.softmax(tf.matmul(state_ph, W))

    reward_ph = tf.placeholder(tf.float32, [None])
    selected_action_ph = tf.placeholder(tf.int32, [None])

    # Loss
    action_indices = tf.concat([
        tf.expand_dims(tf.range(tf.shape(action_p)[0]), -1),
        tf.expand_dims(selected_action_ph, -1)
    ], 1)
    responsible_actions = tf.gather_nd(action_p, action_indices)
    loss = -tf.reduce_mean(tf.log(responsible_actions) * reward_ph)

    # Update
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    train_op = optimizer.minimize(loss)
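    # Note: this is the REINFORCE estimator. `responsible_actions` gathers
    # pi(a_t | s_t; W) for each sampled action, so
    #     loss = -mean_t( log pi(a_t | s_t; W) * G_t )
    # where G_t is the discounted return fed in through `reward_ph`. Minimizing it
    # increases the log-probability of actions in proportion to the return that
    # followed them. A common refinement, not used here, is to subtract a baseline
    # from G_t to reduce variance.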
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        total_rewards = []
        samples_buffer = []
        try:
            for e in range(N_EPISODES):
                state = env.reset()
                e_states, e_actions, e_rewards = [], [], []
                while True:
                    # env.render()
                    action_p_val = sess.run(action_p, feed_dict={state_ph: state.reshape(1, -1)})
                    action = np.random.choice(list(range(len(action_p_val[0]))), p=action_p_val[0])
                    new_state, reward, done, _ = env.step(action)
                    e_states.append(state)
                    e_actions.append(action)
                    e_rewards.append(reward)
                    state = new_state
                    if done:
                        break
                running_rewards = discounted_rewards(e_rewards)
                samples_buffer.extend(zip(e_states, e_actions, running_rewards))
                # print('Episode:', e, 'Total reward:', sum(e_rewards))
                total_rewards.append(sum(e_rewards))
                if e % UPDATE_FREQUENCY == 0:
                    random.shuffle(samples_buffer)
                    states, actions, rewards = zip(*samples_buffer[:BATCH_SIZE])
                    samples_buffer = []
                    _, _loss = sess.run([train_op, loss], feed_dict={
                        state_ph: np.vstack(states),
                        selected_action_ph: np.array(actions),
                        reward_ph: np.array(rewards)
                    })
                if e % 100 == 0:
                    print('Reward (%d):' % (e), np.mean(total_rewards[-100:]))
        except KeyboardInterrupt:
            print('Training interrupted.')

        # Render one greedy rollout with the learned policy
        env._max_episode_steps = 10000
        state = env.reset()
        while True:
            env.render()
            action_p_val = sess.run(action_p, feed_dict={state_ph: state.reshape(1, -1)})
            new_state, _, done, _ = env.step(np.argmax(action_p_val[0]))
            state = new_state
            if done:
                break

# Q-network implementation using a linear function approximator.
# Estimates action-values Q(a|s,W) for a given state from a weight matrix W.
import gym
import numpy as np
import matplotlib.pyplot as plt
import time
env = gym.make('FrozenLake-v0')
print('Observation space:', env.observation_space)
print('Action space:', env.action_space)
N_STATES, N_ACTIONS = env.observation_space.n, env.action_space.n
# Hyper-params
LEARNING_RATE = 0.15
GAMMA = 0.99
N_EPISODES = 2000
EPS_DECAY = 0.02
W = np.random.uniform(low=.0, high=.1, size=(N_STATES, N_ACTIONS))
def one_hot(idx, size):
    oh = np.zeros(size)
    oh[idx] = 1
    return oh
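# Example: one_hot(2, 4) -> array([0., 0., 1., 0.]). Multiplying this one-hot
# row vector with W (in predict below) simply selects the row of W holding the
# action-value estimates for the given discrete state.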
def predict(state):
    return np.matmul(one_hot(state, N_STATES).reshape(1, N_STATES), W)

def loss(target, predicted):
    return np.sum(np.power(target - predicted, 2))

def gradient(state, predicted, target):
    return LEARNING_RATE * np.matmul(one_hot(state, N_STATES).reshape(N_STATES, 1), 2*(target - predicted))
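# Note on the update direction: with squared error L = sum_a (target_a - pred_a)^2
# and pred = one_hot(s)^T W, the gradient is dL/dW = -2 * outer(one_hot(s), target - pred).
# Adding LEARNING_RATE * outer(one_hot(s), 2*(target - pred)) to W (as done in the
# training loop) is therefore ordinary gradient descent on L, touching only the
# row of W for the visited state.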
def e_greedy(action_probas, eps):
    if np.random.rand(1) < eps:
        return env.action_space.sample()
    return np.argmax(action_probas)
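# Exploration schedule used below: eps = 1 / (1 + e*EPS_DECAY) starts at 1.0
# (fully random), falls to 0.5 after 50 episodes and to about 0.33 after 100,
# gradually shifting from exploration to exploiting argmax Q.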
rewards, epsilons, losses = [], [], []
for e in range(N_EPISODES):
    eps = 1. / (1 + e*EPS_DECAY)
    epsilons.append(eps)
    state = env.reset()
    i, total_reward, mean_loss = 0, 0, 0
    while True:
        i += 1
        action_p = predict(state)
        action = e_greedy(action_p[0], eps)
        new_state, reward, done, _ = env.step(action)
        # Target vector: Bellman target r + GAMMA * max_a' Q(new_state, a') for the
        # taken action; other actions keep their current estimates. (Note that this
        # also bootstraps from terminal states; a common refinement is to use just
        # `reward` when done is True.)
        target = action_p.copy()
        target[0, action] = reward + GAMMA*np.max(predict(new_state)[0])
        # Update weights
        l = loss(target, action_p)
        W += gradient(state, action_p, target)
        # Metrics
        total_reward += reward
        mean_loss += (1.0/i)*(l - mean_loss)
        state = new_state
        if done:
            break
    rewards.append(total_reward)
    losses.append(mean_loss)
print('Success rate:', np.mean(rewards))
plt.plot(rewards, label='Episode reward')
plt.plot(losses, label='Loss (MSE)')
plt.plot(epsilons, label='Eps')
plt.legend(loc='upper right')
plt.show()