# dqn.py
import tensorflow as tf


class Qnetwork():
    def __init__(self, h_size, number_of_actions=3):
        # The network receives a frame from the game, flattened into an array.
        # It then reshapes it and processes it through four convolutional layers.
        self.scalarInput = tf.placeholder(shape=[None, 7056], dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 1])
        self.conv1 = tf.contrib.layers.convolution2d(
            inputs=self.imageIn, num_outputs=32, kernel_size=[8, 8], stride=[4, 4], padding='VALID',
            biases_initializer=None)
        self.conv2 = tf.contrib.layers.convolution2d(
            inputs=self.conv1, num_outputs=64, kernel_size=[4, 4], stride=[2, 2], padding='VALID',
            biases_initializer=None)
        self.conv3 = tf.contrib.layers.convolution2d(
            inputs=self.conv2, num_outputs=64, kernel_size=[3, 3], stride=[1, 1], padding='VALID',
            biases_initializer=None)
        self.conv4 = tf.contrib.layers.convolution2d(
            inputs=self.conv3, num_outputs=h_size, kernel_size=[7, 7], stride=[1, 1], padding='VALID',
            biases_initializer=None)
        # We take the output from the final convolutional layer and split it into separate advantage and value streams.
        self.streamAC, self.streamVC = tf.split(3, 2, self.conv4)
        self.streamA = tf.contrib.layers.flatten(self.streamAC)
        self.streamV = tf.contrib.layers.flatten(self.streamVC)
        self.AW = tf.Variable(tf.random_normal([h_size // 2, number_of_actions]))
        self.VW = tf.Variable(tf.random_normal([h_size // 2, 1]))
        self.Advantage = tf.matmul(self.streamA, self.AW)
        self.Value = tf.matmul(self.streamV, self.VW)
        # Then combine them together to get our final Q-values.
        self.Qout = self.Value + tf.sub(self.Advantage,
                                        tf.reduce_mean(self.Advantage, reduction_indices=1, keep_dims=True))
        self.predict = tf.argmax(self.Qout, 1)
        # Below we obtain the loss by taking the sum of squares difference between the target and prediction Q-values.
        self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions, number_of_actions, dtype=tf.float32)
        self.Q = tf.reduce_sum(tf.mul(self.Qout, self.actions_onehot), reduction_indices=1)
        self.td_error = tf.square(self.targetQ - self.Q)
        self.loss = tf.reduce_mean(self.td_error)
        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
        self.updateModel = self.trainer.minimize(self.loss)
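

# Quick sanity check of the dueling combination above (illustrative sketch, not used
# by the training script): Qout = Value + (Advantage - mean_a(Advantage)), i.e. the
# advantages are re-centred to be zero-mean before being added to the state value.
if __name__ == '__main__':
    import numpy as np
    V = np.array([[1.0], [2.0]])                    # value stream, shape [batch, 1]
    A = np.array([[0.5, 1.5, 1.0],
                  [3.0, 0.0, 0.0]])                 # advantage stream, shape [batch, actions]
    Q = V + (A - A.mean(axis=1, keepdims=True))     # same arithmetic as self.Qout
    print(Q)                 # first row: 1 + (-0.5, 0.5, 0.0) = (0.5, 1.5, 1.0)
    print(Q.argmax(axis=1))  # mirrors self.predict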


# experience_buffer.py
import numpy as np
import random


class experience_buffer():
    def __init__(self, buffer_size=50000):
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, experience):
        if len(self.buffer) + len(experience) >= self.buffer_size:
            self.buffer[0:(len(experience) + len(self.buffer)) - self.buffer_size] = []
        self.buffer.extend(experience)

    def sample(self, size):
        return np.reshape(np.array(random.sample(self.buffer, size)), [size, 5])
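

# Each stored experience is a row of [state, action, reward, next_state, done];
# sample() therefore reshapes the drawn rows to [size, 5]. When the buffer is full,
# add() drops the oldest entries to make room for the new ones.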


# Training script: Double DQN with a dueling head on Gym's Skiing-v0.
import gym
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import tensorflow as tf
from experience_buffer import experience_buffer
from dqn import Qnetwork
import os
from skiing import skiing


def processState(states):
    return np.reshape(states, [7056])


def updateTargetGraph(tfVars, tau):
    total_vars = len(tfVars)
    op_holder = []
    for idx, var in enumerate(tfVars[0:total_vars // 2]):
        op_holder.append(tfVars[idx + total_vars // 2].assign(
            (var.value() * tau) + ((1 - tau) * tfVars[idx + total_vars // 2].value())))
    return op_holder


def updateTarget(op_holder, sess):
    for op in op_holder:
        sess.run(op)
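

# updateTargetGraph assumes the first half of tf.trainable_variables() belongs to the
# primary network and the second half to the target network (they are built in that
# order below). Each op applies the soft update
#     theta_target <- tau * theta_primary + (1 - tau) * theta_target,
# so with tau = 0.001 the target network trails the primary network slowly, which
# keeps the bootstrapped Q-targets stable.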
game = skiing()
batch_size = 32 #How many experiences to use for each training step.
update_freq = 4 #How often to perform a training step.
y = .99 #Discount factor on the target Q-values
startE = 1 #Starting chance of random action
endE = 0.1 #Final chance of random action
annealing_steps = 10000. #How many steps of training to reduce startE to endE.
num_episodes = 10000 #How many episodes of game environment to train network with.
pre_train_steps = 10000 #How many steps of random actions before training begins.
max_epLength = 5000 #The max allowed length of our episode.
load_model = True #Whether to load a saved model.
test_model = True #Exit after "done" flag is True
path = "./dqn" #The path to save our model to.
h_size = 512 #The size of the final convolutional layer before splitting it into Advantage and Value streams.
tau = 0.001 #Rate to update target network toward primary network
tf.reset_default_graph()
mainQN = Qnetwork(h_size)
targetQN = Qnetwork(h_size)
init = tf.initialize_all_variables()
saver = tf.train.Saver()
trainables = tf.trainable_variables()
targetOps = updateTargetGraph(trainables, tau)
myBuffer = experience_buffer()
# Set the rate of random action decrease.
e = startE
stepDrop = (startE - endE) / annealing_steps
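# e falls by stepDrop after every environment step once total_steps exceeds
# pre_train_steps, so it reaches endE after roughly annealing_steps training steps.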
# create lists to contain total rewards and steps per episode
jList = []
rList = []
total_steps = 0
# Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)
with tf.Session() as sess:
    sess.run(init)
    if load_model == True:
        print 'Loading Model...'
        ckpt = tf.train.get_checkpoint_state(path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    updateTarget(targetOps, sess)  # Soft-update the target network toward the primary network.
    for i in range(num_episodes):
        episodeBuffer = experience_buffer()
        # Reset environment and get first new observation
        s = game.reset()
        s = processState(s)
        d = False
        rAll = 0
        j = 0
        # The Q-Network
        while j < max_epLength:  # End the episode after at most max_epLength steps.
            j += 1
            # Choose an action greedily (with probability e of a random action) from the Q-network.
            if np.random.rand(1) < e or total_steps < pre_train_steps:
                a = np.random.randint(0, 3)
            else:
                a = sess.run(mainQN.predict, feed_dict={mainQN.scalarInput: [s]})[0]
            s1, r, d, _ = game.step(a)
            game.env.render()
            s1 = processState(s1)
            total_steps += 1
            episodeBuffer.add(
                np.reshape(np.array([s, a, r, s1, d]), [1, 5]))  # Save the experience to our episode buffer.
            if total_steps > pre_train_steps:
                if e > endE:
                    e -= stepDrop
                if total_steps % (update_freq) == 0:
                    trainBatch = myBuffer.sample(batch_size)  # Get a random batch of experiences.
                    # Below we perform the Double-DQN update to the target Q-values.
                    Q1 = sess.run(mainQN.predict, feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:, 3])})
                    Q2 = sess.run(targetQN.Qout, feed_dict={targetQN.scalarInput: np.vstack(trainBatch[:, 3])})
                    end_multiplier = -(trainBatch[:, 4] - 1)
                    doubleQ = Q2[range(batch_size), Q1]
                    targetQ = trainBatch[:, 2] + (y * doubleQ * end_multiplier)
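                    # Double-DQN target computed above:
                    #   a_star  = argmax_a Q_main(s', a)                   -> Q1
                    #   targetQ = r + y * Q_target(s', a_star) * (1 - done)
                    # end_multiplier equals (1 - done), so terminal transitions do not
                    # bootstrap from the next state.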
                    # Update the network with our target values.
                    _ = sess.run(mainQN.updateModel,
                                 feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:, 0]),
                                            mainQN.targetQ: targetQ,
                                            mainQN.actions: trainBatch[:, 1]})
                    updateTarget(targetOps, sess)  # Soft-update the target network toward the primary network.
            rAll += r
            s = s1
            if d == True:
                if test_model:
                    game.env.close()
                break
        # Add all experiences from this episode to the replay buffer.
        myBuffer.add(episodeBuffer.buffer)
        jList.append(j)
        rList.append(rAll)
        # Periodically save the model.
        if i % 1000 == 0:
            saver.save(sess, path + '/model-' + str(i) + '.cptk')
            print "Saved Model"
        if len(rList) % 10 == 0:
            print total_steps, np.mean(rList[-10:]), e
    saver.save(sess, path + '/model-' + str(i) + '.cptk')
print "Percent of successful episodes: " + str(sum(rList) / num_episodes) + "%"


# skiing.py
import gym
import scipy.misc
from gym import wrappers
#def rgb2gray(rgb):
# return np.dot(rgb[...,:3], [0.299, 0.587, 0.114])
def image_cut(img, left_border=5, right_border=155, top_border=65, bottom_border=190):
    # Crop the playfield and keep a single colour channel.
    return img[top_border:bottom_border, left_border:right_border, -1]

def img_preprocess(img):
    img = image_cut(img)
    img = scipy.misc.imresize(img, (84, 84), interp='nearest')
    return img

class skiing():
    def __init__(self):
        self.env = gym.make('Skiing-v0')
        self.env = wrappers.Monitor(self.env, '/tmp/skiing-experiment-0')

    def reset(self):
        observation = self.env.reset()
        return img_preprocess(observation)

    def step(self, action):
        # a = 0 - go down
        # a = 1 - go right
        # a = 2 - go left
        observation, reward, done, info = self.env.step(action)
        return img_preprocess(observation), reward, done, info
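

# Usage sketch (illustrative, not part of the training script): the wrapper exposes
# the usual reset()/step() interface but returns cropped, resized 84x84 single-channel
# frames, which processState() in the training script then flattens to length 7056.
if __name__ == '__main__':
    game = skiing()
    frame = game.reset()
    print(frame.shape)                        # (84, 84)
    frame, reward, done, info = game.step(0)  # action 0: go down
    print(done)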