Skip to content

Instantly share code, notes, and snippets.

Created November 19, 2016 21:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save parthsharma1996/7f5997e9e6435b9144b3f12a725f6fc5 to your computer and use it in GitHub Desktop.
Save parthsharma1996/7f5997e9e6435b9144b3f12a725f6fc5 to your computer and use it in GitHub Desktop.
Modification of the Double Dueling DQN code by Arthur Juliani to make it work for Pong.
# Modification of Arthur Juliani's Double Dueling DQN code, adapted to play Pong.
import gym
import numpy as np
import random
import tensorflow as tf
#import matplotlib.pyplot as plt
import scipy.misc
import os
#%matplotlib inline
# Gym environment the agent is trained on (Atari Pong).
env = gym.make("Pong-v0")
# Side length of the square preprocessed frame; frames are flattened to
# size*size values before being fed to the network.
size = 80
class Qnetwork():
    """Dueling Q-network over a flattened ``size`` x ``size`` game frame.

    Building an instance adds the following tensors/ops to the default graph:

    * ``scalarInput`` - placeholder for a batch of flattened frames,
      shape ``[None, size**2]``.
    * ``Qout`` - Q-value for every action, ``predict`` - argmax action.
    * ``targetQ`` / ``actions`` - placeholders used for training.
    * ``loss`` / ``updateModel`` - mean squared TD error and the Adam step
      that minimises it.

    The class reads the module-level names ``size`` and ``numActions``.
    """

    def __init__(self,h_size):
        """Build the network graph.

        h_size: number of channels produced by the last convolution. It is
            split evenly between the advantage and value streams, so it must
            be even.
        """
        #The network receives a frame from the game, flattened into an array.
        #It then resizes it and processes it through four convolutional layers.
        self.scalarInput = tf.placeholder(shape=[None,size**2],dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput,shape=[-1,size,size,1])
        self.conv1 = tf.contrib.layers.convolution2d(
            inputs=self.imageIn,num_outputs=32,kernel_size=[8,8],stride=[4,4],
            padding='VALID', biases_initializer=None)
        self.conv2 = tf.contrib.layers.convolution2d(
            inputs=self.conv1,num_outputs=64,kernel_size=[3,3],stride=[2,2],
            padding='VALID', biases_initializer=None)
        self.conv3 = tf.contrib.layers.convolution2d(
            inputs=self.conv2,num_outputs=64,kernel_size=[3,3],stride=[1,1],
            padding='VALID', biases_initializer=None)
        #With an 80x80 input the VALID convolutions shrink the feature map
        #19 -> 9 -> 7 -> 1, so conv4 yields a 1x1 map with h_size channels.
        #The channel count must equal h_size because the stream weights below
        #are sized from h_size.
        self.conv4 = tf.contrib.layers.convolution2d(
            inputs=self.conv3,num_outputs=h_size,kernel_size=[7,7],stride=[1,1],
            padding='VALID', biases_initializer=None)

        #We take the output from the final convolutional layer and split it
        #along the channel axis into separate advantage and value streams.
        self.streamAC,self.streamVC = tf.split(3,2,self.conv4)
        self.streamA = tf.contrib.layers.flatten(self.streamAC)
        self.streamV = tf.contrib.layers.flatten(self.streamVC)
        self.AW = tf.Variable(tf.random_normal([h_size//2,numActions]))
        self.VW = tf.Variable(tf.random_normal([h_size//2,1]))
        self.Advantage = tf.matmul(self.streamA,self.AW)
        self.Value = tf.matmul(self.streamV,self.VW)

        #Then combine them together to get our final Q-values:
        #Q = V + (A - mean(A)).
        self.Qout = self.Value + tf.sub(self.Advantage,tf.reduce_mean(self.Advantage,reduction_indices=1,keep_dims=True))
        self.predict = tf.argmax(self.Qout,1)

        #Below we obtain the loss as the mean squared difference between the
        #target Q-values and the predicted Q-values of the actions taken.
        self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions,numActions,dtype=tf.float32)

        #The one-hot mask selects the Q-value of the chosen action per row.
        self.Q = tf.reduce_sum(tf.mul(self.Qout, self.actions_onehot), reduction_indices=1)

        self.td_error = tf.square(self.targetQ - self.Q)
        self.loss = tf.reduce_mean(self.td_error)
        self.trainer = tf.train.AdamOptimizer(learning_rate=0.01)
        self.updateModel = self.trainer.minimize(self.loss)
class experience_buffer():
    """Bounded first-in-first-out store of transitions for experience replay.

    Each stored item is one transition with five fields; ``sample`` reshapes
    the drawn items to ``[size, 5]``.
    """

    def __init__(self, buffer_size = 50000):
        """Create an empty buffer holding at most ``buffer_size`` transitions."""
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self,experience):
        """Append every transition in ``experience``, evicting the oldest.

        experience: an iterable of transitions (e.g. a list of rows or a 2-D
            array whose rows are transitions).

        After the call the buffer never holds more than ``buffer_size``
        items; when it would overflow, the oldest items are dropped first.
        """
        self.buffer.extend(experience)
        overflow = len(self.buffer) - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]

    def sample(self,size):
        """Return ``size`` distinct stored transitions as a ``[size, 5]`` array.

        Raises ValueError (from ``random.sample``) when fewer than ``size``
        transitions are stored.
        """
        return np.reshape(np.array(random.sample(self.buffer,size)),[size,5])
def preProcess(obs):
    """Turn a raw game frame into a flattened binary image.

    obs: frame array indexed as [row, column, channel]; rows 35..194 are
        kept, then every second row and column of channel 0.

    Returns a float array of shape ``(1, size*size)`` holding 1.0 where the
    kept pixel is foreground and 0.0 elsewhere. The input frame is left
    untouched.
    """
    # Crop and 2x-downsample in a single slice; this is only a view of obs.
    playfield = obs[35:195:2, ::2, 0]
    # Values 144 and 109 are erased (presumably the two background colours -
    # TODO confirm); every other non-zero pixel counts as foreground.
    # Building a boolean mask instead of assigning in place keeps the
    # caller's frame unmodified.
    foreground = (playfield != 0) & (playfield != 144) & (playfield != 109)
    # Builtin float is float64, same as the removed np.float alias.
    return foreground.astype(float).reshape(1, size*size)
def updateTargetGraph(tfVars,tau):
    """Build the ops that move the target network toward the primary one.

    tfVars: list of variables whose first half belongs to the primary
        network and whose second half holds the matching target-network
        variables in the same order.
    tau: blend rate; each target variable becomes
        ``tau * primary + (1 - tau) * target``.

    Returns a list with one assign op per primary/target pair. Nothing is
    executed here; the ops must be run in a session.
    """
    half = len(tfVars)//2
    op_holder = []
    # zip stops at the shorter half, so an odd trailing variable is ignored.
    for main_var, target_var in zip(tfVars[:half], tfVars[half:]):
        blended = (main_var.value()*tau) + ((1-tau)*target_var.value())
        op_holder.append(target_var.assign(blended))
    return op_holder
def updateTarget(op_holder,sess):
    """Run every op in ``op_holder``, in order, in the session ``sess``.

    op_holder: list of ops, as returned by ``updateTargetGraph``.
    sess: session object whose ``run`` method executes a single op.
    """
    for op in op_holder:
        sess.run(op)
# ---------------------------------------------------------------------------
# Training script: hyper-parameters, graph construction and the training loop.
# The agent acts on difference frames (current frame minus previous frame).
# ---------------------------------------------------------------------------
batch_size = 32 #How many experiences to use for each training step.
update_freq = 4 #How often to perform a training step.
y = .99 #Discount factor on the target Q-values
startE = 1 #Starting chance of random action
endE = 0.1 #Final chance of random action
anneling_steps = 10000. #How many steps of training to reduce startE to endE.
num_episodes = 10000 #How many episodes of game environment to train network with.
pre_train_steps = 10000 #How many steps of random actions before training begins.
max_epLength = 500000000000 #The max allowed length of our episode.
load_model = False #Whether to load a saved model.
path = "./dqn" #The path to save our model to.
h_size = 512 #The size of the final convolutional layer before splitting it into Advantage and Value streams.
tau = 0.1 #Rate to update target network toward primary network
numActions = env.action_space.n #Number of discrete actions; read by Qnetwork.

#The primary network must be built first: updateTargetGraph treats the first
#half of the trainable variables as primary and the second half as target.
mainQN = Qnetwork(h_size)
targetQN = Qnetwork(h_size)

init = tf.initialize_all_variables()
saver = tf.train.Saver()
trainables = tf.trainable_variables()
targetOps = updateTargetGraph(trainables,tau)
myBuffer = experience_buffer()

#Set the rate of random action decrease.
e = startE
stepDrop = (startE - endE)/anneling_steps

#create lists to contain total rewards and steps per episode
jList = []
rList = []
total_steps = 0

#Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)

with tf.Session() as sess:
    #Initialise first so that a restored checkpoint is not overwritten.
    sess.run(init)
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(path)
        saver.restore(sess,ckpt.model_checkpoint_path)
    updateTarget(targetOps,sess) #Move the target network toward the primary network (soft update with rate tau).
    for i in range(num_episodes):
        episodeBuffer = experience_buffer()
        #Reset environment and get first new observation
        s = env.reset()
        s = preProcess(s)
        #No earlier frame exists yet, so the first difference frame is all zeros.
        prevObs = s
        d = False
        rAll = 0
        j = 0
        #The Q-Network
        while j < max_epLength: #max_epLength is sufficiently high that episodes end via d.
            j += 1
            curObs = s
            modObs = curObs - prevObs
            #Choose an action greedily (with e chance of random action) from the Q-network
            if np.random.rand(1) < e or total_steps < pre_train_steps:
                a = np.random.randint(0,numActions)
            else:
                a = sess.run(mainQN.predict,feed_dict={mainQN.scalarInput:modObs})[0]
            s1,r,d,l = env.step(a)
            s1 = preProcess(s1)
            prevObs = s
            total_steps += 1
            #Save the experience to our episode buffer. The fields have different
            #shapes, so the row must be an object array. s1-prevObs is the
            #difference frame the agent will see on the next step.
            episodeBuffer.add(np.reshape(np.array([modObs,a,r,s1-prevObs,d],dtype=object),[1,5]))

            if total_steps > pre_train_steps:
                if e > endE:
                    e -= stepDrop

                #Episodes are copied into myBuffer only when they end, so wait
                #until it can actually supply a full batch.
                if total_steps % (update_freq) == 0 and len(myBuffer.buffer) >= batch_size:
                    trainBatch = myBuffer.sample(batch_size) #Get a random batch of experiences.
                    #Below we perform the Double-DQN update to the target Q-values:
                    #the primary network picks the action, the target network rates it.
                    Q1 = sess.run(mainQN.predict,feed_dict={mainQN.scalarInput:np.vstack(trainBatch[:,3])})
                    Q2 = sess.run(targetQN.Qout,feed_dict={targetQN.scalarInput:np.vstack(trainBatch[:,3])})
                    end_multiplier = -(trainBatch[:,4] - 1) #0 for terminal transitions, 1 otherwise.
                    doubleQ = Q2[range(batch_size),Q1]
                    targetQ = trainBatch[:,2] + (y*doubleQ * end_multiplier)
                    #Update the network with our target values.
                    _ = sess.run(mainQN.updateModel,
                        feed_dict={mainQN.scalarInput:np.vstack(trainBatch[:,0]),mainQN.targetQ:targetQ, mainQN.actions:trainBatch[:,1]})

                    updateTarget(targetOps,sess) #Move the target network toward the primary network.
            rAll += r
            s = s1

            if d:
                print(i,j,total_steps,rAll, e)
                break

        #Add all experiences from this episode to the replay buffer.
        myBuffer.add(episodeBuffer.buffer)
        jList.append(j)
        rList.append(rAll)
        #Periodically save the model.
        if i % 1000 == 0:
            saver.save(sess,path+'/model-'+str(i)+'.cptk')
            print("Saved Model")
    saver.save(sess,path+'/model-'+str(i)+'.cptk')
print("Percent of succesful episodes: " + str(sum(rList)//num_episodes) + "%")
#Average episode reward over consecutive groups of 100 episodes.
rMat = np.resize(np.array(rList),[len(rList)//100,100])
rMean = np.average(rMat,1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment