yoheitaonishi/Deep-Recurrent-Q-Network.ipynb

## Deep-Recurrent-Q-Network.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Deep Recurrent Q-Network \n",
    "This notebook provides an example implementation of a Deep Recurrent Q-Network which can solve Partially Observable Markov Decision Processes. To learn more about DRQNs, see my blog post on them here: https://medium.com/p/68463e9aeefc .\n",
    "\n",
    "For more reinforcement learning tutorials, as well as the additionally required `gridworld.py` and `helper.py`, see:\n",
    "https://github.com/awjuliani/DeepRL-Agents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import random\n",
    "import tensorflow as tf\n",
    "import matplotlib.pyplot as plt\n",
    "import scipy.misc\n",
    "import os\n",
    "import csv\n",
    "import itertools\n",
    "import tensorflow.contrib.slim as slim\n",
    "%matplotlib inline\n",
    "\n",
    "from helper import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the game environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from gridworld import gameEnv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Feel free to adjust the size of the gridworld. Making it smaller (adjusting `size`) provides an easier task for our DRQN agent, while making the world larger increases the challenge.\n",
    "\n",
    "Initializing the Gridworld with `partial=True` limits the field of view, resulting in a partially observable MDP. Initializing it with `partial=False` provides the agent with the entire environment, resulting in a fully observable MDP."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "env = gameEnv(partial=False,size=9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "env = gameEnv(partial=True,size=9)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Above are examples of a starting environment in our simple game. The agent controls the blue square, and can move up, down, left, or right. The goal is to move to the green squares (for +1 reward) and avoid the red squares (for -1 reward). When the agent moves through a green or red square, it is randomly moved to a new place in the environment."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Implementing the network itself"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class Qnetwork():\n",
    "    def __init__(self,h_size,rnn_cell,myScope):\n",
    "        #The network recieves a frame from the game, flattened into an array.\n",
    "        #It then resizes it and processes it through four convolutional layers.\n",
    "        self.scalarInput =  tf.placeholder(shape=[None,21168],dtype=tf.float32)\n",
    "        self.imageIn = tf.reshape(self.scalarInput,shape=[-1,84,84,3])\n",
    "        self.conv1 = slim.convolution2d( \\\n",
    "            inputs=self.imageIn,num_outputs=32,\\\n",
    "            kernel_size=[8,8],stride=[4,4],padding='VALID', \\\n",
    "            biases_initializer=None,scope=myScope+'_conv1')\n",
    "        self.conv2 = slim.convolution2d( \\\n",
    "            inputs=self.conv1,num_outputs=64,\\\n",
    "            kernel_size=[4,4],stride=[2,2],padding='VALID', \\\n",
    "            biases_initializer=None,scope=myScope+'_conv2')\n",
    "        self.conv3 = slim.convolution2d( \\\n",
    "            inputs=self.conv2,num_outputs=64,\\\n",
    "            kernel_size=[3,3],stride=[1,1],padding='VALID', \\\n",
    "            biases_initializer=None,scope=myScope+'_conv3')\n",
    "        self.conv4 = slim.convolution2d( \\\n",
    "            inputs=self.conv3,num_outputs=h_size,\\\n",
    "            kernel_size=[7,7],stride=[1,1],padding='VALID', \\\n",
    "            biases_initializer=None,scope=myScope+'_conv4')\n",
    "        \n",
    "        self.trainLength = tf.placeholder(dtype=tf.int32)\n",
    "        #We take the output from the final convolutional layer and send it to a recurrent layer.\n",
    "        #The input must be reshaped into [batch x trace x units] for rnn processing, \n",
    "        #and then returned to [batch x units] when sent through the upper levles.\n",
    "        self.batch_size = tf.placeholder(dtype=tf.int32,shape=[])\n",
    "        self.convFlat = tf.reshape(slim.flatten(self.conv4),[self.batch_size,self.trainLength,h_size])\n",
    "        self.state_in = rnn_cell.zero_state(self.batch_size, tf.float32)\n",
    "        self.rnn,self.rnn_state = tf.nn.dynamic_rnn(\\\n",
    "                inputs=self.convFlat,cell=rnn_cell,dtype=tf.float32,initial_state=self.state_in,scope=myScope+'_rnn')\n",
    "        self.rnn = tf.reshape(self.rnn,shape=[-1,h_size])\n",
    "        #The output from the recurrent player is then split into separate Value and Advantage streams\n",
    "        self.streamA,self.streamV = tf.split(self.rnn,2,1)\n",
    "        self.AW = tf.Variable(tf.random_normal([h_size//2,4]))\n",
    "        self.VW = tf.Variable(tf.random_normal([h_size//2,1]))\n",
    "        self.Advantage = tf.matmul(self.streamA,self.AW)\n",
    "        self.Value = tf.matmul(self.streamV,self.VW)\n",
    "        \n",
    "        self.salience = tf.gradients(self.Advantage,self.imageIn)\n",
    "        #Then combine them together to get our final Q-values.\n",
    "        self.Qout = self.Value + tf.subtract(self.Advantage,tf.reduce_mean(self.Advantage,axis=1,keep_dims=True))\n",
    "        self.predict = tf.argmax(self.Qout,1)\n",
    "        \n",
    "        #Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.\n",
    "        self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32)\n",
    "        self.actions = tf.placeholder(shape=[None],dtype=tf.int32)\n",
    "        self.actions_onehot = tf.one_hot(self.actions,4,dtype=tf.float32)\n",
    "        \n",
    "        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)\n",
    "        \n",
    "        self.td_error = tf.square(self.targetQ - self.Q)\n",
    "        \n",
    "        #In order to only propogate accurate gradients through the network, we will mask the first\n",
    "        #half of the losses for each trace as per Lample & Chatlot 2016\n",
    "        self.maskA = tf.zeros([self.batch_size,self.trainLength//2])\n",
    "        self.maskB = tf.ones([self.batch_size,self.trainLength//2])\n",
    "        self.mask = tf.concat([self.maskA,self.maskB],1)\n",
    "        self.mask = tf.reshape(self.mask,[-1])\n",
    "        self.loss = tf.reduce_mean(self.td_error * self.mask)\n",
    "        \n",
    "        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)\n",
    "        self.updateModel = self.trainer.minimize(self.loss)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Experience Replay"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "These classes allow us to store experiences and sample them randomly to train the network.\n",
    "The episode buffer stores experiences for each individual episode.\n",
    "The experience buffer stores entire episodes of experience, and `sample()` allows us to get the training batches needed by the network."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class experience_buffer():\n",
    "    def __init__(self, buffer_size = 1000):\n",
    "        self.buffer = []\n",
    "        self.buffer_size = buffer_size\n",
    "    \n",
    "    def add(self,experience):\n",
    "        if len(self.buffer) + 1 >= self.buffer_size:\n",
    "            self.buffer[0:(1+len(self.buffer))-self.buffer_size] = []\n",
    "        self.buffer.append(experience)\n",
    "            \n",
    "    def sample(self,batch_size,trace_length):\n",
    "        sampled_episodes = random.sample(self.buffer,batch_size)\n",
    "        sampledTraces = []\n",
    "        for episode in sampled_episodes:\n",
    "            point = np.random.randint(0,len(episode)+1-trace_length)\n",
    "            sampledTraces.append(episode[point:point+trace_length])\n",
    "        sampledTraces = np.array(sampledTraces)\n",
    "        return np.reshape(sampledTraces,[batch_size*trace_length,5])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Training the network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Setting the training parameters\n",
    "batch_size = 4 #How many experience traces to use for each training step.\n",
    "trace_length = 8 #How long each experience trace will be when training\n",
    "update_freq = 5 #How often to perform a training step.\n",
    "y = .99 #Discount factor on the target Q-values\n",
    "startE = 1 #Starting chance of random action\n",
    "endE = 0.1 #Final chance of random action\n",
    "anneling_steps = 10000 #How many steps of training to reduce startE to endE.\n",
    "num_episodes = 10000 #How many episodes of game environment to train network with.\n",
    "pre_train_steps = 10000 #How many steps of random actions before training begins.\n",
    "load_model = False #Whether to load a saved model.\n",
    "path = \"./drqn\" #The path to save our model to.\n",
    "h_size = 512 #The size of the final convolutional layer before splitting it into Advantage and Value streams.\n",
    "max_epLength = 50 #The max allowed length of our episode.\n",
    "time_per_step = 1 #Length of each step used in gif creation\n",
    "summaryLength = 100 #Number of epidoes to periodically save for analysis\n",
    "tau = 0.001"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "tf.reset_default_graph()\n",
    "#We define the cells for the primary and target q-networks\n",
    "cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,state_is_tuple=True)\n",
    "cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,state_is_tuple=True)\n",
    "mainQN = Qnetwork(h_size,cell,'main')\n",
    "targetQN = Qnetwork(h_size,cellT,'target')\n",
    "\n",
    "init = tf.global_variables_initializer()\n",
    "\n",
    "saver = tf.train.Saver(max_to_keep=5)\n",
    "\n",
    "trainables = tf.trainable_variables()\n",
    "\n",
    "targetOps = updateTargetGraph(trainables,tau)\n",
    "\n",
    "myBuffer = experience_buffer()\n",
    "\n",
    "#Set the rate of random action decrease. \n",
    "e = startE\n",
    "stepDrop = (startE - endE)/anneling_steps\n",
    "\n",
    "#create lists to contain total rewards and steps per episode\n",
    "jList = []\n",
    "rList = []\n",
    "total_steps = 0\n",
    "\n",
    "#Make a path for our model to be saved in.\n",
    "if not os.path.exists(path):\n",
    "    os.makedirs(path)\n",
    "\n",
    "##Write the first line of the master log-file for the Control Center\n",
    "with open('./Center/log.csv', 'w') as myfile:\n",
    "    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)\n",
    "    wr.writerow(['Episode','Length','Reward','IMG','LOG','SAL'])    \n",
    "  \n",
    "\n",
    "with tf.Session() as sess:\n",
    "    if load_model == True:\n",
    "        print ('Loading Model...')\n",
    "        ckpt = tf.train.get_checkpoint_state(path)\n",
    "        saver.restore(sess,ckpt.model_checkpoint_path)\n",
    "    sess.run(init)\n",
    "   \n",
    "    updateTarget(targetOps,sess) #Set the target network to be equal to the primary network.\n",
    "    for i in range(num_episodes):\n",
    "        episodeBuffer = []\n",
    "        #Reset environment and get first new observation\n",
    "        sP = env.reset()\n",
    "        s = processState(sP)\n",
    "        d = False\n",
    "        rAll = 0\n",
    "        j = 0\n",
    "        state = (np.zeros([1,h_size]),np.zeros([1,h_size])) #Reset the recurrent layer's hidden state\n",
    "        #The Q-Network\n",
    "        while j < max_epLength: \n",
    "            j+=1\n",
    "            #Choose an action by greedily (with e chance of random action) from the Q-network\n",
    "            if np.random.rand(1) < e or total_steps < pre_train_steps:\n",
    "                state1 = sess.run(mainQN.rnn_state,\\\n",
    "                    feed_dict={mainQN.scalarInput:[s/255.0],mainQN.trainLength:1,mainQN.state_in:state,mainQN.batch_size:1})\n",
    "                a = np.random.randint(0,4)\n",
    "            else:\n",
    "                a, state1 = sess.run([mainQN.predict,mainQN.rnn_state],\\\n",
    "                    feed_dict={mainQN.scalarInput:[s/255.0],mainQN.trainLength:1,mainQN.state_in:state,mainQN.batch_size:1})\n",
    "                a = a[0]\n",
    "            s1P,r,d = env.step(a)\n",
    "            s1 = processState(s1P)\n",
    "            total_steps += 1\n",
    "            episodeBuffer.append(np.reshape(np.array([s,a,r,s1,d]),[1,5]))\n",
    "            if total_steps > pre_train_steps:\n",
    "                if e > endE:\n",
    "                    e -= stepDrop\n",
    "\n",
    "                if total_steps % (update_freq) == 0:\n",
    "                    updateTarget(targetOps,sess)\n",
    "                    #Reset the recurrent layer's hidden state\n",
    "                    state_train = (np.zeros([batch_size,h_size]),np.zeros([batch_size,h_size])) \n",
    "                    \n",
    "                    trainBatch = myBuffer.sample(batch_size,trace_length) #Get a random batch of experiences.\n",
    "                    #Below we perform the Double-DQN update to the target Q-values\n",
    "                    Q1 = sess.run(mainQN.predict,feed_dict={\\\n",
    "                        mainQN.scalarInput:np.vstack(trainBatch[:,3]/255.0),\\\n",
    "                        mainQN.trainLength:trace_length,mainQN.state_in:state_train,mainQN.batch_size:batch_size})\n",
    "                    Q2 = sess.run(targetQN.Qout,feed_dict={\\\n",
    "                        targetQN.scalarInput:np.vstack(trainBatch[:,3]/255.0),\\\n",
    "                        targetQN.trainLength:trace_length,targetQN.state_in:state_train,targetQN.batch_size:batch_size})\n",
    "                    end_multiplier = -(trainBatch[:,4] - 1)\n",
    "                    doubleQ = Q2[range(batch_size*trace_length),Q1]\n",
    "                    targetQ = trainBatch[:,2] + (y*doubleQ * end_multiplier)\n",
    "                    #Update the network with our target values.\n",
    "                    sess.run(mainQN.updateModel, \\\n",
    "                        feed_dict={mainQN.scalarInput:np.vstack(trainBatch[:,0]/255.0),mainQN.targetQ:targetQ,\\\n",
    "                        mainQN.actions:trainBatch[:,1],mainQN.trainLength:trace_length,\\\n",
    "                        mainQN.state_in:state_train,mainQN.batch_size:batch_size})\n",
    "            rAll += r\n",
    "            s = s1\n",
    "            sP = s1P\n",
    "            state = state1\n",
    "            if d == True:\n",
    "\n",
    "                break\n",
    "\n",
    "        #Add the episode to the experience buffer\n",
    "        bufferArray = np.array(episodeBuffer)\n",
    "        episodeBuffer = list(zip(bufferArray))\n",
    "        myBuffer.add(episodeBuffer)\n",
    "        jList.append(j)\n",
    "        rList.append(rAll)\n",
    "\n",
    "        #Periodically save the model. \n",
    "        if i % 1000 == 0 and i != 0:\n",
    "            saver.save(sess,path+'/model-'+str(i)+'.cptk')\n",
    "            print (\"Saved Model\")\n",
    "        if len(rList) % summaryLength == 0 and len(rList) != 0:\n",
    "            print (total_steps,np.mean(rList[-summaryLength:]), e)\n",
    "            saveToCenter(i,rList,jList,np.reshape(np.array(episodeBuffer),[len(episodeBuffer),5]),\\\n",
    "                summaryLength,h_size,sess,mainQN,time_per_step)\n",
    "    saver.save(sess,path+'/model-'+str(i)+'.cptk')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### Testing the network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "e = 0.01 #The chance of chosing a random action\n",
    "num_episodes = 10000 #How many episodes of game environment to train network with.\n",
    "load_model = True #Whether to load a saved model.\n",
    "path = \"./drqn\" #The path to save/load our model to/from.\n",
    "h_size = 512 #The size of the final convolutional layer before splitting it into Advantage and Value streams.\n",
    "h_size = 512 #The size of the final convolutional layer before splitting it into Advantage and Value streams.\n",
    "max_epLength = 50 #The max allowed length of our episode.\n",
    "time_per_step = 1 #Length of each step used in gif creation\n",
    "summaryLength = 100 #Number of epidoes to periodically save for analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "tf.reset_default_graph()\n",
    "cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,state_is_tuple=True)\n",
    "cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,state_is_tuple=True)\n",
    "mainQN = Qnetwork(h_size,cell,'main')\n",
    "targetQN = Qnetwork(h_size,cellT,'target')\n",
    "\n",
    "init = tf.global_variables_initializer()\n",
    "\n",
    "saver = tf.train.Saver(max_to_keep=2)\n",
    "\n",
    "#create lists to contain total rewards and steps per episode\n",
    "jList = []\n",
    "rList = []\n",
    "total_steps = 0\n",
    "\n",
    "#Make a path for our model to be saved in.\n",
    "if not os.path.exists(path):\n",
    "    os.makedirs(path)\n",
    "\n",
    "##Write the first line of the master log-file for the Control Center\n",
    "with open('./Center/log.csv', 'w') as myfile:\n",
    "    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)\n",
    "    wr.writerow(['Episode','Length','Reward','IMG','LOG','SAL'])    \n",
    "    \n",
    "    #wr = csv.writer(open('./Center/log.csv', 'a'), quoting=csv.QUOTE_ALL)\n",
    "with tf.Session() as sess:\n",
    "    if load_model == True:\n",
    "        print ('Loading Model...')\n",
    "        ckpt = tf.train.get_checkpoint_state(path)\n",
    "        saver.restore(sess,ckpt.model_checkpoint_path)\n",
    "    else:\n",
    "        sess.run(init)\n",
    "\n",
    "        \n",
    "    for i in range(num_episodes):\n",
    "        episodeBuffer = []\n",
    "        #Reset environment and get first new observation\n",
    "        sP = env.reset()\n",
    "        s = processState(sP)\n",
    "        d = False\n",
    "        rAll = 0\n",
    "        j = 0\n",
    "        state = (np.zeros([1,h_size]),np.zeros([1,h_size]))\n",
    "        #The Q-Network\n",
    "        while j < max_epLength: #If the agent takes longer than 200 moves to reach either of the blocks, end the trial.\n",
    "            j+=1\n",
    "            #Choose an action by greedily (with e chance of random action) from the Q-network\n",
    "            if np.random.rand(1) < e:\n",
    "                state1 = sess.run(mainQN.rnn_state,\\\n",
    "                    feed_dict={mainQN.scalarInput:[s/255.0],mainQN.trainLength:1,mainQN.state_in:state,mainQN.batch_size:1})\n",
    "                a = np.random.randint(0,4)\n",
    "            else:\n",
    "                a, state1 = sess.run([mainQN.predict,mainQN.rnn_state],\\\n",
    "                    feed_dict={mainQN.scalarInput:[s/255.0],mainQN.trainLength:1,\\\n",
    "                    mainQN.state_in:state,mainQN.batch_size:1})\n",
    "                a = a[0]\n",
    "            s1P,r,d = env.step(a)\n",
    "            s1 = processState(s1P)\n",
    "            total_steps += 1\n",
    "            episodeBuffer.append(np.reshape(np.array([s,a,r,s1,d]),[1,5])) #Save the experience to our episode buffer.\n",
    "            rAll += r\n",
    "            s = s1\n",
    "            sP = s1P\n",
    "            state = state1\n",
    "            if d == True:\n",
    "\n",
    "                break\n",
    "\n",
    "        bufferArray = np.array(episodeBuffer)\n",
    "        jList.append(j)\n",
    "        rList.append(rAll)\n",
    "\n",
    "        #Periodically save the model. \n",
    "        if len(rList) % summaryLength == 0 and len(rList) != 0:\n",
    "            print (total_steps,np.mean(rList[-summaryLength:]), e)\n",
    "            saveToCenter(i,rList,jList,np.reshape(np.array(episodeBuffer),[len(episodeBuffer),5]),\\\n",
    "                summaryLength,h_size,sess,mainQN,time_per_step)\n",
    "print (\"Percent of succesful episodes: \" + str(sum(rList)/num_episodes) + \"%\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}

## gridworld.py
import numpy as np
import random
import itertools
import scipy.misc
import matplotlib.pyplot as plt


class gameOb():
    """Plain record describing one square placed on the grid.

    The constructor stores its arguments unchanged: ``coordinates[0]`` becomes
    ``x``, ``coordinates[1]`` becomes ``y``, and every other argument is kept
    under its own name.
    """
    def __init__(self,coordinates,size,intensity,channel,reward,name):
        # Position on the grid, taken from the first two coordinate entries.
        self.x, self.y = coordinates[0], coordinates[1]
        # Drawing attributes: edge length, value written, and colour plane index.
        self.size, self.intensity, self.channel = size, intensity, channel
        # Reward handed out on contact and the label used to tell objects apart.
        self.reward, self.name = reward, name

class gameEnv():
    """Square gridworld holding one hero, four 'goal' squares and two 'fire' squares.

    Actions are integers: 0 - up, 1 - down, 2 - left, 3 - right. Landing on a
    goal returns its reward (+1) and landing on a fire returns its reward (-1);
    the touched square is then replaced by a new one of the same kind on a
    random free cell. The environment never ends an episode by itself:
    ``step`` always reports ``done`` as False.
    """

    def __init__(self,partial,size):
        """Build a ``size`` x ``size`` world and draw its first frame.

        partial -- when equal to True, rendered frames are cropped to the 3x3
                   window centred on the hero before being resized.
        size    -- number of cells along each side of the grid.

        Side effect: the initial frame is drawn with ``plt.imshow``.
        """
        self.sizeX = size
        self.sizeY = size
        self.actions = 4
        self.objects = []
        self.partial = partial
        a = self.reset()
        plt.imshow(a,interpolation="nearest")


    def reset(self):
        """Repopulate the grid with fresh objects and return the rendered frame.

        The frame is also stored on ``self.state``.
        """
        self.objects = []
        # (channel, reward, name) for each object, in spawn order. The hero must
        # come first because moveChar() treats self.objects[0] as the hero.
        spawn_order = [
            (2, None, 'hero'),
            (1, 1, 'goal'),
            (0, -1, 'fire'),
            (1, 1, 'goal'),
            (0, -1, 'fire'),
            (1, 1, 'goal'),
            (1, 1, 'goal'),
        ]
        for channel, reward, name in spawn_order:
            # newPosition() is evaluated before the append, so each object lands
            # on a cell not used by the objects created so far.
            self.objects.append(gameOb(self.newPosition(),1,1,channel,reward,name))
        state = self.renderEnv()
        self.state = state
        return state

    def moveChar(self,direction):
        """Move the hero one cell in ``direction`` unless a wall blocks it.

        direction -- 0 - up, 1 - down, 2 - left, 3 - right. Any other value
                     leaves the hero where it is.

        Returns the movement penalty, which is always 0.0.
        """
        hero = self.objects[0]
        if direction == 0 and hero.y >= 1:
            hero.y -= 1
        elif direction == 1 and hero.y <= self.sizeY-2:
            hero.y += 1
        elif direction == 2 and hero.x >= 1:
            hero.x -= 1
        elif direction == 3 and hero.x <= self.sizeX-2:
            hero.x += 1
        # No penalty is charged for any move, including one blocked by a wall.
        return 0.0

    def newPosition(self):
        """Return a random ``(x, y)`` cell not occupied by any current object.

        Raises ValueError (from ``np.random.choice``) when every cell is taken.
        """
        occupied = set((obj.x, obj.y) for obj in self.objects)
        # Candidate cells keep itertools.product order, exactly as before.
        points = [p for p in itertools.product(range(self.sizeX), range(self.sizeY))
                  if p not in occupied]
        # The sampling call is kept exactly as it was so seeded runs draw the
        # same random numbers.
        location = np.random.choice(range(len(points)),replace=False)
        return points[location]

    def checkGoal(self):
        """Resolve contact between the hero and the other objects.

        Returns ``(reward, done)``. When the hero shares a cell with another
        object, that object is removed, a new object of the same kind is added
        on a free cell, and the removed object's reward is returned. Otherwise
        the reward is 0.0. ``done`` is always False.
        """
        hero = None
        others = []
        for obj in self.objects:
            if obj.name == 'hero':
                hero = obj
            else:
                others.append(obj)
        if hero is None:
            # Nothing can be touched without a hero on the board.
            return 0.0,False
        for other in others:
            if hero.x == other.x and hero.y == other.y:
                self.objects.remove(other)
                # Respawn while the hero is still listed, so the replacement
                # cannot appear underneath it.
                if other.reward == 1:
                    self.objects.append(gameOb(self.newPosition(),1,1,1,1,'goal'))
                else:
                    self.objects.append(gameOb(self.newPosition(),1,1,0,-1,'fire'))
                return other.reward,False
        return 0.0,False

    def renderEnv(self):
        """Draw the world and return it as three resized planes stacked on axis 2.

        The canvas is the grid plus a one-cell border of ones; each object
        writes its ``intensity`` into its ``channel`` plane. With
        ``self.partial == True`` only the 3x3 window around the hero is kept.
        """
        a = np.ones([self.sizeY+2,self.sizeX+2,3])
        a[1:-1,1:-1,:] = 0
        hero = None
        for item in self.objects:
            # The +1 offsets skip the border row/column.
            a[item.y+1:item.y+item.size+1,item.x+1:item.x+item.size+1,item.channel] = item.intensity
            if item.name == 'hero':
                hero = item
        if self.partial == True:
            # In bordered coordinates the hero sits at (y+1, x+1), so this slice
            # is the 3x3 neighbourhood centred on it.
            a = a[hero.y:hero.y+3,hero.x:hero.x+3,:]
        # NOTE(review): scipy.misc.imresize is not available in SciPy 1.3 and
        # later - confirm the pinned SciPy version before upgrading.
        planes = [scipy.misc.imresize(a[:,:,plane],[84,84,1],interp='nearest')
                  for plane in range(3)]
        a = np.stack(planes,axis=2)
        return a

    def step(self,action):
        """Apply ``action`` and return ``(state, reward, done)``.

        ``reward`` is the contact reward from checkGoal() plus the movement
        penalty from moveChar().
        """
        penalty = self.moveChar(action)
        reward,done = self.checkGoal()
        state = self.renderEnv()
        return state,(reward+penalty),done

## helper.py
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy.misc
import os
import csv
import itertools
import tensorflow.contrib.slim as slim

#This is a simple function to reshape our game frames.
def processState(state1):
    """Flatten one game frame into a 1-D vector of 21168 values.

    21168 equals 84 * 84 * 3, the frame size used elsewhere in this file.
    Raises ValueError (from numpy) when the input does not hold exactly
    that many elements.
    """
    flat_length = 84 * 84 * 3
    return np.reshape(state1, (flat_length,))

#These functions allows us to update the parameters of our target network with those of the primary network.
def updateTargetGraph(tfVars,tau):
    """Build the assign ops that blend the first half of ``tfVars`` into the second.

    The k-th variable of the second half is assigned
    ``tau * first_half[k] + (1 - tau) * second_half[k]``. With an odd number
    of variables the last one is left untouched.

    tfVars -- sliceable sequence of objects exposing ``value()`` and ``assign()``.
              NOTE(review): assumes the primary network's variables come
              first and the target network's second - confirm against caller.
    tau    -- blend weight given to the first-half variable.

    Returns the list of values returned by ``assign``, in order.
    """
    half = len(tfVars)//2
    return [
        target.assign((source.value()*tau) + ((1-tau)*target.value()))
        for source, target in zip(tfVars[0:half], tfVars[half:])
    ]

def updateTarget(op_holder,sess):
    """Run every op in ``op_holder`` with ``sess``, then print a status line.

    After the ops have run, the first trainable variable and the one at the
    midpoint of ``tf.trainable_variables()`` are evaluated and compared to
    decide which message is printed.
    """
    for assign_op in op_holder:
        sess.run(assign_op)
    trainable = tf.trainable_variables()
    midpoint = len(trainable)//2
    first_values = trainable[0].eval(session=sess)
    midpoint_values = trainable[midpoint].eval(session=sess)
    # NOTE(review): this compares the two scalar `.all()` reductions (whether
    # every entry is non-zero), not the arrays element by element - confirm
    # that this is the intended check.
    if first_values.all() == midpoint_values.all():
        print("Target Set Success")
    else:
        print("Target Set Failed")

#Record performance metrics and episode logs for the Control Center.
def saveToCenter(i,rList,jList,bufferArray,summaryLength,h_size,sess,mainQN,time_per_step):
    """Write the summary row, two gifs and a per-step log for episode ``i``.

    Files written (all relative to the working directory):
      ./Center/log.csv               -- one summary row appended
      ./Center/frames/sal<i>.gif     -- normalised salience of each frame
      ./Center/frames/image<i>.gif   -- the episode's frames
      ./Center/frames/log<i>.csv     -- action, reward, advantages and value per step

    i             -- episode number, used in file names and the summary row.
    rList, jList  -- reward and length histories; the mean of the last
                     ``summaryLength`` entries of each is logged.
    bufferArray   -- 2-D object array with one row per step. Column 0 is fed
                     to the network as a flat frame, columns 1 and 2 are
                     logged as ACTION and REWARD, and column 3 of the last
                     row is appended as the final gif frame.
                     NOTE(review): presumably (state, action, reward,
                     next_state, done) rows - confirm against caller.
    summaryLength -- window length for the two running means.
    h_size        -- width of each zeroed recurrent-state array.
    sess, mainQN  -- session and network used to evaluate salience, recurrent
                     state, advantage and value.
    time_per_step -- seconds of gif time given to each frame.
    """
    with open('./Center/log.csv', 'a') as myfile:
        # Replay the episode one frame at a time from a zeroed recurrent state,
        # carrying the state forward between frames.
        state_display = (np.zeros([1,h_size]),np.zeros([1,h_size]))
        imagesS = []
        for frame in bufferArray[:,0]:
            img,state_display = sess.run([mainQN.salience,mainQN.rnn_state],
                feed_dict={mainQN.scalarInput:np.reshape(frame,[1,21168])/255.0,
                mainQN.trainLength:1,mainQN.state_in:state_display,mainQN.batch_size:1})
            imagesS.append(img)
        # Rescale all salience values to [0, 1] using the episode-wide extremes.
        imagesS = np.asarray(imagesS)
        lowest = np.min(imagesS)
        highest = np.max(imagesS)
        imagesS = (imagesS - lowest)/(highest - lowest)
        imagesS = np.vstack(imagesS)
        imagesS = np.resize(imagesS,[len(imagesS),84,84,3])
        # One value per pixel: the largest of its three channel values.
        luminance = np.max(imagesS,3)
        n_salience = len(imagesS)
        make_gif(np.ones([n_salience,84,84,3]),'./Center/frames/sal'+str(i)+'.gif',duration=n_salience*time_per_step,true_image=False,salience=True,salIMGS=luminance)

        # zip() returns an iterator on Python 3, so materialise it before append.
        images = list(zip(bufferArray[:,0]))
        images.append(bufferArray[-1,3])
        images = np.vstack(images)
        images = np.resize(images,[len(images),84,84,3])
        make_gif(images,'./Center/frames/image'+str(i)+'.gif',duration=len(images)*time_per_step,true_image=True,salience=False)

        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        # Both means use the same summaryLength window.
        wr.writerow([i,np.mean(jList[-summaryLength:]),np.mean(rList[-summaryLength:]),'./frames/image'+str(i)+'.gif','./frames/log'+str(i)+'.csv','./frames/sal'+str(i)+'.gif'])
    with open('./Center/frames/log'+str(i)+'.csv','w') as myfile:
        # Evaluate the whole episode as a single trace from a zeroed state.
        state_train = (np.zeros([1,h_size]),np.zeros([1,h_size]))
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerow(["ACTION","REWARD","A0","A1",'A2','A3','V'])
        a, v = sess.run([mainQN.Advantage,mainQN.Value],
            feed_dict={mainQN.scalarInput:np.vstack(bufferArray[:,0])/255.0,mainQN.trainLength:len(bufferArray),mainQN.state_in:state_train,mainQN.batch_size:1})
        wr.writerows(zip(bufferArray[:,1],bufferArray[:,2],a[:,0],a[:,1],a[:,2],a[:,3],v[:,0]))

#This code allows gifs to be saved of the training episode for use in the Control Center.
def make_gif(images, fname, duration=2, true_image=False,salience=False,salIMGS=None):
  """Write a sequence of frames to an animated GIF with moviepy.

  Args:
    images: Indexable sequence of frames. Its length also sets the frame
      rate of the output (``len(images) / duration``).
    fname: Path of the GIF file to write.
    duration: Clip duration handed to moviepy.
    true_image: When true, frames are cast to uint8 unchanged; otherwise
      they are mapped with ``(x + 1) / 2 * 255`` before the cast.
    salience: When true, the GIF is written from ``salIMGS`` (wrapped in a
      mask clip with opacity 0.1) instead of from ``images``.
    salIMGS: Indexable sequence of mask frames; only read when ``salience``
      is true.
  """
  import moviepy.editor as mpy

  def frame_at(frames, t):
    # A time at (or past) the end of the clip maps to an index one past the
    # last frame; fall back to the final frame in that case only, instead of
    # swallowing every exception.
    try:
      return frames[int(len(frames)/duration*t)]
    except IndexError:
      return frames[-1]

  def make_frame(t):
    x = frame_at(images, t)
    if true_image:
      return x.astype(np.uint8)
    return ((x+1)/2*255).astype(np.uint8)

  def make_mask(t):
    return frame_at(salIMGS, t)

  # The image clip is always constructed, exactly as before, even though the
  # salience branch writes the mask clip.
  clip = mpy.VideoClip(make_frame, duration=duration)
  if salience == True:
    mask = mpy.VideoClip(make_mask, ismask=True,duration= duration)
    mask = mask.set_opacity(0.1)
    mask.write_gif(fname, fps = len(images) / duration,verbose=False)
  else:
    clip.write_gif(fname, fps = len(images) / duration,verbose=False)
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Deep Recurrent Q-Network \n",
	"This notebook provides an example implementation of a Deep Recurrent Q-Network which can solve Partially Observable Markov Decision Processes. To learn more about DRQNs, see my blog post on them here: https://medium.com/p/68463e9aeefc .\n",
	"\n",
	"For more reinforcement learning tutorials, as well as the additional required `gridworld.py` and `helper.py`, see:\n",
	"https://github.com/awjuliani/DeepRL-Agents"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import random\n",
	"import tensorflow as tf\n",
	"import matplotlib.pyplot as plt\n",
	"import scipy.misc\n",
	"import os\n",
	"import csv\n",
	"import itertools\n",
	"import tensorflow.contrib.slim as slim\n",
	"%matplotlib inline\n",
	"\n",
	"from helper import *"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Load the game environment"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from gridworld import gameEnv"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Feel free to adjust the size of the gridworld. Making it smaller (adjusting `size`) provides an easier task for our DRQN agent, while making the world larger increases the challenge.\n",
	"\n",
	"Initializing the Gridworld with `True` limits the field of view, resulting in a partially observable MDP. Initializing it with `False` provides the agent with the entire environment, resulting in a fully observable MDP."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"env = gameEnv(partial=False,size=9)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"env = gameEnv(partial=True,size=9)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Above are examples of a starting environment in our simple game. The agent controls the blue square, and can move up, down, left, or right. The goal is to move to the green squares (for +1 reward) and avoid the red squares (for -1 reward). When the agent moves through a green or red square, it is randomly moved to a new place in the environment."
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Implementing the network itself"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"class Qnetwork():\n",
	" def __init__(self,h_size,rnn_cell,myScope):\n",
	" #The network recieves a frame from the game, flattened into an array.\n",
	" #It then resizes it and processes it through four convolutional layers.\n",
	" self.scalarInput = tf.placeholder(shape=[None,21168],dtype=tf.float32)\n",
	" self.imageIn = tf.reshape(self.scalarInput,shape=[-1,84,84,3])\n",
	" self.conv1 = slim.convolution2d( \\\n",
	" inputs=self.imageIn,num_outputs=32,\\\n",
	" kernel_size=[8,8],stride=[4,4],padding='VALID', \\\n",
	" biases_initializer=None,scope=myScope+'_conv1')\n",
	" self.conv2 = slim.convolution2d( \\\n",
	" inputs=self.conv1,num_outputs=64,\\\n",
	" kernel_size=[4,4],stride=[2,2],padding='VALID', \\\n",
	" biases_initializer=None,scope=myScope+'_conv2')\n",
	" self.conv3 = slim.convolution2d( \\\n",
	" inputs=self.conv2,num_outputs=64,\\\n",
	" kernel_size=[3,3],stride=[1,1],padding='VALID', \\\n",
	" biases_initializer=None,scope=myScope+'_conv3')\n",
	" self.conv4 = slim.convolution2d( \\\n",
	" inputs=self.conv3,num_outputs=h_size,\\\n",
	" kernel_size=[7,7],stride=[1,1],padding='VALID', \\\n",
	" biases_initializer=None,scope=myScope+'_conv4')\n",
	" \n",
	" self.trainLength = tf.placeholder(dtype=tf.int32)\n",
	" #We take the output from the final convolutional layer and send it to a recurrent layer.\n",
	" #The input must be reshaped into [batch x trace x units] for rnn processing, \n",
	" #and then returned to [batch x units] when sent through the upper levles.\n",
	" self.batch_size = tf.placeholder(dtype=tf.int32,shape=[])\n",
	" self.convFlat = tf.reshape(slim.flatten(self.conv4),[self.batch_size,self.trainLength,h_size])\n",
	" self.state_in = rnn_cell.zero_state(self.batch_size, tf.float32)\n",
	" self.rnn,self.rnn_state = tf.nn.dynamic_rnn(\\\n",
	" inputs=self.convFlat,cell=rnn_cell,dtype=tf.float32,initial_state=self.state_in,scope=myScope+'_rnn')\n",
	" self.rnn = tf.reshape(self.rnn,shape=[-1,h_size])\n",
	" #The output from the recurrent player is then split into separate Value and Advantage streams\n",
	" self.streamA,self.streamV = tf.split(self.rnn,2,1)\n",
	" self.AW = tf.Variable(tf.random_normal([h_size//2,4]))\n",
	" self.VW = tf.Variable(tf.random_normal([h_size//2,1]))\n",
	" self.Advantage = tf.matmul(self.streamA,self.AW)\n",
	" self.Value = tf.matmul(self.streamV,self.VW)\n",
	" \n",
	" self.salience = tf.gradients(self.Advantage,self.imageIn)\n",
	" #Then combine them together to get our final Q-values.\n",
	" self.Qout = self.Value + tf.subtract(self.Advantage,tf.reduce_mean(self.Advantage,axis=1,keep_dims=True))\n",
	" self.predict = tf.argmax(self.Qout,1)\n",
	" \n",
	" #Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.\n",
	" self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32)\n",
	" self.actions = tf.placeholder(shape=[None],dtype=tf.int32)\n",
	" self.actions_onehot = tf.one_hot(self.actions,4,dtype=tf.float32)\n",
	" \n",
	" self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)\n",
	" \n",
	" self.td_error = tf.square(self.targetQ - self.Q)\n",
	" \n",
	" #In order to only propogate accurate gradients through the network, we will mask the first\n",
	" #half of the losses for each trace as per Lample & Chatlot 2016\n",
	" self.maskA = tf.zeros([self.batch_size,self.trainLength//2])\n",
	" self.maskB = tf.ones([self.batch_size,self.trainLength//2])\n",
	" self.mask = tf.concat([self.maskA,self.maskB],1)\n",
	" self.mask = tf.reshape(self.mask,[-1])\n",
	" self.loss = tf.reduce_mean(self.td_error * self.mask)\n",
	" \n",
	" self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)\n",
	" self.updateModel = self.trainer.minimize(self.loss)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Experience Replay"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"These classes allow us to store experiences and sample them randomly to train the network.\n",
	"Episode buffer stores experiences for each individual episode.\n",
	"Experience buffer stores entire episodes of experience, and sample() allows us to get training batches needed from the network."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"class experience_buffer():\n",
	" def __init__(self, buffer_size = 1000):\n",
	" self.buffer = []\n",
	" self.buffer_size = buffer_size\n",
	" \n",
	" def add(self,experience):\n",
	" if len(self.buffer) + 1 >= self.buffer_size:\n",
	" self.buffer[0:(1+len(self.buffer))-self.buffer_size] = []\n",
	" self.buffer.append(experience)\n",
	" \n",
	" def sample(self,batch_size,trace_length):\n",
	" sampled_episodes = random.sample(self.buffer,batch_size)\n",
	" sampledTraces = []\n",
	" for episode in sampled_episodes:\n",
	" point = np.random.randint(0,len(episode)+1-trace_length)\n",
	" sampledTraces.append(episode[point:point+trace_length])\n",
	" sampledTraces = np.array(sampledTraces)\n",
	" return np.reshape(sampledTraces,[batch_size*trace_length,5])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Training the network"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#Setting the training parameters\n",
	"batch_size = 4 #How many experience traces to use for each training step.\n",
	"trace_length = 8 #How long each experience trace will be when training\n",
	"update_freq = 5 #How often to perform a training step.\n",
	"y = .99 #Discount factor on the target Q-values\n",
	"startE = 1 #Starting chance of random action\n",
	"endE = 0.1 #Final chance of random action\n",
	"anneling_steps = 10000 #How many steps of training to reduce startE to endE.\n",
	"num_episodes = 10000 #How many episodes of game environment to train network with.\n",
	"pre_train_steps = 10000 #How many steps of random actions before training begins.\n",
	"load_model = False #Whether to load a saved model.\n",
	"path = \"./drqn\" #The path to save our model to.\n",
	"h_size = 512 #The size of the final convolutional layer before splitting it into Advantage and Value streams.\n",
	"max_epLength = 50 #The max allowed length of our episode.\n",
	"time_per_step = 1 #Length of each step used in gif creation\n",
	"summaryLength = 100 #Number of epidoes to periodically save for analysis\n",
	"tau = 0.001"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"tf.reset_default_graph()\n",
	"#We define the cells for the primary and target q-networks\n",
	"cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,state_is_tuple=True)\n",
	"cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,state_is_tuple=True)\n",
	"mainQN = Qnetwork(h_size,cell,'main')\n",
	"targetQN = Qnetwork(h_size,cellT,'target')\n",
	"\n",
	"init = tf.global_variables_initializer()\n",
	"\n",
	"saver = tf.train.Saver(max_to_keep=5)\n",
	"\n",
	"trainables = tf.trainable_variables()\n",
	"\n",
	"targetOps = updateTargetGraph(trainables,tau)\n",
	"\n",
	"myBuffer = experience_buffer()\n",
	"\n",
	"#Set the rate of random action decrease. \n",
	"e = startE\n",
	"stepDrop = (startE - endE)/anneling_steps\n",
	"\n",
	"#create lists to contain total rewards and steps per episode\n",
	"jList = []\n",
	"rList = []\n",
	"total_steps = 0\n",
	"\n",
	"#Make a path for our model to be saved in.\n",
	"if not os.path.exists(path):\n",
	" os.makedirs(path)\n",
	"\n",
	"##Write the first line of the master log-file for the Control Center\n",
	"with open('./Center/log.csv', 'w') as myfile:\n",
	" wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)\n",
	" wr.writerow(['Episode','Length','Reward','IMG','LOG','SAL']) \n",
	" \n",
	"\n",
	"with tf.Session() as sess:\n",
	" if load_model == True:\n",
	" print ('Loading Model...')\n",
	" ckpt = tf.train.get_checkpoint_state(path)\n",
	" saver.restore(sess,ckpt.model_checkpoint_path)\n",
	" sess.run(init)\n",
	" \n",
	" updateTarget(targetOps,sess) #Set the target network to be equal to the primary network.\n",
	" for i in range(num_episodes):\n",
	" episodeBuffer = []\n",
	" #Reset environment and get first new observation\n",
	" sP = env.reset()\n",
	" s = processState(sP)\n",
	" d = False\n",
	" rAll = 0\n",
	" j = 0\n",
	" state = (np.zeros([1,h_size]),np.zeros([1,h_size])) #Reset the recurrent layer's hidden state\n",
	" #The Q-Network\n",
	" while j < max_epLength: \n",
	" j+=1\n",
	" #Choose an action by greedily (with e chance of random action) from the Q-network\n",
	" if np.random.rand(1) < e or total_steps < pre_train_steps:\n",
	" state1 = sess.run(mainQN.rnn_state,\\\n",
	" feed_dict={mainQN.scalarInput:[s/255.0],mainQN.trainLength:1,mainQN.state_in:state,mainQN.batch_size:1})\n",
	" a = np.random.randint(0,4)\n",
	" else:\n",
	" a, state1 = sess.run([mainQN.predict,mainQN.rnn_state],\\\n",
	" feed_dict={mainQN.scalarInput:[s/255.0],mainQN.trainLength:1,mainQN.state_in:state,mainQN.batch_size:1})\n",
	" a = a[0]\n",
	" s1P,r,d = env.step(a)\n",
	" s1 = processState(s1P)\n",
	" total_steps += 1\n",
	" episodeBuffer.append(np.reshape(np.array([s,a,r,s1,d]),[1,5]))\n",
	" if total_steps > pre_train_steps:\n",
	" if e > endE:\n",
	" e -= stepDrop\n",
	"\n",
	" if total_steps % (update_freq) == 0:\n",
	" updateTarget(targetOps,sess)\n",
	" #Reset the recurrent layer's hidden state\n",
	" state_train = (np.zeros([batch_size,h_size]),np.zeros([batch_size,h_size])) \n",
	" \n",
	" trainBatch = myBuffer.sample(batch_size,trace_length) #Get a random batch of experiences.\n",
	" #Below we perform the Double-DQN update to the target Q-values\n",
	" Q1 = sess.run(mainQN.predict,feed_dict={\\\n",
	" mainQN.scalarInput:np.vstack(trainBatch[:,3]/255.0),\\\n",
	" mainQN.trainLength:trace_length,mainQN.state_in:state_train,mainQN.batch_size:batch_size})\n",
	" Q2 = sess.run(targetQN.Qout,feed_dict={\\\n",
	" targetQN.scalarInput:np.vstack(trainBatch[:,3]/255.0),\\\n",
	" targetQN.trainLength:trace_length,targetQN.state_in:state_train,targetQN.batch_size:batch_size})\n",
	" end_multiplier = -(trainBatch[:,4] - 1)\n",
	" doubleQ = Q2[range(batch_size*trace_length),Q1]\n",
	" targetQ = trainBatch[:,2] + (y*doubleQ * end_multiplier)\n",
	" #Update the network with our target values.\n",
	" sess.run(mainQN.updateModel, \\\n",
	" feed_dict={mainQN.scalarInput:np.vstack(trainBatch[:,0]/255.0),mainQN.targetQ:targetQ,\\\n",
	" mainQN.actions:trainBatch[:,1],mainQN.trainLength:trace_length,\\\n",
	" mainQN.state_in:state_train,mainQN.batch_size:batch_size})\n",
	" rAll += r\n",
	" s = s1\n",
	" sP = s1P\n",
	" state = state1\n",
	" if d == True:\n",
	"\n",
	" break\n",
	"\n",
	" #Add the episode to the experience buffer\n",
	" bufferArray = np.array(episodeBuffer)\n",
	" episodeBuffer = list(zip(bufferArray))\n",
	" myBuffer.add(episodeBuffer)\n",
	" jList.append(j)\n",
	" rList.append(rAll)\n",
	"\n",
	" #Periodically save the model. \n",
	" if i % 1000 == 0 and i != 0:\n",
	" saver.save(sess,path+'/model-'+str(i)+'.cptk')\n",
	" print (\"Saved Model\")\n",
	" if len(rList) % summaryLength == 0 and len(rList) != 0:\n",
	" print (total_steps,np.mean(rList[-summaryLength:]), e)\n",
	" saveToCenter(i,rList,jList,np.reshape(np.array(episodeBuffer),[len(episodeBuffer),5]),\\\n",
	" summaryLength,h_size,sess,mainQN,time_per_step)\n",
	" saver.save(sess,path+'/model-'+str(i)+'.cptk')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"### Testing the network"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"e = 0.01 #The chance of chosing a random action\n",
	"num_episodes = 10000 #How many episodes of game environment to train network with.\n",
	"load_model = True #Whether to load a saved model.\n",
	"path = \"./drqn\" #The path to save/load our model to/from.\n",
	"h_size = 512 #The size of the final convolutional layer before splitting it into Advantage and Value streams.\n",
	"h_size = 512 #The size of the final convolutional layer before splitting it into Advantage and Value streams.\n",
	"max_epLength = 50 #The max allowed length of our episode.\n",
	"time_per_step = 1 #Length of each step used in gif creation\n",
	"summaryLength = 100 #Number of epidoes to periodically save for analysis"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"tf.reset_default_graph()\n",
	"cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,state_is_tuple=True)\n",
	"cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,state_is_tuple=True)\n",
	"mainQN = Qnetwork(h_size,cell,'main')\n",
	"targetQN = Qnetwork(h_size,cellT,'target')\n",
	"\n",
	"init = tf.global_variables_initializer()\n",
	"\n",
	"saver = tf.train.Saver(max_to_keep=2)\n",
	"\n",
	"#create lists to contain total rewards and steps per episode\n",
	"jList = []\n",
	"rList = []\n",
	"total_steps = 0\n",
	"\n",
	"#Make a path for our model to be saved in.\n",
	"if not os.path.exists(path):\n",
	" os.makedirs(path)\n",
	"\n",
	"##Write the first line of the master log-file for the Control Center\n",
	"with open('./Center/log.csv', 'w') as myfile:\n",
	" wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)\n",
	" wr.writerow(['Episode','Length','Reward','IMG','LOG','SAL']) \n",
	" \n",
	" #wr = csv.writer(open('./Center/log.csv', 'a'), quoting=csv.QUOTE_ALL)\n",
	"with tf.Session() as sess:\n",
	" if load_model == True:\n",
	" print ('Loading Model...')\n",
	" ckpt = tf.train.get_checkpoint_state(path)\n",
	" saver.restore(sess,ckpt.model_checkpoint_path)\n",
	" else:\n",
	" sess.run(init)\n",
	"\n",
	" \n",
	" for i in range(num_episodes):\n",
	" episodeBuffer = []\n",
	" #Reset environment and get first new observation\n",
	" sP = env.reset()\n",
	" s = processState(sP)\n",
	" d = False\n",
	" rAll = 0\n",
	" j = 0\n",
	" state = (np.zeros([1,h_size]),np.zeros([1,h_size]))\n",
	" #The Q-Network\n",
	" while j < max_epLength: #If the agent takes longer than 200 moves to reach either of the blocks, end the trial.\n",
	" j+=1\n",
	" #Choose an action by greedily (with e chance of random action) from the Q-network\n",
	" if np.random.rand(1) < e:\n",
	" state1 = sess.run(mainQN.rnn_state,\\\n",
	" feed_dict={mainQN.scalarInput:[s/255.0],mainQN.trainLength:1,mainQN.state_in:state,mainQN.batch_size:1})\n",
	" a = np.random.randint(0,4)\n",
	" else:\n",
	" a, state1 = sess.run([mainQN.predict,mainQN.rnn_state],\\\n",
	" feed_dict={mainQN.scalarInput:[s/255.0],mainQN.trainLength:1,\\\n",
	" mainQN.state_in:state,mainQN.batch_size:1})\n",
	" a = a[0]\n",
	" s1P,r,d = env.step(a)\n",
	" s1 = processState(s1P)\n",
	" total_steps += 1\n",
	" episodeBuffer.append(np.reshape(np.array([s,a,r,s1,d]),[1,5])) #Save the experience to our episode buffer.\n",
	" rAll += r\n",
	" s = s1\n",
	" sP = s1P\n",
	" state = state1\n",
	" if d == True:\n",
	"\n",
	" break\n",
	"\n",
	" bufferArray = np.array(episodeBuffer)\n",
	" jList.append(j)\n",
	" rList.append(rAll)\n",
	"\n",
	" #Periodically save the model. \n",
	" if len(rList) % summaryLength == 0 and len(rList) != 0:\n",
	" print (total_steps,np.mean(rList[-summaryLength:]), e)\n",
	" saveToCenter(i,rList,jList,np.reshape(np.array(episodeBuffer),[len(episodeBuffer),5]),\\\n",
	" summaryLength,h_size,sess,mainQN,time_per_step)\n",
	"print (\"Percent of succesful episodes: \" + str(sum(rList)/num_episodes) + \"%\")"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}
	import numpy as np
	import random
	import itertools
	import scipy.misc
	import matplotlib.pyplot as plt


	class gameOb():
	    """Plain record describing one object placed on the grid.

	    Attributes are stored exactly as passed in:
	      x, y      -- taken from ``coordinates[0]`` and ``coordinates[1]``.
	      size      -- extent of the object in grid cells (used by gameEnv.renderEnv).
	      intensity -- value written into the rendered image for this object.
	      channel   -- index of the image channel the object is drawn into.
	      reward    -- reward reported by gameEnv.checkGoal when the hero lands on it.
	      name      -- label; gameEnv treats the object named 'hero' as the agent.
	    """
	    def __init__(self,coordinates,size,intensity,channel,reward,name):
	        self.x = coordinates[0]
	        self.y = coordinates[1]
	        self.size = size
	        self.intensity = intensity
	        self.channel = channel
	        self.reward = reward
	        self.name = name

	class gameEnv():
	    """Square grid world with one hero, four 'goal' objects and two 'fire' objects.

	    The world is rendered as an 84x84x3 image. When ``partial`` equals True
	    the rendered view is cropped to the 3x3 window around the hero before
	    it is resized.
	    """
	    def __init__(self,partial,size):
	        """Create a ``size`` x ``size`` world, reset it and draw the first frame with plt.imshow."""
	        self.sizeX = size
	        self.sizeY = size
	        self.actions = 4
	        self.objects = []
	        self.partial = partial
	        a = self.reset()
	        plt.imshow(a,interpolation="nearest")

	    def reset(self):
	        """Respawn every object at a free random cell and return the rendered state."""
	        self.objects = []
	        # (channel, reward, name) in spawn order. Each object is appended
	        # before the next position is drawn, so no two objects share a cell.
	        spawn_order = [
	            (2,None,'hero'),
	            (1,1,'goal'),
	            (0,-1,'fire'),
	            (1,1,'goal'),
	            (0,-1,'fire'),
	            (1,1,'goal'),
	            (1,1,'goal'),
	        ]
	        for channel,reward,name in spawn_order:
	            self.objects.append(gameOb(self.newPosition(),1,1,channel,reward,name))
	        state = self.renderEnv()
	        self.state = state
	        return state

	    def moveChar(self,direction):
	        """Move the hero (``self.objects[0]``) one cell unless a wall blocks it.

	        ``direction``: 0 - up, 1 - down, 2 - left, 3 - right. Any other
	        value leaves the hero in place.

	        Returns the movement penalty, which is always 0.0: a blocked move
	        costs nothing in this version.
	        """
	        hero = self.objects[0]
	        if direction == 0 and hero.y >= 1:
	            hero.y -= 1
	        elif direction == 1 and hero.y <= self.sizeY-2:
	            hero.y += 1
	        elif direction == 2 and hero.x >= 1:
	            hero.x -= 1
	        elif direction == 3 and hero.x <= self.sizeX-2:
	            hero.x += 1
	        return 0.0

	    def newPosition(self):
	        """Return a random ``(x, y)`` cell that no current object occupies."""
	        occupied = set((obj.x,obj.y) for obj in self.objects)
	        points = [p for p in itertools.product(range(self.sizeX), range(self.sizeY))
	                  if p not in occupied]
	        # Same RNG call as before, so seeded runs draw the same positions.
	        location = np.random.choice(range(len(points)),replace=False)
	        return points[location]

	    def checkGoal(self):
	        """Resolve a collision between the hero and another object.

	        If the hero shares a cell with an object, that object is removed, a
	        new object of the same kind is spawned at a free cell, and the
	        removed object's reward is returned. Otherwise the reward is 0.0.

	        Returns ``(reward, done)``; ``done`` is always False.
	        """
	        others = []
	        for obj in self.objects:
	            if obj.name == 'hero':
	                hero = obj
	            else:
	                others.append(obj)
	        for other in others:
	            if hero.x == other.x and hero.y == other.y:
	                # Remove first: the hero still occupies this cell, so the
	                # replacement cannot be spawned underneath it.
	                self.objects.remove(other)
	                if other.reward == 1:
	                    self.objects.append(gameOb(self.newPosition(),1,1,1,1,'goal'))
	                else:
	                    self.objects.append(gameOb(self.newPosition(),1,1,0,-1,'fire'))
	                return other.reward,False
	        return 0.0,False

	    def renderEnv(self):
	        """Render the world as an 84x84x3 array (one resized plane per channel)."""
	        # One-cell border of ones around a zeroed playing field.
	        a = np.ones([self.sizeY+2,self.sizeX+2,3])
	        a[1:-1,1:-1,:] = 0
	        hero = None
	        for item in self.objects:
	            a[item.y+1:item.y+item.size+1,item.x+1:item.x+item.size+1,item.channel] = item.intensity
	            if item.name == 'hero':
	                hero = item
	        if self.partial == True:
	            # In padded coordinates the hero sits at (y+1, x+1), so this
	            # slice is the 3x3 window centred on it.
	            a = a[hero.y:hero.y+3,hero.x:hero.x+3,:]
	        # NOTE(review): scipy.misc.imresize is not shipped by recent SciPy
	        # releases; this needs a SciPy version that still provides it.
	        planes = [scipy.misc.imresize(a[:,:,k],[84,84,1],interp='nearest') for k in range(3)]
	        return np.stack(planes,axis=2)

	    def step(self,action):
	        """Apply ``action`` and return ``(state, reward, done)``."""
	        penalty = self.moveChar(action)
	        reward,done = self.checkGoal()
	        state = self.renderEnv()
	        return state,(reward+penalty),done
	import numpy as np
	import random
	import tensorflow as tf
	import matplotlib.pyplot as plt
	import scipy.misc
	import os
	import csv
	import itertools
	import tensorflow.contrib.slim as slim

	#This is a simple function to reshape our game frames.
	def processState(state1):
	    """Flatten one game frame into a 1-D array of 21168 values (84 * 84 * 3)."""
	    return np.reshape(state1,[21168])

	#These functions allows us to update the parameters of our target network with those of the primary network.
	def updateTargetGraph(tfVars,tau):
	    """Build the assign ops that move the target network toward the primary one.

	    ``tfVars`` is treated as two halves: the first ``len(tfVars)//2``
	    entries are the primary variables and the following ``len(tfVars)//2``
	    entries are the matching target variables. For each pair the op assigns

	        target <- tau * primary + (1 - tau) * target

	    Returns the list of assign ops, one per pair. The ops are only built
	    here, not run.
	    """
	    half = len(tfVars)//2
	    op_holder = []
	    for primary,target in zip(tfVars[0:half], tfVars[half:2*half]):
	        blended = (primary.value()*tau) + ((1-tau)*target.value())
	        op_holder.append(target.assign(blended))
	    return op_holder

	def updateTarget(op_holder,sess):
	    """Run every op in ``op_holder`` with ``sess``, then print a status line.

	    The status compares the first trainable variable with the trainable
	    variable at index ``total_vars//2`` and prints "Target Set Success" or
	    "Target Set Failed".
	    """
	    for op in op_holder:
	        sess.run(op)
	    trainables = tf.trainable_variables()
	    total_vars = len(trainables)
	    a = trainables[0].eval(session=sess)
	    b = trainables[total_vars//2].eval(session=sess)
	    # NOTE(review): ``a.all() == b.all()`` compares two single booleans
	    # ("are all entries non-zero?"), not the arrays element by element, so
	    # it does not prove the variables are equal. It is left unchanged here
	    # because exact equality is not expected after a soft update with
	    # tau < 1 -- confirm what this message is meant to report.
	    if a.all() == b.all():
	        print("Target Set Success")
	    else:
	        print("Target Set Failed")

	#Record performance metrics and episode logs for the Control Center.
	def saveToCenter(i,rList,jList,bufferArray,summaryLength,h_size,sess,mainQN,time_per_step):
	    """Write the Control Center artifacts for episode ``i``.

	    Side effects (all paths are relative to the working directory):
	      * appends one summary row to ``./Center/log.csv``;
	      * writes ``./Center/frames/sal<i>.gif`` from the network's salience output;
	      * writes ``./Center/frames/image<i>.gif`` from the episode frames;
	      * writes ``./Center/frames/log<i>.csv`` with the action, reward,
	        advantage and value of every step.

	    Args:
	      i: Episode index, used in file names and in the summary row.
	      rList: Per-episode rewards; the mean of the last ``summaryLength`` is logged.
	      jList: Per-episode step counts; the mean of the last 100 is logged.
	      bufferArray: 2-D array with one row per step. Column 0 is fed to the
	        network as the frame, columns 1 and 2 are written under the
	        ACTION and REWARD headers, and column 3 of the last row is
	        appended as the final frame of the episode GIF.
	      summaryLength: Window length for the reward mean.
	      h_size: Width of the zero-initialised recurrent state tuples.
	      sess: Session used to evaluate the network.
	      mainQN: Network object providing ``salience``, ``rnn_state``,
	        ``Advantage``, ``Value`` and the input placeholders.
	      time_per_step: Multiplied by the frame count to get each GIF's duration.
	    """
	    with open('./Center/log.csv', 'a') as myfile:
	        state_display = (np.zeros([1,h_size]),np.zeros([1,h_size]))
	        imagesS = []
	        # Feed the frames one at a time, carrying the recurrent state forward.
	        for idx,z in enumerate(np.vstack(bufferArray[:,0])):
	            img,state_display = sess.run([mainQN.salience,mainQN.rnn_state],\
	                feed_dict={mainQN.scalarInput:np.reshape(bufferArray[idx,0],[1,21168])/255.0,\
	                mainQN.trainLength:1,mainQN.state_in:state_display,mainQN.batch_size:1})
	            imagesS.append(img)
	        # Min-max normalise the salience values over the whole episode.
	        imagesS = (imagesS - np.min(imagesS))/(np.max(imagesS) - np.min(imagesS))
	        imagesS = np.vstack(imagesS)
	        imagesS = np.resize(imagesS,[len(imagesS),84,84,3])
	        luminance = np.max(imagesS,3)
	        imagesS = np.multiply(np.ones([len(imagesS),84,84,3]),np.reshape(luminance,[len(imagesS),84,84,1]))
	        make_gif(np.ones([len(imagesS),84,84,3]),'./Center/frames/sal'+str(i)+'.gif',duration=len(imagesS)*time_per_step,true_image=False,salience=True,salIMGS=luminance)

	        # zip() returns an iterator on Python 3, which has no append();
	        # materialise it as a list first.
	        images = list(zip(bufferArray[:,0]))
	        images.append(bufferArray[-1,3])
	        images = np.vstack(images)
	        images = np.resize(images,[len(images),84,84,3])
	        make_gif(images,'./Center/frames/image'+str(i)+'.gif',duration=len(images)*time_per_step,true_image=True,salience=False)

	        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
	        # NOTE(review): the step-count mean uses a fixed window of 100 while
	        # the reward mean uses summaryLength -- confirm this is intended.
	        wr.writerow([i,np.mean(jList[-100:]),np.mean(rList[-summaryLength:]),'./frames/image'+str(i)+'.gif','./frames/log'+str(i)+'.csv','./frames/sal'+str(i)+'.gif'])
	    with open('./Center/frames/log'+str(i)+'.csv','w') as myfile:
	        state_train = (np.zeros([1,h_size]),np.zeros([1,h_size]))
	        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
	        wr.writerow(["ACTION","REWARD","A0","A1",'A2','A3','V'])
	        # Evaluate the whole episode as a single trace starting from a zero state.
	        a, v = sess.run([mainQN.Advantage,mainQN.Value],\
	            feed_dict={mainQN.scalarInput:np.vstack(bufferArray[:,0])/255.0,mainQN.trainLength:len(bufferArray),mainQN.state_in:state_train,mainQN.batch_size:1})
	        wr.writerows(zip(bufferArray[:,1],bufferArray[:,2],a[:,0],a[:,1],a[:,2],a[:,3],v[:,0]))

	#This code allows gifs to be saved of the training episode for use in the Control Center.
	def make_gif(images, fname, duration=2, true_image=False,salience=False,salIMGS=None):
	    """Write a sequence of frames to an animated GIF with moviepy.

	    Args:
	      images: Indexable sequence of frames. Its length also sets the frame
	        rate of the output (``len(images) / duration``).
	      fname: Path of the GIF file to write.
	      duration: Clip duration handed to moviepy.
	      true_image: When true, frames are cast to uint8 unchanged; otherwise
	        they are mapped with ``(x + 1) / 2 * 255`` before the cast.
	      salience: When true, the GIF is written from ``salIMGS`` (wrapped in
	        a mask clip with opacity 0.1) instead of from ``images``.
	      salIMGS: Indexable sequence of mask frames; only read when
	        ``salience`` is true.
	    """
	    import moviepy.editor as mpy

	    def frame_at(frames, t):
	        # A time at (or past) the end of the clip maps to an index one past
	        # the last frame; fall back to the final frame in that case only,
	        # instead of swallowing every exception.
	        try:
	            return frames[int(len(frames)/duration*t)]
	        except IndexError:
	            return frames[-1]

	    def make_frame(t):
	        x = frame_at(images, t)
	        if true_image:
	            return x.astype(np.uint8)
	        return ((x+1)/2*255).astype(np.uint8)

	    def make_mask(t):
	        return frame_at(salIMGS, t)

	    # The image clip is always constructed, exactly as before, even though
	    # the salience branch writes the mask clip.
	    clip = mpy.VideoClip(make_frame, duration=duration)
	    if salience == True:
	        mask = mpy.VideoClip(make_mask, ismask=True,duration= duration)
	        mask = mask.set_opacity(0.1)
	        mask.write_gif(fname, fps = len(images) / duration,verbose=False)
	    else:
	        clip.write_gif(fname, fps = len(images) / duration,verbose=False)