pat-coady/jacks_car_rental_policy_iteration.ipynb

## jacks_car_rental_policy_iteration.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Jack's Car Rental: Policy Iteration\n",
    "Example 4.2 from *Reinforcement Learning: An Introduction* by Sutton and Barto.\n",
    "\n",
    "This example from Sutton and Barto's excellent book is trickier to implement than you might think. I've built 2 versions: one using policy iteration, and the other with value iteration.\n",
    "\n",
    "For policy iteration, note how policy evaluation becomes shorter and shorter as the policy converges. It is also interesting to note that value iteration is slower than policy iteration in this example. Value iteration must check all possible actions from every state in each iteration.\n",
    "\n",
    "Both methods converge to the result shown in the textbook.\n",
    "\n",
    "Python Notebook by Patrick Coady: [Learning Artificial Intelligence](https://learningai.io/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import math\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def clipped_poisson(lam, max_k):\n",
    "    \"\"\"\n",
    "    Return poisson PMF clipped at max_k with remaining tail probability\n",
    "    placed at max_k.\n",
    "    \"\"\"\n",
    "    pmf = np.zeros(max_k + 1)\n",
    "    for k in range(max_k):\n",
    "        pmf[k] = math.exp(-lam) * lam**k / math.factorial(k)\n",
    "    pmf[max_k] = 1 - np.sum(pmf)\n",
    "    \n",
    "    return pmf       "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def build_rent_return_pmf(lambda_request, lambda_return, max_cars):\n",
    "    \"\"\"\n",
    "    Return p(new_rentals, returns | initial_cars) as numpy array:\n",
    "        p[initial_cars, new_rentals, returns]\n",
    "    \"\"\"\n",
    "    pmf = np.zeros((max_cars+1, max_cars+1, max_cars+1))\n",
    "    \n",
    "    for init_cars in range(max_cars + 1):\n",
    "        new_rentals_pmf = clipped_poisson(lambda_request, init_cars)\n",
    "        for new_rentals in range(init_cars + 1):\n",
    "            max_returns = max_cars - init_cars + new_rentals\n",
    "            returns_pmf = clipped_poisson(lambda_return, max_returns)\n",
    "            for returns in range(max_returns + 1):\n",
    "                p = returns_pmf[returns] * new_rentals_pmf[new_rentals]\n",
    "                pmf[init_cars, new_rentals, returns] = p\n",
    "                \n",
    "    return pmf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class JacksWorld(object):\n",
    "    \"\"\"Environment model of Jack's Car Rental\"\"\"\n",
    "    def __init__(self, lambda_return1, lambda_return2,\n",
    "                 lambda_request1, lambda_request2, max_cars):\n",
    "        # pre-build the rentals/returns pmf for each location\n",
    "        self.rent_return_pmf = []\n",
    "        self.rent_return_pmf.append(build_rent_return_pmf(lambda_request1,\n",
    "                                                      lambda_return1,\n",
    "                                                      max_cars))\n",
    "        self.rent_return_pmf.append(build_rent_return_pmf(lambda_request2,\n",
    "                                                      lambda_return2,\n",
    "                                                      max_cars))\n",
    "        self.max_cars = max_cars\n",
    "        \n",
    "    def get_transition_model(self, s, a):\n",
    "        \"\"\"\n",
    "        Return 2-tuple:\n",
    "            1. p(s'| s, a) as dictionary:\n",
    "                keys = s'\n",
    "                values = p(s' | s, a)\n",
    "            2. E(r | s, a, s') as dictionary:\n",
    "                keys = s'\n",
    "                values = E(r | s, a, s')\n",
    "        \"\"\"\n",
    "        s = (s[0] - a, s[1] + a)         # move a cars from loc1 to loc2\n",
    "        move_reward = -math.fabs(a) * 2  # ($2) per car moved\n",
    "        t_prob, expected_r = ([{}, {}], [{}, {}])\n",
    "        for loc in range(2):\n",
    "            morning_cars = s[loc]\n",
    "            rent_return_pmf = self.rent_return_pmf[loc]\n",
    "            for rents in range(morning_cars + 1):\n",
    "                max_returns = self.max_cars - morning_cars + rents\n",
    "                for returns in range(max_returns + 1):\n",
    "                    p = rent_return_pmf[morning_cars, rents, returns]\n",
    "                    if p < 1e-5:\n",
    "                        continue\n",
    "                    s_prime = morning_cars - rents + returns\n",
    "                    r = rents * 10\n",
    "                    t_prob[loc][s_prime] = t_prob[loc].get(s_prime, 0) + p\n",
    "                    expected_r[loc][s_prime] = expected_r[loc].get(s_prime, 0) + p * r\n",
    "        \n",
    "        # join probabilities and expectations from loc1 and loc2\n",
    "        t_model, r_model = ({}, {})\n",
    "        for s_prime1 in t_prob[0]:\n",
    "            for s_prime2 in t_prob[1]:\n",
    "                p1 = t_prob[0][s_prime1]\n",
    "                p2 = t_prob[1][s_prime2]\n",
    "                t_model[(s_prime1, s_prime2)] = p1 * p2\n",
    "                # expectation of reward calculated using p(s', r | s, a)\n",
    "                # need to normalize by p(s' | s, a) to get E(r | s, a, s')\n",
    "                norm_E1 = expected_r[0][s_prime1] / p1\n",
    "                norm_E2 = expected_r[1][s_prime2] / p2\n",
    "                r_model[(s_prime1, s_prime2)] = norm_E1 + norm_E2 + move_reward\n",
    "                \n",
    "        return t_model, r_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Initialize environment\n",
    "\n",
    "max_cars = 20\n",
    "jacks = JacksWorld(lambda_return1=3, lambda_return2=2,\n",
    "                 lambda_request1=3, lambda_request2=4, max_cars=max_cars)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Policy at iteration k=0:\n",
      "[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]\n",
      "\n",
      "Policy evaluation k=0. Worst V(s) delta:\n",
      "Iteration 0: max delta = 193.25\n",
      "Iteration 1: max delta = 133.91\n",
      "Iteration 2: max delta = 90.37\n",
      "Iteration 3: max delta = 67.56\n",
      "Iteration 4: max delta = 53.54\n",
      "Iteration 5: max delta = 41.68\n",
      "Iteration 6: max delta = 32.68\n",
      "Iteration 7: max delta = 26.05\n",
      "Iteration 8: max delta = 21.76\n",
      "Iteration 9: max delta = 18.30\n",
      "Iteration 10: max delta = 15.35\n",
      "Iteration 11: max delta = 12.84\n",
      "Iteration 12: max delta = 10.72\n",
      "Iteration 13: max delta = 8.93\n",
      "Iteration 14: max delta = 7.43\n",
      "Iteration 15: max delta = 6.17\n",
      "Iteration 16: max delta = 5.11\n",
      "Iteration 17: max delta = 4.23\n",
      "Iteration 18: max delta = 3.50\n",
      "Iteration 19: max delta = 2.89\n",
      "Iteration 20: max delta = 2.39\n",
      "Iteration 21: max delta = 1.97\n",
      "Iteration 22: max delta = 1.62\n",
      "Iteration 23: max delta = 1.34\n",
      "Iteration 24: max delta = 1.10\n",
      "Iteration 25: max delta = 0.91\n",
      "Iteration 26: max delta = 0.74\n",
      "Iteration 27: max delta = 0.61\n",
      "Iteration 28: max delta = 0.50\n",
      "Iteration 29: max delta = 0.41\n",
      "\n",
      "Policy at iteration k=1:\n",
      "[[ 0  0  0  0  0  0  0  0 -1 -1 -1 -2 -2 -2 -2 -3 -3 -3 -3 -3 -4]\n",
      " [ 0  0  0  0  0  0  0  0  0  0 -1 -1 -1 -1 -2 -2 -2 -2 -2 -3 -3]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0  0  0 -1 -1 -1 -1 -1 -2 -2 -2]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1 -1 -1 -1]\n",
      " [ 1  1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1]\n",
      " [ 2  2  2  1  1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 3  3  2  2  2  2  1  1  1  1  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 4  3  3  3  3  2  2  2  2  1  1  1  0  0  0  0  0  0  0  0  0]\n",
      " [ 4  4  4  4  3  3  3  3  2  2  2  1  1  1  1  0  0  0  0  0  0]\n",
      " [ 5  5  5  4  4  4  4  3  3  3  2  2  2  2  1  1  1  0  0  0  0]\n",
      " [ 5  5  5  5  5  5  4  4  4  3  3  3  3  2  2  2  1  1  0  0  0]\n",
      " [ 5  5  5  5  5  5  5  5  4  4  4  4  3  3  3  2  2  1  1  0  0]\n",
      " [ 5  5  5  5  5  5  5  5  5  5  5  4  4  4  3  3  2  2  1  0  0]\n",
      " [ 5  5  5  5  5  5  5  5  5  5  5  5  5  4  4  3  3  2  1  1  0]\n",
      " [ 5  5  5  5  5  5  5  5  5  5  5  5  5  5  4  4  3  2  2  1  0]\n",
      " [ 5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  4  3  3  2  1  0]\n",
      " [ 5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  4  4  3  2  1  0]\n",
      " [ 5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  4  3  2  1  0]\n",
      " [ 5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  4  3  2  1  0]\n",
      " [ 5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  4  3  2  1  0]\n",
      " [ 5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  4  3  2  1  0]]\n",
      "\n",
      "Policy evaluation k=1. Worst V(s) delta:\n",
      "Iteration 0: max delta = 64.72\n",
      "Iteration 1: max delta = 4.95\n",
      "Iteration 2: max delta = 2.26\n",
      "Iteration 3: max delta = 1.73\n",
      "Iteration 4: max delta = 1.49\n",
      "Iteration 5: max delta = 1.24\n",
      "Iteration 6: max delta = 1.02\n",
      "Iteration 7: max delta = 0.83\n",
      "Iteration 8: max delta = 0.68\n",
      "Iteration 9: max delta = 0.56\n",
      "Iteration 10: max delta = 0.46\n",
      "\n",
      "Policy at iteration k=2:\n",
      "[[ 0  0  0  0  0  0  0 -1 -1 -1 -2 -2 -2 -3 -3 -3 -3 -3 -3 -4 -4]\n",
      " [ 0  0  0  0  0  0  0  0  0 -1 -1 -1 -2 -2 -2 -2 -2 -2 -3 -3 -3]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0 -1 -1 -1 -1 -1 -1 -2 -2 -2 -2]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1 -1 -1 -1 -1]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1]\n",
      " [ 1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 2  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 3  2  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 3  3  2  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 4  3  3  2  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 4  4  3  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  4  4  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  4  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  4  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  4  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  4  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  4  3  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  4  3  2  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  4  3  3  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  4  4  3  2  2  1  1  1  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  5  4  3  3  2  2  2  1  1  1  1  1  1  1  1  1  0  0  0]]\n",
      "\n",
      "Policy evaluation k=2. Worst V(s) delta:\n",
      "Iteration 0: max delta = 4.41\n",
      "Iteration 1: max delta = 2.99\n",
      "Iteration 2: max delta = 2.08\n",
      "Iteration 3: max delta = 1.51\n",
      "Iteration 4: max delta = 1.06\n",
      "Iteration 5: max delta = 0.73\n",
      "Iteration 6: max delta = 0.50\n",
      "\n",
      "Policy at iteration k=3:\n",
      "[[ 0  0  0  0  0  0  0  0 -1 -1 -2 -2 -2 -3 -3 -3 -3 -3 -4 -4 -4]\n",
      " [ 0  0  0  0  0  0  0  0  0 -1 -1 -1 -2 -2 -2 -2 -2 -3 -3 -3 -3]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0 -1 -1 -1 -1 -1 -2 -2 -2 -2 -2]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1 -1 -1 -1 -1 -2]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1 -1]\n",
      " [ 1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 2  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 3  2  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 3  3  2  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 4  3  3  2  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 4  4  3  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  4  4  3  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  4  3  2  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  4  3  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  4  4  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  5  4  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  5  4  3  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  5  4  3  2  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  5  4  3  3  2  2  1  1  1  1  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  5  4  4  3  3  2  2  2  2  1  1  1  1  1  0  0  0  0  0]\n",
      " [ 5  5  5  5  4  4  3  3  3  3  2  2  2  2  2  1  1  1  0  0  0]]\n",
      "\n",
      "Policy evaluation k=3. Worst V(s) delta:\n",
      "Iteration 0: max delta = 0.70\n",
      "Iteration 1: max delta = 0.34\n",
      "\n",
      "Policy at iteration k=4:\n",
      "[[ 0  0  0  0  0  0  0  0 -1 -1 -2 -2 -2 -3 -3 -3 -3 -3 -4 -4 -4]\n",
      " [ 0  0  0  0  0  0  0  0  0 -1 -1 -1 -2 -2 -2 -2 -2 -3 -3 -3 -3]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0 -1 -1 -1 -1 -1 -2 -2 -2 -2 -2]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1 -1 -1 -1 -1 -2]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1 -1]\n",
      " [ 1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 2  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 3  2  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 3  3  2  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 4  3  3  2  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 4  4  3  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  4  4  3  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  4  3  2  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  4  3  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  4  4  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  5  4  3  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  5  4  3  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  5  4  3  2  2  1  1  0  0  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  5  4  3  3  2  2  1  1  1  0  0  0  0  0  0  0  0  0  0]\n",
      " [ 5  5  5  4  4  3  3  2  2  2  1  1  1  1  1  0  0  0  0  0  0]\n",
      " [ 5  5  5  5  4  4  3  3  3  2  2  2  2  2  1  1  1  0  0  0  0]]\n",
      "\n",
      "Policy evaluation k=4. Worst V(s) delta:\n",
      "Iteration 0: max delta = 0.23\n",
      "\n",
      "--- 27.95 seconds ---\n"
     ]
    }
   ],
   "source": [
    "# Initialize value function and policy\n",
    "\n",
    "pi = np.zeros((max_cars+1, max_cars+1), dtype=np.int16)\n",
    "V = np.zeros((max_cars+1, max_cars+1))\n",
    "states = [(s0, s1) for s0 in range(max_cars+1) for s1 in range(max_cars+1)]\n",
    "gamma = 0.9\n",
    "\n",
    "start_time = time.time()\n",
    "# Policy Iteration (using iterative policy evaluation)\n",
    "for k in range(100):\n",
    "    print('\\nPolicy at iteration k={}:'.format(k))\n",
    "    print(pi)\n",
    "    print('\\nPolicy evaluation k={}. Worst V(s) delta:'.format(k))\n",
    "    # Policy Evaluation\n",
    "    theta = 0.5\n",
    "    for i in range(100):\n",
    "        delta = 0\n",
    "        for s in states:\n",
    "            v = V[s]\n",
    "            t_model, r_model = jacks.get_transition_model(s, pi[s])\n",
    "            v_new = 0\n",
    "            for s_prime in t_model:\n",
    "                p = t_model[s_prime]\n",
    "                r = r_model[s_prime]\n",
    "                v_new += p * (gamma * V[s_prime] + r)\n",
    "            V[s] = v_new\n",
    "            delta = max(delta, abs(v_new - v))\n",
    "        print('Iteration {}: max delta = {:.2f}'.format(i, delta))\n",
    "        if delta < theta: break\n",
    "\n",
    "    # Policy Iteration\n",
    "    stable = True\n",
    "    for s in states:\n",
    "        old_action = pi[s]\n",
    "        best_v = -1000\n",
    "        max_a = min(5, s[0], max_cars-s[1])\n",
    "        min_a = max(-5, -s[1], -(max_cars-s[0]))\n",
    "        for a in range(min_a, max_a+1):\n",
    "            t_model, r_model = jacks.get_transition_model(s, a)\n",
    "            v = 0\n",
    "            for s_prime in t_model:\n",
    "                p = t_model[s_prime]\n",
    "                r = r_model[s_prime]\n",
    "                v += p * (gamma * V[s_prime] + r)\n",
    "            if v > best_v:\n",
    "                pi[s] = a\n",
    "                best_v = v\n",
    "        if pi[s] != old_action:\n",
    "            stable = False\n",
    "    if stable: break\n",
    "\n",
    "print(\"\\n--- {:.2f} seconds ---\".format(time.time() - start_time))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Jack's Car Rental: Policy Iteration\n",
	"Example 4.2 from Reinforcement Learning: An Introduction by Sutton and Barto.\n",
	"\n",
	"This example from Sutton and Barto's excellent book is trickier to implement than you might think. I've built 2 versions: one using policy iteration, and the other with value iteration.\n",
	"\n",
	"For policy iteration, note how policy evaluation becomes shorter and shorter as the policy converges. It is also interesting to note that value iteration is slower than policy iteration in this example. Value iteration must check all possible actions from every state in each iteration.\n",
	"\n",
	"Both methods converge to the result shown in the textbook.\n",
	"\n",
	"Python Notebook by Patrick Coady: [Learning Artificial Intelligence](https://learningai.io/)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import math\n",
	"import time"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def clipped_poisson(lam, max_k):\n",
	" \"\"\"\n",
	" Return poisson PMF clipped at max_k with remaining tail probability\n",
	" placed at max_k.\n",
	" \"\"\"\n",
	" pmf = np.zeros(max_k + 1)\n",
	" for k in range(max_k):\n",
	" pmf[k] = math.exp(-lam) * lam**k / math.factorial(k)\n",
	" pmf[max_k] = 1 - np.sum(pmf)\n",
	" \n",
	" return pmf "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def build_rent_return_pmf(lambda_request, lambda_return, max_cars):\n",
	" \"\"\"\n",
	" Return p(new_rentals, returns \| initial_cars) as numpy array:\n",
	" p[initial_cars, new_rentals, returns]\n",
	" \"\"\"\n",
	" pmf = np.zeros((max_cars+1, max_cars+1, max_cars+1))\n",
	" \n",
	" for init_cars in range(max_cars + 1):\n",
	" new_rentals_pmf = clipped_poisson(lambda_request, init_cars)\n",
	" for new_rentals in range(init_cars + 1):\n",
	" max_returns = max_cars - init_cars + new_rentals\n",
	" returns_pmf = clipped_poisson(lambda_return, max_returns)\n",
	" for returns in range(max_returns + 1):\n",
	" p = returns_pmf[returns] * new_rentals_pmf[new_rentals]\n",
	" pmf[init_cars, new_rentals, returns] = p\n",
	" \n",
	" return pmf"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"class JacksWorld(object):\n",
	" \"\"\"Environment model of Jack's Car Rental\"\"\"\n",
	" def __init__(self, lambda_return1, lambda_return2,\n",
	" lambda_request1, lambda_request2, max_cars):\n",
	" # pre-build the rentals/returns pmf for each location\n",
	" self.rent_return_pmf = []\n",
	" self.rent_return_pmf.append(build_rent_return_pmf(lambda_request1,\n",
	" lambda_return1,\n",
	" max_cars))\n",
	" self.rent_return_pmf.append(build_rent_return_pmf(lambda_request2,\n",
	" lambda_return2,\n",
	" max_cars))\n",
	" self.max_cars = max_cars\n",
	" \n",
	" def get_transition_model(self, s, a):\n",
	" \"\"\"\n",
	" Return 2-tuple:\n",
	" 1. p(s'\| s, a) as dictionary:\n",
	" keys = s'\n",
	" values = p(s' \| s, a)\n",
	" 2. E(r \| s, a, s') as dictionary:\n",
	" keys = s'\n",
	" values = E(r \| s, a, s')\n",
	" \"\"\"\n",
	" s = (s[0] - a, s[1] + a) # move a cars from loc1 to loc2\n",
	" move_reward = -math.fabs(a) * 2 # ($2) per car moved\n",
	" t_prob, expected_r = ([{}, {}], [{}, {}])\n",
	" for loc in range(2):\n",
	" morning_cars = s[loc]\n",
	" rent_return_pmf = self.rent_return_pmf[loc]\n",
	" for rents in range(morning_cars + 1):\n",
	" max_returns = self.max_cars - morning_cars + rents\n",
	" for returns in range(max_returns + 1):\n",
	" p = rent_return_pmf[morning_cars, rents, returns]\n",
	" if p < 1e-5:\n",
	" continue\n",
	" s_prime = morning_cars - rents + returns\n",
	" r = rents * 10\n",
	" t_prob[loc][s_prime] = t_prob[loc].get(s_prime, 0) + p\n",
	" expected_r[loc][s_prime] = expected_r[loc].get(s_prime, 0) + p * r\n",
	" \n",
	" # join probabilities and expectations from loc1 and loc2\n",
	" t_model, r_model = ({}, {})\n",
	" for s_prime1 in t_prob[0]:\n",
	" for s_prime2 in t_prob[1]:\n",
	" p1 = t_prob[0][s_prime1]\n",
	" p2 = t_prob[1][s_prime2]\n",
	" t_model[(s_prime1, s_prime2)] = p1 * p2\n",
	" # expectation of reward calculated using p(s', r \| s, a)\n",
	" # need to normalize by p(s' \| s, a) to get E(r \| s, a, s')\n",
	" norm_E1 = expected_r[0][s_prime1] / p1\n",
	" norm_E2 = expected_r[1][s_prime2] / p2\n",
	" r_model[(s_prime1, s_prime2)] = norm_E1 + norm_E2 + move_reward\n",
	" \n",
	" return t_model, r_model"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Initialize environment\n",
	"\n",
	"max_cars = 20\n",
	"jacks = JacksWorld(lambda_return1=3, lambda_return2=2,\n",
	" lambda_request1=3, lambda_request2=4, max_cars=max_cars)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"\n",
	"Policy at iteration k=0:\n",
	"[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]\n",
	"\n",
	"Policy evaluation k=0. Worst V(s) delta:\n",
	"Iteration 0: max delta = 193.25\n",
	"Iteration 1: max delta = 133.91\n",
	"Iteration 2: max delta = 90.37\n",
	"Iteration 3: max delta = 67.56\n",
	"Iteration 4: max delta = 53.54\n",
	"Iteration 5: max delta = 41.68\n",
	"Iteration 6: max delta = 32.68\n",
	"Iteration 7: max delta = 26.05\n",
	"Iteration 8: max delta = 21.76\n",
	"Iteration 9: max delta = 18.30\n",
	"Iteration 10: max delta = 15.35\n",
	"Iteration 11: max delta = 12.84\n",
	"Iteration 12: max delta = 10.72\n",
	"Iteration 13: max delta = 8.93\n",
	"Iteration 14: max delta = 7.43\n",
	"Iteration 15: max delta = 6.17\n",
	"Iteration 16: max delta = 5.11\n",
	"Iteration 17: max delta = 4.23\n",
	"Iteration 18: max delta = 3.50\n",
	"Iteration 19: max delta = 2.89\n",
	"Iteration 20: max delta = 2.39\n",
	"Iteration 21: max delta = 1.97\n",
	"Iteration 22: max delta = 1.62\n",
	"Iteration 23: max delta = 1.34\n",
	"Iteration 24: max delta = 1.10\n",
	"Iteration 25: max delta = 0.91\n",
	"Iteration 26: max delta = 0.74\n",
	"Iteration 27: max delta = 0.61\n",
	"Iteration 28: max delta = 0.50\n",
	"Iteration 29: max delta = 0.41\n",
	"\n",
	"Policy at iteration k=1:\n",
	"[[ 0 0 0 0 0 0 0 0 -1 -1 -1 -2 -2 -2 -2 -3 -3 -3 -3 -3 -4]\n",
	" [ 0 0 0 0 0 0 0 0 0 0 -1 -1 -1 -1 -2 -2 -2 -2 -2 -3 -3]\n",
	" [ 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 -1 -1 -1 -1 -2 -2 -2]\n",
	" [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 -1 -1 -1]\n",
	" [ 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1]\n",
	" [ 2 2 2 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 3 3 2 2 2 2 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 4 3 3 3 3 2 2 2 2 1 1 1 0 0 0 0 0 0 0 0 0]\n",
	" [ 4 4 4 4 3 3 3 3 2 2 2 1 1 1 1 0 0 0 0 0 0]\n",
	" [ 5 5 5 4 4 4 4 3 3 3 2 2 2 2 1 1 1 0 0 0 0]\n",
	" [ 5 5 5 5 5 5 4 4 4 3 3 3 3 2 2 2 1 1 0 0 0]\n",
	" [ 5 5 5 5 5 5 5 5 4 4 4 4 3 3 3 2 2 1 1 0 0]\n",
	" [ 5 5 5 5 5 5 5 5 5 5 5 4 4 4 3 3 2 2 1 0 0]\n",
	" [ 5 5 5 5 5 5 5 5 5 5 5 5 5 4 4 3 3 2 1 1 0]\n",
	" [ 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 4 3 2 2 1 0]\n",
	" [ 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 3 3 2 1 0]\n",
	" [ 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 4 3 2 1 0]\n",
	" [ 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 3 2 1 0]\n",
	" [ 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 3 2 1 0]\n",
	" [ 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 3 2 1 0]\n",
	" [ 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 3 2 1 0]]\n",
	"\n",
	"Policy evaluation k=1. Worst V(s) delta:\n",
	"Iteration 0: max delta = 64.72\n",
	"Iteration 1: max delta = 4.95\n",
	"Iteration 2: max delta = 2.26\n",
	"Iteration 3: max delta = 1.73\n",
	"Iteration 4: max delta = 1.49\n",
	"Iteration 5: max delta = 1.24\n",
	"Iteration 6: max delta = 1.02\n",
	"Iteration 7: max delta = 0.83\n",
	"Iteration 8: max delta = 0.68\n",
	"Iteration 9: max delta = 0.56\n",
	"Iteration 10: max delta = 0.46\n",
	"\n",
	"Policy at iteration k=2:\n",
	"[[ 0 0 0 0 0 0 0 -1 -1 -1 -2 -2 -2 -3 -3 -3 -3 -3 -3 -4 -4]\n",
	" [ 0 0 0 0 0 0 0 0 0 -1 -1 -1 -2 -2 -2 -2 -2 -2 -3 -3 -3]\n",
	" [ 0 0 0 0 0 0 0 0 0 0 0 -1 -1 -1 -1 -1 -1 -2 -2 -2 -2]\n",
	" [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 -1 -1 -1 -1]\n",
	" [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1]\n",
	" [ 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 3 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 3 3 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 4 3 3 2 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 4 4 3 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 4 4 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 4 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 4 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 4 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 4 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 4 3 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 4 3 2 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 4 3 3 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 4 4 3 2 2 1 1 1 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 5 4 3 3 2 2 2 1 1 1 1 1 1 1 1 1 0 0 0]]\n",
	"\n",
	"Policy evaluation k=2. Worst V(s) delta:\n",
	"Iteration 0: max delta = 4.41\n",
	"Iteration 1: max delta = 2.99\n",
	"Iteration 2: max delta = 2.08\n",
	"Iteration 3: max delta = 1.51\n",
	"Iteration 4: max delta = 1.06\n",
	"Iteration 5: max delta = 0.73\n",
	"Iteration 6: max delta = 0.50\n",
	"\n",
	"Policy at iteration k=3:\n",
	"[[ 0 0 0 0 0 0 0 0 -1 -1 -2 -2 -2 -3 -3 -3 -3 -3 -4 -4 -4]\n",
	" [ 0 0 0 0 0 0 0 0 0 -1 -1 -1 -2 -2 -2 -2 -2 -3 -3 -3 -3]\n",
	" [ 0 0 0 0 0 0 0 0 0 0 0 -1 -1 -1 -1 -1 -2 -2 -2 -2 -2]\n",
	" [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 -1 -1 -1 -1 -2]\n",
	" [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 -1]\n",
	" [ 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 3 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 3 3 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 4 3 3 2 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 4 4 3 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 4 4 3 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 4 3 2 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 4 3 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 4 4 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 5 4 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 5 4 3 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 5 4 3 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 5 4 3 3 2 2 1 1 1 1 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 5 4 4 3 3 2 2 2 2 1 1 1 1 1 0 0 0 0 0]\n",
	" [ 5 5 5 5 4 4 3 3 3 3 2 2 2 2 2 1 1 1 0 0 0]]\n",
	"\n",
	"Policy evaluation k=3. Worst V(s) delta:\n",
	"Iteration 0: max delta = 0.70\n",
	"Iteration 1: max delta = 0.34\n",
	"\n",
	"Policy at iteration k=4:\n",
	"[[ 0 0 0 0 0 0 0 0 -1 -1 -2 -2 -2 -3 -3 -3 -3 -3 -4 -4 -4]\n",
	" [ 0 0 0 0 0 0 0 0 0 -1 -1 -1 -2 -2 -2 -2 -2 -3 -3 -3 -3]\n",
	" [ 0 0 0 0 0 0 0 0 0 0 0 -1 -1 -1 -1 -1 -2 -2 -2 -2 -2]\n",
	" [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 -1 -1 -1 -1 -2]\n",
	" [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 -1]\n",
	" [ 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 3 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 3 3 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 4 3 3 2 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 4 4 3 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 4 4 3 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 4 3 2 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 4 3 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 4 4 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 5 4 3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 5 4 3 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 5 4 3 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 5 4 3 3 2 2 1 1 1 0 0 0 0 0 0 0 0 0 0]\n",
	" [ 5 5 5 4 4 3 3 2 2 2 1 1 1 1 1 0 0 0 0 0 0]\n",
	" [ 5 5 5 5 4 4 3 3 3 2 2 2 2 2 1 1 1 0 0 0 0]]\n",
	"\n",
	"Policy evaluation k=4. Worst V(s) delta:\n",
	"Iteration 0: max delta = 0.23\n",
	"\n",
	"--- 27.95 seconds ---\n"
	]
	}
	],
	"source": [
	"# Initialize value function and policy\n",
	"\n",
	"pi = np.zeros((max_cars+1, max_cars+1), dtype=np.int16)\n",
	"V = np.zeros((max_cars+1, max_cars+1))\n",
	"states = [(s0, s1) for s0 in range(max_cars+1) for s1 in range(max_cars+1)]\n",
	"gamma = 0.9\n",
	"\n",
	"start_time = time.time()\n",
	"# Policy Iteration (using iterative policy evaluation)\n",
	"for k in range(100):\n",
	" print('\\nPolicy at iteration k={}:'.format(k))\n",
	" print(pi)\n",
	" print('\\nPolicy evaluation k={}. Worst V(s) delta:'.format(k))\n",
	" # Policy Evaluation\n",
	" theta = 0.5\n",
	" for i in range(100):\n",
	" delta = 0\n",
	" for s in states:\n",
	" v = V[s]\n",
	" t_model, r_model = jacks.get_transition_model(s, pi[s])\n",
	" v_new = 0\n",
	" for s_prime in t_model:\n",
	" p = t_model[s_prime]\n",
	" r = r_model[s_prime]\n",
	" v_new += p * (gamma * V[s_prime] + r)\n",
	" V[s] = v_new\n",
	" delta = max(delta, abs(v_new - v))\n",
	" print('Iteration {}: max delta = {:.2f}'.format(i, delta))\n",
	" if delta < theta: break\n",
	"\n",
	" # Policy Iteration\n",
	" stable = True\n",
	" for s in states:\n",
	" old_action = pi[s]\n",
	" best_v = -1000\n",
	" max_a = min(5, s[0], max_cars-s[1])\n",
	" min_a = max(-5, -s[1], -(max_cars-s[0]))\n",
	" for a in range(min_a, max_a+1):\n",
	" t_model, r_model = jacks.get_transition_model(s, a)\n",
	" v = 0\n",
	" for s_prime in t_model:\n",
	" p = t_model[s_prime]\n",
	" r = r_model[s_prime]\n",
	" v += p * (gamma * V[s_prime] + r)\n",
	" if v > best_v:\n",
	" pi[s] = a\n",
	" best_v = v\n",
	" if pi[s] != old_action:\n",
	" stable = False\n",
	" if stable: break\n",
	"\n",
	"print(\"\\n--- {:.2f} seconds ---\".format(time.time() - start_time))"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}