0. FrozenLake-v0(Q-Table_detail)
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Q-Table Learning in FrozenLake(detail)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"from OmpenAI gymm"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# import package\n",
"import gym\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"env = gym.make('FrozenLake-v0')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"FrozenLake env review)\n",
"\n",
"* Structure : a 4 X 4 grid of blocks.\n",
"* Blocks of State : Start / Goal / Safe-Frozen / Dangerous hole.\n",
"* Objective : To have an agent learn to navigate from Start Block to Goal Block without moving onto a hole.\n",
"* The Catch : A wind which occasionally blows the agent onto a space, against its will."
]
},
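{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the review above (a minimal sketch using the `env` created earlier): the discrete spaces should report 16 states and 4 actions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: a 4 x 4 grid -> 16 discrete states, 4 discrete actions\n",
"print('states :', env.observation_space.n)\n",
"print('actions:', env.action_space.n)"
]
},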
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Q-Table Learning Algorithm"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#Initialize table with all zeros\n",
"Q = np.zeros([env.observation_space.n, env.action_space.n])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.],\n",
" [ 0., 0., 0., 0.]])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Q"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Q -Table of FrozenLake env. has a 16 X 4 grid of shape. \n",
"(one for each block : 16) X (Action : 4) // up, down, left or right"
]
},
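{
"cell_type": "markdown",
"metadata": {},
"source": [
"To relate a row of the Q-Table back to the grid, note that the state index is just the flattened grid position. A small sketch (the grid width of 4 is an assumption matching the 4 x 4 map):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Map a state index back to its (row, col) position on the 4 x 4 grid\n",
"ncol = 4  # grid width of the FrozenLake map\n",
"for s in [0, 5, 15]:\n",
"    print('state %2d -> row %d, col %d' % (s, s // ncol, s % ncol))"
]
},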
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\u001b[41mS\u001b[0mFFF\n",
"FHFH\n",
"FFFH\n",
"HFFG\n",
"------------------------\n",
" (Left)\n",
"\u001b[41mS\u001b[0mFFF\n",
"FHFH\n",
"FFFH\n",
"HFFG\n",
"action: 0\n",
"state : 0 \n",
"info : 0.333\n",
"------------------------\n",
" (Down)\n",
"S\u001b[41mF\u001b[0mFF\n",
"FHFH\n",
"FFFH\n",
"HFFG\n",
"action: 1\n",
"state : 1 \n",
"info : 0.333\n",
"------------------------\n",
" (Right)\n",
"SFFF\n",
"\u001b[41mF\u001b[0mHFH\n",
"FFFH\n",
"HFFG\n",
"action: 2\n",
"state : 4 \n",
"info : 0.333\n",
"------------------------\n",
" (Up)\n",
"S\u001b[41mF\u001b[0mFF\n",
"FHFH\n",
"FFFH\n",
"HFFG\n",
"action: 3\n",
"state : 1 \n",
"info : 0.333\n",
"------------------------\n"
]
}
],
"source": [
"observation = env.reset()\n",
"env.render()\n",
"print('------------------------')\n",
"for i in range(env.action_space.n):\n",
" action = i\n",
" observation, reward, done, info = env.step(action)\n",
" env.render()\n",
" print('action: %d' %action)\n",
" print('state : %d \\ninfo : %.3f' %(observation,info['prob']))\n",
" print('------------------------')\n",
" observation = env.reset()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"as above result, we can know\n",
"* action 0 : Left\n",
"* action 1 : Down\n",
"* action 2 : Right\n",
"* action 3 : UP"
]
},
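{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `info : 0.333` seen above comes from the slippery dynamics. As a sketch (assuming the underlying toy_text environment exposes its transition table as `env.unwrapped.P`), each intended action from a state resolves to one of three outcomes with probability 1/3 each:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the transition table of the underlying toy_text env (assumed attribute):\n",
"# each entry is a list of (probability, next_state, reward, done) tuples\n",
"for a, name in enumerate(['Left', 'Down', 'Right', 'Up']):\n",
"    print(name, env.unwrapped.P[0][a])"
]
},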
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Set learning parameters\n",
"lr = .8 # learning rate\n",
"y = .95 # discount factor\n",
"num_episodes = 2000\n",
"\n",
"#create lists to contain total rewards and steps per episode\n",
"rList = [] # reword list\n",
"# sList = [] # state list"
]
},
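{
"cell_type": "markdown",
"metadata": {},
"source": [
"The loop below applies the standard tabular Q-learning update, with learning rate $\\alpha$ = `lr` and discount factor $\\gamma$ = `y`:\n",
"\n",
"$$Q(s,a) \\leftarrow Q(s,a) + \\alpha \\left[ r + \\gamma \\max_{a'} Q(s',a') - Q(s,a) \\right]$$"
]
},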
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"for i in range(num_episodes):\n",
" s = env.reset() # Reset environment and get first new observation\n",
" rAll = 0 # total reward\n",
" d = False # end of precess\n",
" j = 0 # step\n",
"# print('------<initial state>-------')\n",
"# env.render()\n",
"# print('state : %d \\ninfo : %.3f' %(s,info['prob']))\n",
"# print('----------------------------')\n",
" #The Q-Table learning algorithm\n",
" while j < 100:\n",
" j+=1\n",
" \n",
" # Choose an action by greedily (with noise) picking from Q table\n",
" # 1/ (i+1) factor has the effect of cutting back on randomness\n",
" a = np.argmax(Q[s,:] + np.random.randn(1,env.action_space.n)*(1./(i+1))) \n",
" \n",
" #Get new state and reward of action an agent did from environment\n",
" s1, r, d, info = env.step(a)\n",
" \n",
" # Update Q-Table with new knowledge(=reward)\n",
" Q[s,a] = Q[s,a] + lr*(r + y*np.max(Q[s1,:]) - Q[s,a]) \n",
" \n",
" rAll += r # add reward \n",
" s = s1 # move to next state\n",
"\n",
" # Check some conditions in console\n",
"# env.render()\n",
"# print('step : ')\n",
"# print('action: %d' %a)\n",
"# print('state : %d \\ninfo : %.3f' %(s,info['prob']))\n",
"# print('----------------------------')\n",
" \n",
" # check the end of process\n",
" if d == True:\n",
"# if rAll == 1:\n",
"# print('Arrive at goal State!\\n')\n",
"\n",
"# else:\n",
"# print('Arrive at hole. T.T\\n')\n",
" break\n",
" #jList.append(j)\n",
" rList.append(rAll)"
]
},
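{
"cell_type": "markdown",
"metadata": {},
"source": [
"The loop above explores by adding decaying random noise to the Q-values before taking the argmax. A common alternative, shown here only as a sketch and not used above, is epsilon-greedy action selection (the value of `eps` is purely illustrative):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Epsilon-greedy action selection (illustrative alternative, not used above)\n",
"eps = 0.1                          # assumed exploration rate\n",
"s = env.reset()\n",
"if np.random.rand() < eps:\n",
"    a = env.action_space.sample()  # explore: random action\n",
"else:\n",
"    a = np.argmax(Q[s, :])         # exploit: greedy action\n",
"print('chosen action:', a)"
]
},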
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Score over time: 0.585\n"
]
}
],
"source": [
"print (\"Score over time: \" + str(sum(rList)/num_episodes))"
]
},
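{
"cell_type": "markdown",
"metadata": {},
"source": [
"Early episodes are dominated by exploration, so the average over the most recent episodes gives a better picture of the learned behaviour. A small sketch using the `rList` collected above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Success rate over the last 100 episodes only\n",
"print('Score over last 100 episodes:', sum(rList[-100:]) / 100)"
]
},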
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Final Q-Table Values\n",
"[[ 0.103 0.008 0.006 0.006]\n",
" [ 0.001 0.001 0.003 0.35 ]\n",
" [ 0.003 0.003 0.008 0.104]\n",
" [ 0. 0. 0.001 0.074]\n",
" [ 0.17 0. 0.001 0.001]\n",
" [ 0. 0. 0. 0. ]\n",
" [ 0.114 0. 0. 0. ]\n",
" [ 0. 0. 0. 0. ]\n",
" [ 0.001 0. 0.003 0.409]\n",
" [ 0.002 0.683 0. 0. ]\n",
" [ 0.9 0. 0. 0.002]\n",
" [ 0. 0. 0. 0. ]\n",
" [ 0. 0. 0. 0. ]\n",
" [ 0. 0. 0.945 0.001]\n",
" [ 0. 0.995 0. 0. ]\n",
" [ 0. 0. 0. 0. ]]\n"
]
}
],
"source": [
"print (\"Final Q-Table Values\")\n",
"print (np.round(Q,3))"
]
},
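{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough check of the learned table (a minimal sketch, not part of the original run): act purely greedily with respect to `Q` for a number of fresh episodes and measure how often the goal is reached."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Evaluate the greedy policy derived from Q over fresh episodes\n",
"n_eval = 100\n",
"successes = 0\n",
"for _ in range(n_eval):\n",
"    s = env.reset()\n",
"    d = False\n",
"    steps = 0\n",
"    while not d and steps < 100:\n",
"        a = np.argmax(Q[s, :])   # greedy action, no exploration\n",
"        s, r, d, _ = env.step(a)\n",
"        steps += 1\n",
"    successes += r               # reward is 1 only when the goal is reached\n",
"print('greedy success rate:', successes / n_eval)"
]
},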
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let me check more detail.\n",
"I'll get the index of the maximum prob. in every row, then translate to action."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"at state, the agent move : \n",
"['Left', 'Up', 'Up', 'Up', 'Left', 'hole or goal state', 'Left', 'hole or goal state', 'Up', 'Down', 'Left', 'hole or goal state', 'hole or goal state', 'Right', 'Down', 'hole or goal state']\n"
]
}
],
"source": [
"action_at_state=[]\n",
"action_set=['Left', 'Down', 'Right', 'Up']\n",
"for i in range(len(Q)):\n",
" if np.sum(Q[i]) == 0:\n",
" action_at_state.append('hole or goal state')\n",
" else:\n",
" idx=np.argmax(Q[i])\n",
" action_at_state.append(action_set[idx])\n",
"print (\"at state, the agent move : \")\n",
"print (action_at_state)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"gist_id": "b8c5cb850730ff671a5a63ac9c9f5991",
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}