Skip to content

Instantly share code, notes, and snippets.

@denny0323
Created January 26, 2018 05:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save denny0323/b833c6c6560436338bbd469f4301c0af to your computer and use it in GitHub Desktop.
Save denny0323/b833c6c6560436338bbd469f4301c0af to your computer and use it in GitHub Desktop.
3_SARSA
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SARSA (On-Policy TD Control)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The environment below is the same as in \"2_MC Control agent.ipynb\"."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"class Env:\n",
"    \"\"\"5x5 grid world with two trap cells and one goal cell.\n",
"\n",
"    States are ``[x, y]`` lists and actions are ``(dx, dy)`` offsets taken\n",
"    from ``action_grid`` (ordered U, D, L, R).\n",
"    \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        self.grid_width = 5\n",
"        self.grid_height = self.grid_width\n",
"        self.action_grid = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # U, D, L, R\n",
"        self.gtriangle1 = [1, 2]  # trap: reward -1, ends the episode\n",
"        self.gtriangle2 = [2, 1]  # trap: reward -1, ends the episode\n",
"        self.goal = [2, 2]        # goal: reward +1, ends the episode\n",
"\n",
"    def step(self, state, action):\n",
"        \"\"\"Apply ``action`` to ``state`` and return ``(next_state, reward, done)``.\n",
"\n",
"        A move that would leave the grid is clamped to the border, so the\n",
"        position along that axis does not change. ``state`` is not modified.\n",
"        \"\"\"\n",
"        # x is bounded by grid_width and y by grid_height, which matches the\n",
"        # (grid_width, grid_height, n_actions) layout of the agent's Q-table.\n",
"        x = min(max(state[0] + action[0], 0), self.grid_width - 1)\n",
"        y = min(max(state[1] + action[1], 0), self.grid_height - 1)\n",
"        next_state = [x, y]\n",
"\n",
"        if next_state == self.gtriangle1 or next_state == self.gtriangle2:\n",
"            return next_state, -1, True\n",
"        if next_state == self.goal:\n",
"            return next_state, 1, True\n",
"        return next_state, 0, False\n",
"\n",
"    def reset(self):\n",
"        \"\"\"Return the start state ``[0, 0]``.\"\"\"\n",
"        return [0, 0]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"class SARSA_agent:\n",
"    \"\"\"Tabular SARSA (on-policy TD control) agent for the 5x5 grid world.\n",
"\n",
"    ``Qtable[x, y, a]`` holds the value of action index ``a`` (order\n",
"    U, D, L, R) in state ``[x, y]``.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        self.action_grid = [(-1, 0), (1, 0), (0, -1), (0, 1)]\n",
"        self.action_text = ['U', 'D', 'L', 'R']\n",
"        self.grid_width = 5\n",
"        self.grid_height = self.grid_width\n",
"        self.Qtable = np.zeros((self.grid_width, self.grid_height, len(self.action_grid)))\n",
"        self.e = .1  # exploration probability (epsilon)\n",
"        self.learning_rate = .01\n",
"        self.discount_factor = .95\n",
"        self.memory = []  # not read or written by any method of this class\n",
"\n",
"    def get_action(self, state):\n",
"        \"\"\"Choose an action for ``state`` with an epsilon-greedy policy.\n",
"\n",
"        With probability ``self.e`` a uniformly random action is returned;\n",
"        otherwise a greedy one, with ties broken uniformly at random.\n",
"        \"\"\"\n",
"        # rand() is uniform on [0, 1), so this branch fires with probability e.\n",
"        # randn() is a standard normal draw and would explore about 54% of\n",
"        # the time for e = 0.1.\n",
"        if np.random.rand() < self.e:\n",
"            idx = np.random.choice(len(self.action_grid))\n",
"        else:\n",
"            Qvalues = self.Qtable[tuple(state)]\n",
"            best = np.flatnonzero(Qvalues == np.amax(Qvalues))\n",
"            if len(best) > 1:\n",
"                # several actions share the maximum value: pick one at random\n",
"                idx = np.random.choice(best)\n",
"            else:\n",
"                idx = np.argmax(Qvalues)\n",
"        return self.action_grid[idx]\n",
"\n",
"    def update(self, state, action, reward, next_state, next_action):\n",
"        \"\"\"Apply one SARSA update to ``Q(state, action)``.\n",
"\n",
"        Q(s, a) <- Q(s, a) + lr * (r + gamma * Q(s', a') - Q(s, a))\n",
"        \"\"\"\n",
"        sa = tuple(state) + (self.action_grid.index(action),)\n",
"        next_sa = tuple(next_state) + (self.action_grid.index(next_action),)\n",
"        current_Q = self.Qtable[sa]\n",
"        td_target = reward + self.discount_factor * self.Qtable[next_sa]\n",
"        self.Qtable[sa] = current_Q + self.learning_rate * (td_target - current_Q)\n",
"\n",
"    def save_actionseq(self, action_sequence, action):\n",
"        \"\"\"Append the letter (U/D/L/R) for ``action`` to ``action_sequence``.\"\"\"\n",
"        idx = self.action_grid.index(action)\n",
"        action_sequence.append(self.action_text[idx])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"finished at [1, 2]\n",
"episode :0, The number of step:0\n",
" The sequence of action is: ['R', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :100, The number of step:0\n",
" The sequence of action is: ['L', 'L', 'L', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'R', 'R', 'L', 'D', 'L', 'U', 'D', 'D', 'L', 'U', 'U', 'U', 'D', 'R', 'L', 'U', 'U', 'U', 'R', 'L', 'D', 'R', 'L', 'L', 'U', 'R', 'L', 'U', 'U', 'U', 'L', 'U', 'L', 'L', 'U', 'L', 'U', 'L', 'L', 'D', 'D', 'U', 'U', 'D', 'D', 'L', 'U', 'U', 'U', 'L', 'R', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :200, The number of step:0\n",
" The sequence of action is: ['R', 'L', 'L', 'R', 'L', 'U', 'U', 'U', 'D', 'U', 'U', 'D', 'D', 'D', 'D', 'U', 'D', 'U', 'R', 'L', 'D', 'R', 'R', 'U', 'L', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :300, The number of step:0\n",
" The sequence of action is: ['L', 'D', 'U', 'R', 'U', 'L', 'D', 'L', 'U', 'L', 'L', 'D', 'U', 'L', 'R', 'L', 'U', 'R', 'U', 'L', 'U', 'L', 'R', 'L', 'U', 'D', 'U', 'U', 'U', 'L', 'D', 'U', 'U', 'U', 'U', 'L', 'U', 'U', 'U', 'U', 'U', 'L', 'R', 'R', 'R', 'D', 'L']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :400, The number of step:0\n",
" The sequence of action is: ['L', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :500, The number of step:0\n",
" The sequence of action is: ['R', 'L', 'L', 'D', 'U', 'L', 'L', 'D', 'U', 'U', 'U', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'D', 'R', 'U', 'U', 'L', 'L', 'L', 'L', 'U', 'U', 'L', 'U', 'U', 'U', 'L', 'L', 'D', 'U', 'U', 'L', 'L', 'R', 'R', 'R', 'D', 'D', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :600, The number of step:0\n",
" The sequence of action is: ['U', 'R', 'L', 'U', 'R', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'U', 'U', 'U', 'L', 'R', 'L', 'L', 'U', 'U', 'U', 'U', 'R', 'U', 'R', 'R', 'D', 'D', 'D', 'L', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :700, The number of step:0\n",
" The sequence of action is: ['U', 'R', 'L', 'L', 'L', 'U', 'L', 'D', 'U', 'U', 'R', 'L', 'U', 'U', 'U', 'U', 'R', 'U', 'R', 'R', 'D', 'R', 'L', 'R', 'D', 'L', 'R', 'D', 'U', 'L', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :800, The number of step:0\n",
" The sequence of action is: ['R', 'R', 'R', 'D', 'D', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :900, The number of step:0\n",
" The sequence of action is: ['R', 'L', 'L', 'U', 'U', 'U', 'D', 'U', 'R', 'R', 'R', 'U', 'R', 'D', 'U', 'D', 'D', 'L', 'D', 'L', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :1000, The number of step:0\n",
" The sequence of action is: ['R', 'L', 'U', 'U', 'D', 'U', 'U', 'U', 'U', 'D', 'U', 'U', 'L', 'L', 'U', 'U', 'R', 'D', 'U', 'L', 'U', 'R', 'L', 'U', 'R', 'D', 'U', 'U', 'D', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :1100, The number of step:0\n",
" The sequence of action is: ['R', 'L', 'U', 'D', 'D', 'D', 'R', 'R', 'L', 'R', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :1200, The number of step:0\n",
" The sequence of action is: ['U', 'U', 'U', 'U', 'R', 'L', 'R', 'L', 'L', 'U', 'D', 'U', 'R', 'D', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :1300, The number of step:0\n",
" The sequence of action is: ['U', 'R', 'L', 'U', 'D', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :1400, The number of step:0\n",
" The sequence of action is: ['U', 'L', 'L', 'L', 'U', 'U', 'U', 'U', 'R', 'L', 'U', 'U', 'L', 'U', 'U', 'L', 'R', 'L', 'U', 'U', 'U', 'L', 'U', 'L', 'U', 'U', 'U', 'L', 'R', 'R', 'L', 'L', 'U', 'L', 'L', 'L', 'D', 'D', 'U', 'U', 'D', 'L', 'U', 'U', 'L', 'L', 'L', 'U', 'U', 'R', 'L', 'L', 'D', 'R', 'U', 'L', 'D', 'R', 'U', 'U', 'L', 'R', 'U', 'L', 'D', 'D', 'U', 'R', 'U', 'U', 'U', 'L', 'D', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :1500, The number of step:0\n",
" The sequence of action is: ['U', 'U', 'R', 'L', 'U', 'L', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'D', 'D', 'D', 'U', 'L', 'D', 'L', 'R', 'L', 'L', 'R', 'L', 'R', 'R', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [1, 2]\n",
"episode :1600, The number of step:0\n",
" The sequence of action is: ['D', 'U', 'L', 'L', 'D', 'L', 'R', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :1700, The number of step:0\n",
" The sequence of action is: ['L', 'U', 'L', 'U', 'R', 'L', 'U', 'L', 'U', 'U', 'U', 'L', 'U', 'R', 'R', 'R', 'U', 'D', 'D', 'U', 'U', 'D', 'D', 'R', 'L', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :1800, The number of step:0\n",
" The sequence of action is: ['R', 'U', 'R', 'L', 'R', 'L', 'R', 'R', 'D', 'R', 'D', 'L', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :1900, The number of step:0\n",
" The sequence of action is: ['U', 'U', 'U', 'L', 'L', 'L', 'R', 'D', 'U', 'R', 'L', 'L', 'D', 'D', 'D', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :2000, The number of step:0\n",
" The sequence of action is: ['L', 'U', 'R', 'L', 'L', 'U', 'D', 'R', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :2100, The number of step:0\n",
" The sequence of action is: ['U', 'L', 'L', 'R', 'L', 'U', 'R', 'L', 'U', 'D', 'D', 'U', 'D', 'U', 'R', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :2200, The number of step:0\n",
" The sequence of action is: ['U', 'R', 'D', 'U', 'U', 'L', 'R', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :2300, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'L', 'L', 'D', 'R', 'L', 'R', 'L', 'R', 'R', 'U', 'D', 'L', 'R', 'R', 'D', 'R', 'U', 'U', 'L', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :2400, The number of step:0\n",
" The sequence of action is: ['R', 'R', 'R', 'D', 'D', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [1, 2]\n",
"episode :2500, The number of step:0\n",
" The sequence of action is: ['R', 'R', 'U', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :2600, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'D', 'R', 'R', 'L', 'R', 'U', 'R', 'L', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :2700, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'U', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :2800, The number of step:0\n",
" The sequence of action is: ['L', 'U', 'D', 'D', 'D', 'L', 'R', 'L', 'L', 'U', 'L', 'D', 'R', 'R', 'L', 'R', 'L', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :2900, The number of step:0\n",
" The sequence of action is: ['D', 'R', 'L', 'D', 'D', 'R', 'L', 'R', 'R', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :3000, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'L', 'R', 'R', 'R', 'L', 'R', 'U', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :3100, The number of step:0\n",
" The sequence of action is: ['U', 'R', 'L', 'R', 'L', 'R', 'L', 'R', 'L', 'U', 'U', 'U', 'D', 'R', 'L', 'L', 'U', 'D', 'U', 'D', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :3200, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'R', 'L', 'U', 'D', 'R', 'R', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :3300, The number of step:0\n",
" The sequence of action is: ['U', 'U', 'R', 'D', 'L', 'D', 'D', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :3400, The number of step:0\n",
" The sequence of action is: ['U', 'D', 'D', 'L', 'D', 'R', 'R', 'L', 'R', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [1, 2]\n",
"episode :3500, The number of step:0\n",
" The sequence of action is: ['U', 'U', 'U', 'U', 'R', 'R', 'R', 'L', 'R', 'R', 'L', 'U', 'D', 'L']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :3600, The number of step:0\n",
" The sequence of action is: ['U', 'L', 'L', 'R', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :3700, The number of step:0\n",
" The sequence of action is: ['U', 'R', 'R', 'R', 'R', 'D', 'U', 'L', 'D', 'D', 'U', 'D', 'R', 'U', 'D', 'L', 'U', 'D', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [1, 2]\n",
"episode :3800, The number of step:0\n",
" The sequence of action is: ['U', 'U', 'L', 'L', 'U', 'U', 'L', 'U', 'U', 'R', 'D', 'U', 'U', 'R', 'R', 'D', 'R', 'U', 'D', 'L', 'L']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :3900, The number of step:0\n",
" The sequence of action is: ['U', 'R', 'R', 'U', 'R', 'D', 'D', 'R', 'L', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [1, 2]\n",
"episode :4000, The number of step:0\n",
" The sequence of action is: ['U', 'U', 'R', 'R', 'R', 'D', 'U', 'D', 'L']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :4100, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'U', 'L', 'R', 'L', 'D', 'D', 'R', 'L', 'L', 'U', 'D', 'U', 'D', 'R', 'R', 'D', 'U', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :4200, The number of step:0\n",
" The sequence of action is: ['L', 'U', 'D', 'U', 'U', 'U', 'U', 'U', 'U', 'L', 'U', 'U', 'U', 'U', 'R', 'R', 'R', 'D', 'D', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :4300, The number of step:0\n",
" The sequence of action is: ['U', 'D', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :4400, The number of step:0\n",
" The sequence of action is: ['U', 'D', 'U', 'D', 'L', 'U', 'U', 'U', 'U', 'L', 'L', 'R', 'R', 'R', 'D', 'D', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [1, 2]\n",
"episode :4500, The number of step:0\n",
" The sequence of action is: ['L', 'U', 'R', 'R', 'L', 'L', 'D', 'U', 'U', 'U', 'R', 'R', 'R', 'L', 'R', 'L', 'R', 'U', 'D', 'D', 'U', 'L']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :4600, The number of step:0\n",
" The sequence of action is: ['L', 'R', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :4700, The number of step:0\n",
" The sequence of action is: ['D', 'U', 'L', 'R', 'R', 'R', 'D', 'U', 'D', 'D', 'D', 'U', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :4800, The number of step:0\n",
" The sequence of action is: ['U', 'U', 'D', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :4900, The number of step:0\n",
" The sequence of action is: ['U', 'U', 'U', 'L', 'U', 'U', 'U', 'U', 'R', 'R', 'U', 'U', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :5000, The number of step:0\n",
" The sequence of action is: ['U', 'R', 'R', 'R', 'D', 'D', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :5100, The number of step:0\n",
" The sequence of action is: ['L', 'U', 'L', 'L', 'L', 'D', 'D', 'U', 'D', 'D', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :5200, The number of step:0\n",
" The sequence of action is: ['L', 'D', 'D', 'U', 'L', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :5300, The number of step:0\n",
" The sequence of action is: ['U', 'D', 'D', 'D', 'D', 'R', 'R', 'U', 'R', 'L', 'L', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :5400, The number of step:0\n",
" The sequence of action is: ['U', 'U', 'U', 'L', 'L', 'D', 'L', 'U', 'L', 'L', 'D', 'L', 'D', 'D', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :5500, The number of step:0\n",
" The sequence of action is: ['U', 'D', 'R', 'L', 'U', 'D', 'D', 'D', 'R', 'R', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :5600, The number of step:0\n",
" The sequence of action is: ['L', 'U', 'D', 'D', 'D', 'D', 'R', 'R', 'L', 'U', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :5700, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'U', 'D', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :5800, The number of step:0\n",
" The sequence of action is: ['U', 'D', 'L', 'R', 'L', 'U', 'D', 'L', 'D', 'D', 'U', 'D', 'L', 'R', 'R', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :5900, The number of step:0\n",
" The sequence of action is: ['D', 'U', 'R', 'R', 'R', 'D', 'R', 'D', 'U', 'D', 'D', 'U', 'L', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :6000, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'R', 'R', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :6100, The number of step:0\n",
" The sequence of action is: ['D', 'R', 'U', 'L', 'D', 'D', 'D', 'R', 'R', 'L', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :6200, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'L', 'D', 'R', 'R', 'D', 'U', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :6300, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'R', 'L', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :6400, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :6500, The number of step:0\n",
" The sequence of action is: ['D', 'L', 'D', 'D', 'R', 'R', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :6600, The number of step:0\n",
" The sequence of action is: ['R', 'R', 'R', 'D', 'U', 'U', 'L', 'U', 'U', 'R', 'D', 'D', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [1, 2]\n",
"episode :6700, The number of step:0\n",
" The sequence of action is: ['D', 'R', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :6800, The number of step:0\n",
" The sequence of action is: ['D', 'U', 'L', 'D', 'D', 'D', 'R', 'R', 'R', 'D', 'D', 'U', 'L', 'R', 'L', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :6900, The number of step:0\n",
" The sequence of action is: ['D', 'L', 'L', 'R', 'L', 'D', 'D', 'R', 'D', 'R', 'L', 'R', 'D', 'U', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :7000, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'R', 'R', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :7100, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'R', 'R', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :7200, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :7300, The number of step:0\n",
" The sequence of action is: ['U', 'L', 'L', 'L', 'U', 'R', 'L', 'L', 'D', 'D', 'D', 'R', 'L', 'D', 'R', 'R', 'L', 'R', 'R', 'L', 'U', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :7400, The number of step:0\n",
" The sequence of action is: ['D', 'R', 'L', 'D', 'D', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :7500, The number of step:0\n",
" The sequence of action is: ['D', 'R', 'L', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :7600, The number of step:0\n",
" The sequence of action is: ['L', 'D', 'D', 'D', 'D', 'D', 'R', 'U', 'R', 'L', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :7700, The number of step:0\n",
" The sequence of action is: ['U', 'U', 'R', 'D', 'L', 'D', 'D', 'D', 'U', 'D', 'U', 'D', 'R', 'R', 'U', 'D', 'U', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :7800, The number of step:0\n",
" The sequence of action is: ['D', 'U', 'D', 'D', 'L', 'D', 'D', 'R', 'U', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :7900, The number of step:0\n",
" The sequence of action is: ['L', 'L', 'R', 'R', 'R', 'D', 'D', 'U', 'D', 'D', 'L', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :8000, The number of step:0\n",
" The sequence of action is: ['U', 'U', 'L', 'D', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :8100, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'L', 'D', 'R', 'D', 'D', 'U', 'R', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :8200, The number of step:0\n",
" The sequence of action is: ['D', 'U', 'D', 'D', 'D', 'D', 'D', 'L', 'R', 'U', 'L', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :8300, The number of step:0\n",
" The sequence of action is: ['U', 'L', 'U', 'D', 'D', 'D', 'U', 'D', 'R', 'D', 'R', 'U', 'L', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :8400, The number of step:0\n",
" The sequence of action is: ['L', 'D', 'D', 'D', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :8500, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'D', 'L', 'R', 'R', 'U', 'D', 'U', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :8600, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'U', 'D', 'D', 'D', 'R', 'R', 'U', 'R', 'L', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :8700, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'R', 'D', 'R', 'R', 'U', 'L', 'D', 'R', 'U', 'R', 'U', 'L', 'U', 'U', 'R', 'D', 'L', 'D', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [1, 2]\n",
"episode :8800, The number of step:0\n",
" The sequence of action is: ['L', 'U', 'R', 'U', 'U', 'D', 'L', 'R', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :8900, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'D', 'L', 'D', 'R', 'U', 'R', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :9000, The number of step:0\n",
" The sequence of action is: ['R', 'L', 'D', 'D', 'D', 'U', 'D', 'D', 'U', 'D', 'R', 'R', 'U', 'R', 'L', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :9100, The number of step:0\n",
" The sequence of action is: ['L', 'D', 'U', 'U', 'D', 'D', 'D', 'D', 'R', 'D', 'R', 'U', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :9200, The number of step:0\n",
" The sequence of action is: ['R', 'L', 'U', 'D', 'D', 'D', 'D', 'U', 'L', 'D', 'U', 'D', 'R', 'D', 'R', 'U', 'R', 'L', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :9300, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'D', 'R', 'U', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :9400, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'R', 'L', 'D', 'D', 'R', 'D', 'L', 'R', 'D', 'U', 'R', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :9500, The number of step:0\n",
" The sequence of action is: ['R', 'L', 'D', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :9600, The number of step:0\n",
" The sequence of action is: ['L', 'D', 'R', 'L', 'D', 'L', 'L', 'U', 'D', 'D', 'D', 'R', 'R', 'U', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 2]\n",
"episode :9700, The number of step:0\n",
" The sequence of action is: ['L', 'D', 'D', 'D', 'D', 'D', 'R', 'R', 'R', 'U', 'L', 'U']\n",
"The total reward is: 1\n",
"\n",
"finished at [1, 2]\n",
"episode :9800, The number of step:0\n",
" The sequence of action is: ['D', 'R', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :9900, The number of step:0\n",
" The sequence of action is: ['D', 'D', 'D', 'R', 'D', 'R', 'D', 'D', 'U', 'R', 'L', 'U']\n",
"The total reward is: 1\n",
"\n",
"The accuracy : 0.0 %\n"
]
}
],
"source": [
"# Train the SARSA agent on the grid world and report how often it reaches the goal.\n",
"if __name__ == '__main__':\n",
"    env = Env()\n",
"    agent = SARSA_agent()\n",
"    total_episode = 10000\n",
"    sr = 0  # number of episodes that ended on the goal cell\n",
"\n",
"    for episode in range(total_episode):\n",
"        action_sequence = []\n",
"        total_reward = 0\n",
"        walk = 0  # steps taken in this episode\n",
"\n",
"        # initial state, action, done\n",
"        state = env.reset()\n",
"        action = agent.get_action(state)\n",
"        done = False\n",
"\n",
"        while not done:\n",
"            agent.save_actionseq(action_sequence, action)\n",
"\n",
"            # take the action, then pick the next action from the same policy\n",
"            next_state, reward, done = env.step(state, action)\n",
"            next_action = agent.get_action(next_state)\n",
"\n",
"            # update Qtable\n",
"            agent.update(state, action, reward, next_state, next_action)\n",
"\n",
"            total_reward += reward\n",
"            walk += 1\n",
"\n",
"            # SARSA is on-policy: the action used in the update is the one\n",
"            # executed on the next step, so it must not be re-sampled.\n",
"            state = next_state\n",
"            action = next_action\n",
"\n",
"        # The loop exits with `state` set to the terminal cell that was reached.\n",
"        if state == env.goal:\n",
"            sr += 1\n",
"\n",
"        if episode % 100 == 0:\n",
"            print('finished at', state)\n",
"            print('episode :{}, The number of step:{}\\n The sequence of action is: {}\\n'\n",
"                  'The total reward is: {}\\n'.format(episode, walk, action_sequence, total_reward))\n",
"\n",
"    # Kept inside the guard because it depends on names defined above.\n",
"    print('The accuracy :', sr/total_episode*100, '%')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[[-0.02141963, 0.00352836, -0.01987619, -0.06738702],\n",
" [-0.08192269, -0.27941545, -0.02033424, -0.073239 ],\n",
" [-0.02819004, -0.99093702, -0.0900399 , 0.20826751],\n",
" [ 0.17288568, 0.34299325, -0.08307334, 0.12124392],\n",
" [ 0.04265317, 0.26065349, 0.08102561, 0.05298695]],\n",
"\n",
" [[-0.02160627, 0.09308529, -0.01335805, -0.27551594],\n",
" [-0.07458866, -0.99483772, -0.01777086, -0.99528416],\n",
" [ 0. , 0. , 0. , 0. ],\n",
" [ 0.17372915, 0.70905902, -0.94685324, 0.20548302],\n",
" [ 0.07712926, 0.42158818, 0.21059578, 0.16854959]],\n",
"\n",
" [[-0.00889533, 0.32137274, 0.04762571, -0.99999974],\n",
" [ 0. , 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , 0. ],\n",
" [ 0.24003516, 0.47886717, 0.99999894, 0.41158612],\n",
" [ 0.1750948 , 0.18468542, 0.71407025, 0.36118007]],\n",
"\n",
" [[ 0.04119386, 0.40326271, 0.30185009, 0.3305425 ],\n",
" [-0.99988206, 0.45794044, 0.29537902, 0.7825312 ],\n",
" [ 1. , 0.57647976, 0.34688582, 0.54286771],\n",
" [ 0.68094631, 0.33405179, 0.74567044, 0.24925788],\n",
" [ 0.44738387, 0.04159008, 0.27895347, 0.0937303 ]],\n",
"\n",
" [[ 0.29540112, 0.36765815, 0.36934295, 0.48613605],\n",
" [ 0.37722967, 0.47565743, 0.3693205 , 0.60639638],\n",
" [ 0.73857004, 0.57411058, 0.46292071, 0.36906936],\n",
" [ 0.58778137, 0.27510338, 0.43339617, 0.0911307 ],\n",
" [ 0.07047593, 0.03618115, 0.28645965, 0.02472527]]])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Learned action values, indexed [x, y, action] with actions ordered U, D, L, R.\n",
"agent.Qtable"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-0.08192269, -0.27941545, -0.02033424, -0.073239 ])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Action values (U, D, L, R) for state [0, 1]; moving D from here leads to [1, 1].\n",
"agent.Qtable[0,1]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-0.07458866, -0.99483772, -0.01777086, -0.99528416])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Action values (U, D, L, R) for state [1, 1]; D and R from here step onto the trap cells [2, 1] and [1, 2].\n",
"agent.Qtable[1,1]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-0.02160627, 0.09308529, -0.01335805, -0.27551594])"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Action values (U, D, L, R) for state [1, 0]; moving R from here leads to [1, 1].\n",
"agent.Qtable[1,0]"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment