Created
January 26, 2018 05:12
-
-
Save denny0323/b833c6c6560436338bbd469f4301c0af to your computer and use it in GitHub Desktop.
3_SARSA
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## SARSA (On-Policy TD Control)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The environment below is the same as in \"2_MC Control agent.ipynb\"." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class Env:\n", | |
"    \"\"\"Grid world with two penalty cells and one goal cell.\n", | |
"\n", | |
"    States are [x, y] lists and actions are (dx, dy) offsets taken from\n", | |
"    action_grid. Stepping onto gtriangle1 or gtriangle2 ends the episode\n", | |
"    with reward -1, stepping onto goal ends it with reward +1, and every\n", | |
"    other move gives reward 0.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    def __init__(self):\n", | |
"        self.grid_width = 5\n", | |
"        self.grid_height = self.grid_width\n", | |
"        self.action_grid = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # U, D, L, R\n", | |
"        self.gtriangle1 = [1, 2]\n", | |
"        self.gtriangle2 = [2, 1]\n", | |
"        self.goal = [2, 2]\n", | |
"\n", | |
"    def step(self, state, action):\n", | |
"        \"\"\"Apply action to state and return (next_state, reward, done).\n", | |
"\n", | |
"        A move that would leave the grid is clamped to the nearest edge,\n", | |
"        so the agent stays in place along that axis.\n", | |
"        \"\"\"\n", | |
"        x, y = state\n", | |
"\n", | |
"        # x is bounded by grid_width and y by grid_height; the original code\n", | |
"        # clamped both axes with grid_width, which only works on a square grid.\n", | |
"        x = min(max(x + action[0], 0), self.grid_width - 1)\n", | |
"        y = min(max(y + action[1], 0), self.grid_height - 1)\n", | |
"        next_state = [x, y]\n", | |
"\n", | |
"        # reward and termination depend only on the cell that was entered\n", | |
"        if next_state == self.gtriangle1 or next_state == self.gtriangle2:\n", | |
"            reward, done = -1, True\n", | |
"        elif next_state == self.goal:\n", | |
"            reward, done = 1, True\n", | |
"        else:\n", | |
"            reward, done = 0, False\n", | |
"\n", | |
"        return next_state, reward, done\n", | |
"\n", | |
"    def reset(self):\n", | |
"        \"\"\"Return the start state [0, 0].\"\"\"\n", | |
"        return [0, 0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class SARSA_agent:\n", | |
"    \"\"\"Tabular SARSA (on-policy TD control) agent for the grid world.\n", | |
"\n", | |
"    Qtable has shape (grid_width, grid_height, number of actions). It is\n", | |
"    indexed by the state's [x, y] followed by the position of the action\n", | |
"    in action_grid.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    def __init__(self):\n", | |
"        self.action_grid = [(-1, 0), (1, 0), (0, -1), (0, 1)]\n", | |
"        self.action_text = ['U', 'D', 'L', 'R']\n", | |
"        self.grid_width = 5\n", | |
"        self.grid_height = self.grid_width\n", | |
"        self.Qtable = np.zeros((self.grid_width, self.grid_height, len(self.action_grid)))\n", | |
"        self.e = .1  # exploration probability (epsilon)\n", | |
"        self.learning_rate = .01\n", | |
"        self.discount_factor = .95\n", | |
"        self.memory = []  # not read or written by any method of this class\n", | |
"\n", | |
"    def get_action(self, state):\n", | |
"        \"\"\"Choose an action epsilon-greedily from the Q-values of state.\n", | |
"\n", | |
"        With probability self.e a uniformly random action is returned.\n", | |
"        Otherwise the greedy action is returned, with ties between maximal\n", | |
"        Q-values broken uniformly at random.\n", | |
"        \"\"\"\n", | |
"        # np.random.rand() is uniform on [0, 1), so this branch is taken with\n", | |
"        # probability self.e. np.random.randn() (standard normal) would take\n", | |
"        # it on roughly half of all calls.\n", | |
"        if np.random.rand() < self.e:\n", | |
"            idx = np.random.choice(len(self.action_grid))\n", | |
"        else:\n", | |
"            Qvalues = self.Qtable[tuple(state)]\n", | |
"            ties = np.where(Qvalues == np.amax(Qvalues))[0]\n", | |
"\n", | |
"            # if several actions share the max value, pick one at random\n", | |
"            if len(ties) > 1:\n", | |
"                idx = np.random.choice(ties)\n", | |
"            else:\n", | |
"                idx = np.argmax(Qvalues)\n", | |
"\n", | |
"        return self.action_grid[idx]\n", | |
"\n", | |
"    def update(self, state, action, reward, next_state, next_action):\n", | |
"        \"\"\"Apply one SARSA update to Q(state, action).\n", | |
"\n", | |
"        Q(s, a) <- Q(s, a) + lr * (reward + gamma * Q(s', a') - Q(s, a))\n", | |
"        \"\"\"\n", | |
"        action_idx = self.action_grid.index(action)\n", | |
"        next_action_idx = self.action_grid.index(next_action)\n", | |
"        current_Q = self.Qtable[tuple(state)][action_idx]\n", | |
"        next_Q = self.Qtable[tuple(next_state)][next_action_idx]\n", | |
"        td_target = reward + self.discount_factor * next_Q\n", | |
"        self.Qtable[tuple(state)][action_idx] = current_Q + self.learning_rate * (td_target - current_Q)\n", | |
"\n", | |
"    def save_actionseq(self, action_sequence, action):\n", | |
"        \"\"\"Append the letter (U/D/L/R) for action to action_sequence.\"\"\"\n", | |
"        idx = self.action_grid.index(action)\n", | |
"        action_sequence.append(self.action_text[idx])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"finished at [1, 2]\n", | |
"episode :0, The number of step:0\n", | |
" The sequence of action is: ['R', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :100, The number of step:0\n", | |
" The sequence of action is: ['L', 'L', 'L', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'R', 'R', 'L', 'D', 'L', 'U', 'D', 'D', 'L', 'U', 'U', 'U', 'D', 'R', 'L', 'U', 'U', 'U', 'R', 'L', 'D', 'R', 'L', 'L', 'U', 'R', 'L', 'U', 'U', 'U', 'L', 'U', 'L', 'L', 'U', 'L', 'U', 'L', 'L', 'D', 'D', 'U', 'U', 'D', 'D', 'L', 'U', 'U', 'U', 'L', 'R', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :200, The number of step:0\n", | |
" The sequence of action is: ['R', 'L', 'L', 'R', 'L', 'U', 'U', 'U', 'D', 'U', 'U', 'D', 'D', 'D', 'D', 'U', 'D', 'U', 'R', 'L', 'D', 'R', 'R', 'U', 'L', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :300, The number of step:0\n", | |
" The sequence of action is: ['L', 'D', 'U', 'R', 'U', 'L', 'D', 'L', 'U', 'L', 'L', 'D', 'U', 'L', 'R', 'L', 'U', 'R', 'U', 'L', 'U', 'L', 'R', 'L', 'U', 'D', 'U', 'U', 'U', 'L', 'D', 'U', 'U', 'U', 'U', 'L', 'U', 'U', 'U', 'U', 'U', 'L', 'R', 'R', 'R', 'D', 'L']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :400, The number of step:0\n", | |
" The sequence of action is: ['L', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :500, The number of step:0\n", | |
" The sequence of action is: ['R', 'L', 'L', 'D', 'U', 'L', 'L', 'D', 'U', 'U', 'U', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'D', 'R', 'U', 'U', 'L', 'L', 'L', 'L', 'U', 'U', 'L', 'U', 'U', 'U', 'L', 'L', 'D', 'U', 'U', 'L', 'L', 'R', 'R', 'R', 'D', 'D', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :600, The number of step:0\n", | |
" The sequence of action is: ['U', 'R', 'L', 'U', 'R', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'U', 'U', 'U', 'L', 'R', 'L', 'L', 'U', 'U', 'U', 'U', 'R', 'U', 'R', 'R', 'D', 'D', 'D', 'L', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :700, The number of step:0\n", | |
" The sequence of action is: ['U', 'R', 'L', 'L', 'L', 'U', 'L', 'D', 'U', 'U', 'R', 'L', 'U', 'U', 'U', 'U', 'R', 'U', 'R', 'R', 'D', 'R', 'L', 'R', 'D', 'L', 'R', 'D', 'U', 'L', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :800, The number of step:0\n", | |
" The sequence of action is: ['R', 'R', 'R', 'D', 'D', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :900, The number of step:0\n", | |
" The sequence of action is: ['R', 'L', 'L', 'U', 'U', 'U', 'D', 'U', 'R', 'R', 'R', 'U', 'R', 'D', 'U', 'D', 'D', 'L', 'D', 'L', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :1000, The number of step:0\n", | |
" The sequence of action is: ['R', 'L', 'U', 'U', 'D', 'U', 'U', 'U', 'U', 'D', 'U', 'U', 'L', 'L', 'U', 'U', 'R', 'D', 'U', 'L', 'U', 'R', 'L', 'U', 'R', 'D', 'U', 'U', 'D', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :1100, The number of step:0\n", | |
" The sequence of action is: ['R', 'L', 'U', 'D', 'D', 'D', 'R', 'R', 'L', 'R', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :1200, The number of step:0\n", | |
" The sequence of action is: ['U', 'U', 'U', 'U', 'R', 'L', 'R', 'L', 'L', 'U', 'D', 'U', 'R', 'D', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :1300, The number of step:0\n", | |
" The sequence of action is: ['U', 'R', 'L', 'U', 'D', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :1400, The number of step:0\n", | |
" The sequence of action is: ['U', 'L', 'L', 'L', 'U', 'U', 'U', 'U', 'R', 'L', 'U', 'U', 'L', 'U', 'U', 'L', 'R', 'L', 'U', 'U', 'U', 'L', 'U', 'L', 'U', 'U', 'U', 'L', 'R', 'R', 'L', 'L', 'U', 'L', 'L', 'L', 'D', 'D', 'U', 'U', 'D', 'L', 'U', 'U', 'L', 'L', 'L', 'U', 'U', 'R', 'L', 'L', 'D', 'R', 'U', 'L', 'D', 'R', 'U', 'U', 'L', 'R', 'U', 'L', 'D', 'D', 'U', 'R', 'U', 'U', 'U', 'L', 'D', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :1500, The number of step:0\n", | |
" The sequence of action is: ['U', 'U', 'R', 'L', 'U', 'L', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'D', 'D', 'D', 'U', 'L', 'D', 'L', 'R', 'L', 'L', 'R', 'L', 'R', 'R', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :1600, The number of step:0\n", | |
" The sequence of action is: ['D', 'U', 'L', 'L', 'D', 'L', 'R', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :1700, The number of step:0\n", | |
" The sequence of action is: ['L', 'U', 'L', 'U', 'R', 'L', 'U', 'L', 'U', 'U', 'U', 'L', 'U', 'R', 'R', 'R', 'U', 'D', 'D', 'U', 'U', 'D', 'D', 'R', 'L', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :1800, The number of step:0\n", | |
" The sequence of action is: ['R', 'U', 'R', 'L', 'R', 'L', 'R', 'R', 'D', 'R', 'D', 'L', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :1900, The number of step:0\n", | |
" The sequence of action is: ['U', 'U', 'U', 'L', 'L', 'L', 'R', 'D', 'U', 'R', 'L', 'L', 'D', 'D', 'D', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :2000, The number of step:0\n", | |
" The sequence of action is: ['L', 'U', 'R', 'L', 'L', 'U', 'D', 'R', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :2100, The number of step:0\n", | |
" The sequence of action is: ['U', 'L', 'L', 'R', 'L', 'U', 'R', 'L', 'U', 'D', 'D', 'U', 'D', 'U', 'R', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :2200, The number of step:0\n", | |
" The sequence of action is: ['U', 'R', 'D', 'U', 'U', 'L', 'R', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :2300, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'L', 'L', 'D', 'R', 'L', 'R', 'L', 'R', 'R', 'U', 'D', 'L', 'R', 'R', 'D', 'R', 'U', 'U', 'L', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :2400, The number of step:0\n", | |
" The sequence of action is: ['R', 'R', 'R', 'D', 'D', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :2500, The number of step:0\n", | |
" The sequence of action is: ['R', 'R', 'U', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :2600, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'D', 'R', 'R', 'L', 'R', 'U', 'R', 'L', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :2700, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'U', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :2800, The number of step:0\n", | |
" The sequence of action is: ['L', 'U', 'D', 'D', 'D', 'L', 'R', 'L', 'L', 'U', 'L', 'D', 'R', 'R', 'L', 'R', 'L', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :2900, The number of step:0\n", | |
" The sequence of action is: ['D', 'R', 'L', 'D', 'D', 'R', 'L', 'R', 'R', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :3000, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'L', 'R', 'R', 'R', 'L', 'R', 'U', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :3100, The number of step:0\n", | |
" The sequence of action is: ['U', 'R', 'L', 'R', 'L', 'R', 'L', 'R', 'L', 'U', 'U', 'U', 'D', 'R', 'L', 'L', 'U', 'D', 'U', 'D', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :3200, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'R', 'L', 'U', 'D', 'R', 'R', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :3300, The number of step:0\n", | |
" The sequence of action is: ['U', 'U', 'R', 'D', 'L', 'D', 'D', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :3400, The number of step:0\n", | |
" The sequence of action is: ['U', 'D', 'D', 'L', 'D', 'R', 'R', 'L', 'R', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :3500, The number of step:0\n", | |
" The sequence of action is: ['U', 'U', 'U', 'U', 'R', 'R', 'R', 'L', 'R', 'R', 'L', 'U', 'D', 'L']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :3600, The number of step:0\n", | |
" The sequence of action is: ['U', 'L', 'L', 'R', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :3700, The number of step:0\n", | |
" The sequence of action is: ['U', 'R', 'R', 'R', 'R', 'D', 'U', 'L', 'D', 'D', 'U', 'D', 'R', 'U', 'D', 'L', 'U', 'D', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :3800, The number of step:0\n", | |
" The sequence of action is: ['U', 'U', 'L', 'L', 'U', 'U', 'L', 'U', 'U', 'R', 'D', 'U', 'U', 'R', 'R', 'D', 'R', 'U', 'D', 'L', 'L']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :3900, The number of step:0\n", | |
" The sequence of action is: ['U', 'R', 'R', 'U', 'R', 'D', 'D', 'R', 'L', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :4000, The number of step:0\n", | |
" The sequence of action is: ['U', 'U', 'R', 'R', 'R', 'D', 'U', 'D', 'L']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :4100, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'U', 'L', 'R', 'L', 'D', 'D', 'R', 'L', 'L', 'U', 'D', 'U', 'D', 'R', 'R', 'D', 'U', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :4200, The number of step:0\n", | |
" The sequence of action is: ['L', 'U', 'D', 'U', 'U', 'U', 'U', 'U', 'U', 'L', 'U', 'U', 'U', 'U', 'R', 'R', 'R', 'D', 'D', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :4300, The number of step:0\n", | |
" The sequence of action is: ['U', 'D', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :4400, The number of step:0\n", | |
" The sequence of action is: ['U', 'D', 'U', 'D', 'L', 'U', 'U', 'U', 'U', 'L', 'L', 'R', 'R', 'R', 'D', 'D', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :4500, The number of step:0\n", | |
" The sequence of action is: ['L', 'U', 'R', 'R', 'L', 'L', 'D', 'U', 'U', 'U', 'R', 'R', 'R', 'L', 'R', 'L', 'R', 'U', 'D', 'D', 'U', 'L']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :4600, The number of step:0\n", | |
" The sequence of action is: ['L', 'R', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :4700, The number of step:0\n", | |
" The sequence of action is: ['D', 'U', 'L', 'R', 'R', 'R', 'D', 'U', 'D', 'D', 'D', 'U', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :4800, The number of step:0\n", | |
" The sequence of action is: ['U', 'U', 'D', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :4900, The number of step:0\n", | |
" The sequence of action is: ['U', 'U', 'U', 'L', 'U', 'U', 'U', 'U', 'R', 'R', 'U', 'U', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :5000, The number of step:0\n", | |
" The sequence of action is: ['U', 'R', 'R', 'R', 'D', 'D', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :5100, The number of step:0\n", | |
" The sequence of action is: ['L', 'U', 'L', 'L', 'L', 'D', 'D', 'U', 'D', 'D', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :5200, The number of step:0\n", | |
" The sequence of action is: ['L', 'D', 'D', 'U', 'L', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :5300, The number of step:0\n", | |
" The sequence of action is: ['U', 'D', 'D', 'D', 'D', 'R', 'R', 'U', 'R', 'L', 'L', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :5400, The number of step:0\n", | |
" The sequence of action is: ['U', 'U', 'U', 'L', 'L', 'D', 'L', 'U', 'L', 'L', 'D', 'L', 'D', 'D', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :5500, The number of step:0\n", | |
" The sequence of action is: ['U', 'D', 'R', 'L', 'U', 'D', 'D', 'D', 'R', 'R', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :5600, The number of step:0\n", | |
" The sequence of action is: ['L', 'U', 'D', 'D', 'D', 'D', 'R', 'R', 'L', 'U', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :5700, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'U', 'D', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :5800, The number of step:0\n", | |
" The sequence of action is: ['U', 'D', 'L', 'R', 'L', 'U', 'D', 'L', 'D', 'D', 'U', 'D', 'L', 'R', 'R', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :5900, The number of step:0\n", | |
" The sequence of action is: ['D', 'U', 'R', 'R', 'R', 'D', 'R', 'D', 'U', 'D', 'D', 'U', 'L', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :6000, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'R', 'R', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :6100, The number of step:0\n", | |
" The sequence of action is: ['D', 'R', 'U', 'L', 'D', 'D', 'D', 'R', 'R', 'L', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :6200, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'L', 'D', 'R', 'R', 'D', 'U', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :6300, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'R', 'L', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :6400, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :6500, The number of step:0\n", | |
" The sequence of action is: ['D', 'L', 'D', 'D', 'R', 'R', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :6600, The number of step:0\n", | |
" The sequence of action is: ['R', 'R', 'R', 'D', 'U', 'U', 'L', 'U', 'U', 'R', 'D', 'D', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :6700, The number of step:0\n", | |
" The sequence of action is: ['D', 'R', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :6800, The number of step:0\n", | |
" The sequence of action is: ['D', 'U', 'L', 'D', 'D', 'D', 'R', 'R', 'R', 'D', 'D', 'U', 'L', 'R', 'L', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :6900, The number of step:0\n", | |
" The sequence of action is: ['D', 'L', 'L', 'R', 'L', 'D', 'D', 'R', 'D', 'R', 'L', 'R', 'D', 'U', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :7000, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'R', 'R', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :7100, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'R', 'R', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :7200, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :7300, The number of step:0\n", | |
" The sequence of action is: ['U', 'L', 'L', 'L', 'U', 'R', 'L', 'L', 'D', 'D', 'D', 'R', 'L', 'D', 'R', 'R', 'L', 'R', 'R', 'L', 'U', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :7400, The number of step:0\n", | |
" The sequence of action is: ['D', 'R', 'L', 'D', 'D', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :7500, The number of step:0\n", | |
" The sequence of action is: ['D', 'R', 'L', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :7600, The number of step:0\n", | |
" The sequence of action is: ['L', 'D', 'D', 'D', 'D', 'D', 'R', 'U', 'R', 'L', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :7700, The number of step:0\n", | |
" The sequence of action is: ['U', 'U', 'R', 'D', 'L', 'D', 'D', 'D', 'U', 'D', 'U', 'D', 'R', 'R', 'U', 'D', 'U', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :7800, The number of step:0\n", | |
" The sequence of action is: ['D', 'U', 'D', 'D', 'L', 'D', 'D', 'R', 'U', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :7900, The number of step:0\n", | |
" The sequence of action is: ['L', 'L', 'R', 'R', 'R', 'D', 'D', 'U', 'D', 'D', 'L', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :8000, The number of step:0\n", | |
" The sequence of action is: ['U', 'U', 'L', 'D', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :8100, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'L', 'D', 'R', 'D', 'D', 'U', 'R', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :8200, The number of step:0\n", | |
" The sequence of action is: ['D', 'U', 'D', 'D', 'D', 'D', 'D', 'L', 'R', 'U', 'L', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :8300, The number of step:0\n", | |
" The sequence of action is: ['U', 'L', 'U', 'D', 'D', 'D', 'U', 'D', 'R', 'D', 'R', 'U', 'L', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :8400, The number of step:0\n", | |
" The sequence of action is: ['L', 'D', 'D', 'D', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :8500, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'D', 'L', 'R', 'R', 'U', 'D', 'U', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :8600, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'U', 'D', 'D', 'D', 'R', 'R', 'U', 'R', 'L', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :8700, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'R', 'D', 'R', 'R', 'U', 'L', 'D', 'R', 'U', 'R', 'U', 'L', 'U', 'U', 'R', 'D', 'L', 'D', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :8800, The number of step:0\n", | |
" The sequence of action is: ['L', 'U', 'R', 'U', 'U', 'D', 'L', 'R', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :8900, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'D', 'L', 'D', 'R', 'U', 'R', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :9000, The number of step:0\n", | |
" The sequence of action is: ['R', 'L', 'D', 'D', 'D', 'U', 'D', 'D', 'U', 'D', 'R', 'R', 'U', 'R', 'L', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :9100, The number of step:0\n", | |
" The sequence of action is: ['L', 'D', 'U', 'U', 'D', 'D', 'D', 'D', 'R', 'D', 'R', 'U', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :9200, The number of step:0\n", | |
" The sequence of action is: ['R', 'L', 'U', 'D', 'D', 'D', 'D', 'U', 'L', 'D', 'U', 'D', 'R', 'D', 'R', 'U', 'R', 'L', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :9300, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'D', 'R', 'U', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :9400, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'R', 'L', 'D', 'D', 'R', 'D', 'L', 'R', 'D', 'U', 'R', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :9500, The number of step:0\n", | |
" The sequence of action is: ['R', 'L', 'D', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :9600, The number of step:0\n", | |
" The sequence of action is: ['L', 'D', 'R', 'L', 'D', 'L', 'L', 'U', 'D', 'D', 'D', 'R', 'R', 'U', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :9700, The number of step:0\n", | |
" The sequence of action is: ['L', 'D', 'D', 'D', 'D', 'D', 'R', 'R', 'R', 'U', 'L', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :9800, The number of step:0\n", | |
" The sequence of action is: ['D', 'R', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :9900, The number of step:0\n", | |
" The sequence of action is: ['D', 'D', 'D', 'R', 'D', 'R', 'D', 'D', 'U', 'R', 'L', 'U']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"The accuracy : 0.0 %\n" | |
] | |
} | |
], | |
"source": [ | |
"if __name__ == '__main__':\n", | |
"    # Train a SARSA agent on the grid world and report how often an\n", | |
"    # episode ends on the goal cell.\n", | |
"    env = Env()\n", | |
"    agent = SARSA_agent()\n", | |
"    total_episode = 10000\n", | |
"    sr = 0  # number of episodes that ended on the goal cell\n", | |
"\n", | |
"    for episode in range(total_episode):\n", | |
"        action_sequence = []\n", | |
"        total_reward = 0\n", | |
"        walk = 0  # steps taken in this episode\n", | |
"\n", | |
"        # initial state, action, done\n", | |
"        state = env.reset()\n", | |
"        action = agent.get_action(state)\n", | |
"        done = False\n", | |
"\n", | |
"        while not done:\n", | |
"            agent.save_actionseq(action_sequence, action)\n", | |
"\n", | |
"            # next state, action\n", | |
"            next_state, reward, done = env.step(state, action)\n", | |
"            next_action = agent.get_action(next_state)\n", | |
"            walk += 1\n", | |
"\n", | |
"            # update Qtable\n", | |
"            agent.update(state, action, reward, next_state, next_action)\n", | |
"            total_reward += reward\n", | |
"\n", | |
"            # On-policy: execute the same action that was used in the update\n", | |
"            # instead of sampling a fresh one.\n", | |
"            state, action = next_state, next_action\n", | |
"\n", | |
"        # The loop exits only when done is True, so state is the terminal cell.\n", | |
"        if episode % 100 == 0:\n", | |
"            print('finished at', state)\n", | |
"            print('episode :{}, The number of step:{}\\n The sequence of action is: '\n", | |
"                  '{}\\nThe total reward is: {}\\n'.format(episode, walk, action_sequence, total_reward))\n", | |
"        if state == env.goal:\n", | |
"            sr += 1\n", | |
"\n", | |
"    print('The accuracy :', sr/total_episode*100, '%')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[[-0.02141963, 0.00352836, -0.01987619, -0.06738702],\n", | |
" [-0.08192269, -0.27941545, -0.02033424, -0.073239 ],\n", | |
" [-0.02819004, -0.99093702, -0.0900399 , 0.20826751],\n", | |
" [ 0.17288568, 0.34299325, -0.08307334, 0.12124392],\n", | |
" [ 0.04265317, 0.26065349, 0.08102561, 0.05298695]],\n", | |
"\n", | |
" [[-0.02160627, 0.09308529, -0.01335805, -0.27551594],\n", | |
" [-0.07458866, -0.99483772, -0.01777086, -0.99528416],\n", | |
" [ 0. , 0. , 0. , 0. ],\n", | |
" [ 0.17372915, 0.70905902, -0.94685324, 0.20548302],\n", | |
" [ 0.07712926, 0.42158818, 0.21059578, 0.16854959]],\n", | |
"\n", | |
" [[-0.00889533, 0.32137274, 0.04762571, -0.99999974],\n", | |
" [ 0. , 0. , 0. , 0. ],\n", | |
" [ 0. , 0. , 0. , 0. ],\n", | |
" [ 0.24003516, 0.47886717, 0.99999894, 0.41158612],\n", | |
" [ 0.1750948 , 0.18468542, 0.71407025, 0.36118007]],\n", | |
"\n", | |
" [[ 0.04119386, 0.40326271, 0.30185009, 0.3305425 ],\n", | |
" [-0.99988206, 0.45794044, 0.29537902, 0.7825312 ],\n", | |
" [ 1. , 0.57647976, 0.34688582, 0.54286771],\n", | |
" [ 0.68094631, 0.33405179, 0.74567044, 0.24925788],\n", | |
" [ 0.44738387, 0.04159008, 0.27895347, 0.0937303 ]],\n", | |
"\n", | |
" [[ 0.29540112, 0.36765815, 0.36934295, 0.48613605],\n", | |
" [ 0.37722967, 0.47565743, 0.3693205 , 0.60639638],\n", | |
" [ 0.73857004, 0.57411058, 0.46292071, 0.36906936],\n", | |
" [ 0.58778137, 0.27510338, 0.43339617, 0.0911307 ],\n", | |
" [ 0.07047593, 0.03618115, 0.28645965, 0.02472527]]])" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Learned Q-table, indexed as [x, y, action index]\n", | |
"agent.Qtable" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([-0.08192269, -0.27941545, -0.02033424, -0.073239 ])" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Q-values at state [0, 1], one per action in action_grid order (U, D, L, R)\n", | |
"agent.Qtable[0,1]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([-0.07458866, -0.99483772, -0.01777086, -0.99528416])" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Q-values at state [1, 1], one per action in action_grid order (U, D, L, R)\n", | |
"agent.Qtable[1,1]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([-0.02160627, 0.09308529, -0.01335805, -0.27551594])" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Q-values at state [1, 0], one per action in action_grid order (U, D, L, R)\n", | |
"agent.Qtable[1,0]" | |
] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [default]", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment