Last active
January 26, 2018 04:50
-
-
Save denny0323/9ae110047d81001a11eefb57d8f76819 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# On-Policy Monte-Carlo Control" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class Env:\n", | |
" def __init__(self):\n", | |
" self.grid_width = 5\n", | |
" self.grid_height = self.grid_width\n", | |
" self.action_grid = [(-1, 0), (1, 0), (0, -1), (0, 1)] # U, D, L, R\n", | |
" self.gtriangle1 = [1, 2]\n", | |
" self.gtriangle2 = [2, 1]\n", | |
" self.goal = [2, 2]\n", | |
" \n", | |
" def step(self, state, action):\n", | |
" x, y = state\n", | |
" \n", | |
" # get next state by action\n", | |
" x+= action[0]\n", | |
" y+= action[1]\n", | |
" \n", | |
" if x < 0 :\n", | |
" x = 0\n", | |
" elif x > (self.grid_width-1) :\n", | |
" x = (self.grid_width-1)\n", | |
"\n", | |
" if y < 0 :\n", | |
" y = 0\n", | |
" elif y > (self.grid_width-1) :\n", | |
" y = (self.grid_width-1)\n", | |
" \n", | |
" next_state = [x, y]\n", | |
" \n", | |
" # reward \n", | |
" if next_state == self.gtriangle1 or next_state == self.gtriangle2:\n", | |
" reward = -1\n", | |
" done = True\n", | |
" elif next_state == self.goal:\n", | |
" reward = 1\n", | |
" done = True\n", | |
" else:\n", | |
" reward = 0\n", | |
" done = False\n", | |
" \n", | |
" return next_state, reward, done\n", | |
" \n", | |
" def reset(self):\n", | |
" return [0, 0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class MC_agent:\n", | |
" def __init__(self):\n", | |
" self.action_grid = [(-1, 0), (1, 0), (0, -1), (0, 1)]\n", | |
" self.action_text= ['U', 'D', 'L', 'R']\n", | |
" self.grid_width = 5\n", | |
" self.grid_height = self.grid_width\n", | |
" self.value_table = np.zeros((self.grid_width, self.grid_height))\n", | |
" self.e = .1\n", | |
" self.learning_rate = .01\n", | |
" self.discount_factor = .95\n", | |
" self.memory=[]\n", | |
" \n", | |
" def get_action(self, state):\n", | |
" # with prob.ε take random action\n", | |
" if np.random.randn() < self.e :\n", | |
" idx = np.random.choice(len(self.action_grid),1)[0]\n", | |
" else :\n", | |
" next_values = np.array([])\n", | |
" for s in self.next_states(state):\n", | |
" next_values= np.append(next_values, self.value_table[tuple(s)])\n", | |
" max_value = np.amax(next_values)\n", | |
" tie_Qchecker = np.where(next_values==max_value)[0]\n", | |
" \n", | |
" # if tie max value, get random\n", | |
" if len(tie_Qchecker) > 1:\n", | |
" idx = np.random.choice(tie_Qchecker, 1)[0]\n", | |
" else :\n", | |
" idx = np.argmax(next_values)\n", | |
" action = self.action_grid[idx]\n", | |
" return action\n", | |
" \n", | |
" def next_states(self, state):\n", | |
" x, y = state\n", | |
" next_S = []\n", | |
" for action in self.action_grid:\n", | |
" # calculate x_coordinate\n", | |
" x+=action[0]\n", | |
" if x < 0:\n", | |
" x = 0\n", | |
" elif x > 4:\n", | |
" x = 4 \n", | |
" # calculate x_coordinate\n", | |
" y+=action[1]\n", | |
" if x < 0:\n", | |
" x = 0\n", | |
" elif x > 4:\n", | |
" x = 4\n", | |
" next_S.append([x, y]) \n", | |
" return next_S \n", | |
" \n", | |
" # using First visit MC \n", | |
" def update(self):\n", | |
" G_t = 0\n", | |
" visit_states=[]\n", | |
" for sample in reversed(self.memory):\n", | |
" state = sample[0]\n", | |
" reward = sample[1]\n", | |
" if state not in visit_states:\n", | |
" visit_states.append(state)\n", | |
" G_t = reward + self.discount_factor*G_t\n", | |
" V_t = self.value_table[tuple(state)]\n", | |
" # update Value\n", | |
" self.value_table[tuple(state)] = V_t + self.learning_rate*(G_t - V_t)\n", | |
" \n", | |
" def memorizer(self, state, reward, done):\n", | |
" self.memory.append([state, reward, done])\n", | |
" \n", | |
" def save_actionseq(self, action_sequence, action):\n", | |
" idx = self.action_grid.index(action)\n", | |
" action_sequence.append(self.action_text[idx])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"finished at [1, 2]\n", | |
"episode :0, The number of step:42\n", | |
" The sequence of action is: ['R', 'U', 'L', 'R', 'R', 'L', 'L', 'U', 'R', 'U', 'U', 'L', 'R', 'L', 'L', 'U', 'R', 'D', 'U', 'U', 'D', 'U', 'R', 'L', 'L', 'R', 'R', 'L', 'R', 'R', 'R', 'D', 'R', 'R', 'U', 'D', 'R', 'U', 'U', 'L', 'L', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :100, The number of step:15\n", | |
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'L', 'D', 'U', 'L', 'D', 'R', 'U', 'U', 'D', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :200, The number of step:70\n", | |
" The sequence of action is: ['L', 'L', 'L', 'U', 'L', 'D', 'U', 'U', 'L', 'L', 'L', 'R', 'L', 'D', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'D', 'U', 'D', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'U', 'L', 'D', 'D', 'L', 'L', 'L', 'L', 'U', 'D', 'L', 'L', 'U', 'L', 'R', 'L', 'U', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'D', 'U', 'L', 'L', 'R', 'L', 'D', 'L', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :300, The number of step:10\n", | |
" The sequence of action is: ['L', 'L', 'L', 'D', 'L', 'R', 'L', 'L', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :400, The number of step:10\n", | |
" The sequence of action is: ['L', 'R', 'D', 'U', 'U', 'U', 'R', 'R', 'D', 'L']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :500, The number of step:9\n", | |
" The sequence of action is: ['L', 'L', 'U', 'L', 'L', 'D', 'L', 'R', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :600, The number of step:46\n", | |
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'D', 'L', 'U', 'L', 'L', 'R', 'U', 'U', 'U', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'D', 'R', 'D', 'U', 'R', 'D', 'L', 'D', 'D', 'L', 'U', 'R', 'D', 'U', 'U', 'U', 'L']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :700, The number of step:7\n", | |
" The sequence of action is: ['D', 'L', 'R', 'L', 'L', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :800, The number of step:49\n", | |
" The sequence of action is: ['L', 'L', 'U', 'R', 'L', 'R', 'L', 'L', 'L', 'R', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'R', 'L', 'R', 'U', 'L', 'U', 'U', 'U', 'L', 'L', 'D', 'U', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :900, The number of step:19\n", | |
" The sequence of action is: ['L', 'L', 'L', 'R', 'L', 'R', 'L', 'R', 'U', 'D', 'U', 'U', 'L', 'L', 'L', 'U', 'D', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :1000, The number of step:40\n", | |
" The sequence of action is: ['L', 'L', 'L', 'R', 'L', 'L', 'L', 'L', 'D', 'U', 'D', 'U', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'U', 'L', 'D', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :1100, The number of step:10\n", | |
" The sequence of action is: ['L', 'L', 'U', 'L', 'L', 'U', 'D', 'L', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :1200, The number of step:35\n", | |
" The sequence of action is: ['L', 'U', 'L', 'L', 'L', 'L', 'L', 'U', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'U', 'D', 'U', 'U', 'L', 'L', 'U', 'D', 'L', 'D', 'D', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :1300, The number of step:27\n", | |
" The sequence of action is: ['L', 'D', 'L', 'L', 'U', 'L', 'U', 'U', 'L', 'L', 'R', 'D', 'L', 'L', 'U', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'U', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :1400, The number of step:27\n", | |
" The sequence of action is: ['U', 'D', 'D', 'L', 'U', 'D', 'L', 'D', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'U', 'U', 'R', 'U', 'L', 'L', 'U', 'R', 'U', 'R', 'U', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :1500, The number of step:10\n", | |
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :1600, The number of step:19\n", | |
" The sequence of action is: ['D', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'U', 'L', 'L', 'L', 'U', 'L', 'R', 'D', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :1700, The number of step:7\n", | |
" The sequence of action is: ['L', 'U', 'D', 'L', 'D', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :1800, The number of step:35\n", | |
" The sequence of action is: ['L', 'L', 'L', 'U', 'R', 'L', 'R', 'L', 'U', 'L', 'D', 'L', 'L', 'L', 'L', 'R', 'L', 'R', 'U', 'L', 'L', 'U', 'U', 'L', 'L', 'D', 'U', 'D', 'R', 'L', 'R', 'L', 'L', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :1900, The number of step:7\n", | |
" The sequence of action is: ['L', 'U', 'L', 'R', 'U', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :2000, The number of step:18\n", | |
" The sequence of action is: ['L', 'D', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'U', 'L', 'D', 'L', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :2100, The number of step:4\n", | |
" The sequence of action is: ['U', 'R', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :2200, The number of step:8\n", | |
" The sequence of action is: ['U', 'L', 'L', 'D', 'D', 'L', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :2300, The number of step:4\n", | |
" The sequence of action is: ['D', 'D', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :2400, The number of step:11\n", | |
" The sequence of action is: ['L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :2500, The number of step:47\n", | |
" The sequence of action is: ['L', 'L', 'L', 'L', 'D', 'L', 'R', 'U', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'D', 'L', 'U', 'D', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'R', 'L', 'L', 'R', 'L', 'R', 'D', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :2600, The number of step:3\n", | |
" The sequence of action is: ['R', 'D', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :2700, The number of step:19\n", | |
" The sequence of action is: ['L', 'U', 'U', 'L', 'R', 'U', 'L', 'U', 'R', 'R', 'R', 'R', 'D', 'U', 'L', 'L', 'R', 'L', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :2800, The number of step:29\n", | |
" The sequence of action is: ['L', 'L', 'L', 'R', 'U', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'R', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :2900, The number of step:11\n", | |
" The sequence of action is: ['D', 'D', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :3000, The number of step:13\n", | |
" The sequence of action is: ['U', 'L', 'L', 'D', 'U', 'L', 'L', 'D', 'U', 'L', 'R', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :3100, The number of step:35\n", | |
" The sequence of action is: ['L', 'D', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'R', 'U', 'U', 'D', 'U', 'D', 'U', 'L', 'U', 'L', 'U', 'L', 'R', 'L', 'D', 'L', 'L', 'L', 'U', 'D', 'R', 'U', 'U', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :3200, The number of step:30\n", | |
" The sequence of action is: ['D', 'L', 'U', 'D', 'R', 'L', 'D', 'L', 'D', 'L', 'U', 'L', 'D', 'L', 'U', 'D', 'R', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :3300, The number of step:5\n", | |
" The sequence of action is: ['L', 'U', 'R', 'D', 'D']\n", | |
"The total reward is: -1\n", | |
"\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"finished at [1, 2]\n", | |
"episode :3400, The number of step:8\n", | |
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'D', 'R', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :3500, The number of step:9\n", | |
" The sequence of action is: ['R', 'R', 'U', 'R', 'U', 'L', 'U', 'U', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :3600, The number of step:68\n", | |
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'R', 'L', 'U', 'L', 'L', 'L', 'L', 'D', 'D', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'U', 'U', 'U', 'L', 'L', 'U', 'R', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'D', 'L', 'L', 'R', 'L', 'L', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :3700, The number of step:12\n", | |
" The sequence of action is: ['L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'U', 'D', 'R', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :3800, The number of step:82\n", | |
" The sequence of action is: ['D', 'R', 'L', 'R', 'U', 'U', 'L', 'U', 'U', 'U', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'R', 'D', 'U', 'L', 'L', 'U', 'L', 'L', 'R', 'L', 'D', 'L', 'U', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'U', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'D', 'L', 'L', 'U', 'L', 'U', 'D', 'L', 'U', 'U', 'R', 'L', 'D', 'L', 'L', 'U', 'D', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :3900, The number of step:16\n", | |
" The sequence of action is: ['D', 'L', 'L', 'L', 'D', 'U', 'D', 'U', 'U', 'R', 'L', 'D', 'D', 'L', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :4000, The number of step:4\n", | |
" The sequence of action is: ['U', 'R', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :4100, The number of step:20\n", | |
" The sequence of action is: ['L', 'L', 'U', 'L', 'U', 'L', 'D', 'L', 'L', 'L', 'U', 'U', 'D', 'D', 'L', 'D', 'D', 'U', 'U', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :4200, The number of step:6\n", | |
" The sequence of action is: ['R', 'L', 'L', 'R', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :4300, The number of step:9\n", | |
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'L', 'D', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :4400, The number of step:9\n", | |
" The sequence of action is: ['D', 'L', 'D', 'D', 'L', 'R', 'L', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :4500, The number of step:23\n", | |
" The sequence of action is: ['L', 'R', 'L', 'D', 'L', 'L', 'R', 'U', 'L', 'R', 'U', 'L', 'U', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :4600, The number of step:42\n", | |
" The sequence of action is: ['D', 'L', 'U', 'L', 'U', 'L', 'L', 'U', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'R', 'U', 'L', 'R', 'L', 'L', 'L', 'U', 'R', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :4700, The number of step:20\n", | |
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'U', 'R', 'L', 'L', 'L', 'L', 'L', 'D', 'D', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :4800, The number of step:34\n", | |
" The sequence of action is: ['L', 'L', 'L', 'L', 'D', 'D', 'L', 'L', 'U', 'U', 'U', 'D', 'L', 'L', 'L', 'U', 'R', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'D', 'L', 'D', 'U', 'L', 'L', 'L', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :4900, The number of step:9\n", | |
" The sequence of action is: ['D', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :5000, The number of step:19\n", | |
" The sequence of action is: ['D', 'L', 'R', 'U', 'D', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'R', 'R', 'L', 'U', 'R', 'L', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :5100, The number of step:29\n", | |
" The sequence of action is: ['D', 'L', 'L', 'L', 'L', 'U', 'D', 'L', 'L', 'L', 'D', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'R', 'R', 'U', 'R', 'U', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :5200, The number of step:12\n", | |
" The sequence of action is: ['L', 'L', 'D', 'D', 'D', 'D', 'D', 'L', 'L', 'U', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :5300, The number of step:45\n", | |
" The sequence of action is: ['L', 'L', 'L', 'R', 'L', 'R', 'D', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'D', 'L', 'L', 'D', 'U', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'U', 'L', 'D', 'D', 'L', 'L', 'D', 'L', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :5400, The number of step:39\n", | |
" The sequence of action is: ['L', 'D', 'L', 'U', 'L', 'L', 'L', 'R', 'U', 'U', 'L', 'R', 'L', 'L', 'L', 'R', 'L', 'L', 'U', 'D', 'L', 'L', 'L', 'L', 'D', 'D', 'L', 'U', 'L', 'L', 'D', 'D', 'L', 'L', 'L', 'U', 'L', 'U', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :5500, The number of step:7\n", | |
" The sequence of action is: ['U', 'L', 'R', 'U', 'U', 'D', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :5600, The number of step:19\n", | |
" The sequence of action is: ['U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :5700, The number of step:16\n", | |
" The sequence of action is: ['L', 'L', 'R', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'D', 'U', 'L', 'R', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :5800, The number of step:3\n", | |
" The sequence of action is: ['R', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :5900, The number of step:41\n", | |
" The sequence of action is: ['L', 'L', 'L', 'L', 'R', 'L', 'L', 'L', 'L', 'R', 'L', 'U', 'U', 'U', 'D', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'U', 'U', 'U', 'L', 'L', 'U', 'L', 'R', 'R', 'R', 'L', 'R', 'U', 'L', 'R', 'L', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :6000, The number of step:46\n", | |
" The sequence of action is: ['L', 'L', 'D', 'L', 'L', 'R', 'L', 'U', 'L', 'U', 'D', 'R', 'U', 'R', 'L', 'U', 'U', 'U', 'D', 'U', 'L', 'L', 'L', 'L', 'L', 'D', 'D', 'U', 'L', 'L', 'U', 'L', 'L', 'D', 'L', 'R', 'U', 'U', 'U', 'L', 'L', 'R', 'U', 'U', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :6100, The number of step:50\n", | |
" The sequence of action is: ['L', 'L', 'D', 'R', 'U', 'L', 'L', 'D', 'R', 'L', 'U', 'L', 'R', 'L', 'L', 'R', 'L', 'D', 'L', 'L', 'R', 'L', 'L', 'L', 'U', 'R', 'L', 'R', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'D', 'L', 'R', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :6200, The number of step:17\n", | |
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'D', 'L', 'U', 'U', 'R', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :6300, The number of step:5\n", | |
" The sequence of action is: ['R', 'U', 'U', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :6400, The number of step:16\n", | |
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'D', 'D', 'L', 'L', 'L', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :6500, The number of step:6\n", | |
" The sequence of action is: ['L', 'L', 'U', 'D', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :6600, The number of step:25\n", | |
" The sequence of action is: ['L', 'U', 'L', 'L', 'R', 'R', 'R', 'L', 'L', 'U', 'U', 'U', 'R', 'R', 'L', 'L', 'L', 'L', 'L', 'D', 'U', 'R', 'U', 'D', 'D']\n", | |
"The total reward is: -1\n", | |
"\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"finished at [2, 1]\n", | |
"episode :6700, The number of step:7\n", | |
" The sequence of action is: ['L', 'L', 'L', 'D', 'L', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :6800, The number of step:37\n", | |
" The sequence of action is: ['L', 'L', 'U', 'R', 'D', 'U', 'R', 'R', 'L', 'R', 'R', 'U', 'D', 'D', 'L', 'D', 'U', 'D', 'R', 'R', 'L', 'L', 'L', 'D', 'U', 'R', 'R', 'R', 'L', 'L', 'L', 'D', 'U', 'D', 'D', 'U', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :6900, The number of step:17\n", | |
" The sequence of action is: ['L', 'L', 'L', 'U', 'L', 'L', 'L', 'R', 'L', 'U', 'L', 'L', 'D', 'D', 'L', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :7000, The number of step:24\n", | |
" The sequence of action is: ['D', 'D', 'L', 'D', 'D', 'L', 'L', 'L', 'L', 'U', 'L', 'R', 'D', 'D', 'R', 'U', 'R', 'R', 'L', 'L', 'D', 'U', 'L', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :7100, The number of step:96\n", | |
" The sequence of action is: ['L', 'R', 'U', 'U', 'U', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'R', 'U', 'U', 'U', 'U', 'U', 'L', 'L', 'L', 'L', 'R', 'D', 'U', 'U', 'U', 'L', 'L', 'L', 'R', 'U', 'U', 'L', 'D', 'D', 'L', 'D', 'L', 'D', 'D', 'L', 'U', 'D', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'L', 'U', 'U', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'R', 'U', 'L', 'L', 'L', 'U', 'L', 'L', 'D', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :7200, The number of step:13\n", | |
" The sequence of action is: ['U', 'L', 'L', 'L', 'L', 'D', 'U', 'L', 'R', 'U', 'U', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :7300, The number of step:6\n", | |
" The sequence of action is: ['L', 'R', 'U', 'U', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :7400, The number of step:11\n", | |
" The sequence of action is: ['L', 'L', 'L', 'R', 'D', 'U', 'L', 'L', 'D', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :7500, The number of step:39\n", | |
" The sequence of action is: ['R', 'U', 'L', 'R', 'L', 'L', 'L', 'R', 'D', 'L', 'L', 'L', 'R', 'U', 'L', 'R', 'L', 'L', 'L', 'U', 'D', 'R', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'R', 'L', 'U', 'R', 'R', 'R', 'D', 'L']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :7600, The number of step:37\n", | |
" The sequence of action is: ['R', 'R', 'L', 'R', 'L', 'U', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'R', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'L', 'D', 'L', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :7700, The number of step:19\n", | |
" The sequence of action is: ['L', 'L', 'R', 'U', 'L', 'L', 'L', 'D', 'U', 'L', 'R', 'L', 'L', 'D', 'L', 'L', 'L', 'R', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :7800, The number of step:13\n", | |
" The sequence of action is: ['L', 'U', 'U', 'R', 'L', 'L', 'R', 'L', 'L', 'R', 'U', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :7900, The number of step:34\n", | |
" The sequence of action is: ['D', 'L', 'D', 'L', 'L', 'D', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'U', 'L', 'D', 'L', 'L', 'L', 'R', 'D', 'R', 'L', 'U', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :8000, The number of step:6\n", | |
" The sequence of action is: ['L', 'R', 'D', 'U', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :8100, The number of step:28\n", | |
" The sequence of action is: ['L', 'U', 'U', 'D', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'D', 'U', 'D', 'L', 'L', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'L', 'D', 'L', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :8200, The number of step:7\n", | |
" The sequence of action is: ['L', 'L', 'D', 'D', 'L', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 2]\n", | |
"episode :8300, The number of step:57\n", | |
" The sequence of action is: ['L', 'U', 'L', 'L', 'L', 'L', 'L', 'U', 'D', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'R', 'U', 'U', 'U', 'U', 'D', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'R', 'R', 'U', 'D', 'R', 'D', 'R', 'L', 'D', 'U', 'L']\n", | |
"The total reward is: 1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :8400, The number of step:25\n", | |
" The sequence of action is: ['L', 'D', 'U', 'D', 'D', 'L', 'U', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'D', 'U', 'L', 'L', 'R', 'U', 'U', 'R', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :8500, The number of step:48\n", | |
" The sequence of action is: ['L', 'R', 'R', 'R', 'R', 'U', 'R', 'U', 'R', 'R', 'D', 'D', 'D', 'U', 'U', 'R', 'D', 'L', 'D', 'D', 'D', 'R', 'U', 'D', 'U', 'D', 'U', 'L', 'D', 'R', 'U', 'R', 'R', 'D', 'U', 'R', 'D', 'U', 'R', 'L', 'R', 'R', 'L', 'D', 'L', 'U', 'L', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :8600, The number of step:25\n", | |
" The sequence of action is: ['U', 'L', 'D', 'D', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'U', 'U', 'U', 'R', 'R', 'L', 'R', 'L', 'R', 'L', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :8700, The number of step:30\n", | |
" The sequence of action is: ['L', 'R', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'R', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'D', 'U', 'L', 'U', 'U', 'L', 'L', 'D', 'L', 'L', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :8800, The number of step:9\n", | |
" The sequence of action is: ['L', 'L', 'L', 'U', 'D', 'L', 'L', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :8900, The number of step:67\n", | |
" The sequence of action is: ['U', 'D', 'L', 'U', 'R', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'R', 'L', 'U', 'U', 'L', 'L', 'L', 'U', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'D', 'R', 'L', 'L', 'L', 'U', 'R', 'L', 'D', 'L', 'L', 'R', 'L', 'L', 'L', 'L', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :9000, The number of step:5\n", | |
" The sequence of action is: ['R', 'D', 'U', 'D', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :9100, The number of step:68\n", | |
" The sequence of action is: ['L', 'L', 'L', 'R', 'U', 'U', 'U', 'U', 'U', 'L', 'L', 'R', 'U', 'U', 'U', 'U', 'U', 'L', 'L', 'R', 'D', 'U', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'U', 'L', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'U', 'D', 'L', 'R', 'U', 'U', 'L', 'D', 'U', 'D', 'R', 'U', 'D', 'U', 'U', 'U', 'U', 'R', 'U', 'R', 'L', 'R', 'U', 'L', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :9200, The number of step:13\n", | |
" The sequence of action is: ['R', 'U', 'U', 'U', 'L', 'U', 'U', 'D', 'L', 'L', 'D', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :9300, The number of step:7\n", | |
" The sequence of action is: ['L', 'L', 'L', 'L', 'D', 'R', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :9400, The number of step:28\n", | |
" The sequence of action is: ['U', 'L', 'U', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'L', 'L', 'L', 'D', 'L', 'L', 'R', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :9500, The number of step:19\n", | |
" The sequence of action is: ['U', 'L', 'D', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'D', 'L', 'U', 'L', 'D', 'L', 'L', 'R', 'U']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :9600, The number of step:10\n", | |
" The sequence of action is: ['R', 'D', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [1, 2]\n", | |
"episode :9700, The number of step:50\n", | |
" The sequence of action is: ['U', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'U', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'U', 'L', 'L', 'R', 'R', 'R', 'L', 'U', 'D']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :9800, The number of step:6\n", | |
" The sequence of action is: ['D', 'L', 'L', 'D', 'L', 'R']\n", | |
"The total reward is: -1\n", | |
"\n", | |
"finished at [2, 1]\n", | |
"episode :9900, The number of step:8\n", | |
" The sequence of action is: ['L', 'L', 'U', 'L', 'D', 'L', 'D', 'R']\n", | |
"The total reward is: -1\n", | |
"\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The accuracy : 5.01 %\n" | |
] | |
} | |
], | |
"source": [ | |
"if __name__ == \"__main__\":\n", | |
" env = Env()\n", | |
" agent = MC_agent()\n", | |
" total_episode = 10000\n", | |
" sr=0\n", | |
" \n", | |
" for episode in range(total_episode):\n", | |
" action_sequence=[]\n", | |
" total_reward = 0\n", | |
" state = env.reset()\n", | |
" action = agent.get_action(state)\n", | |
" done = False\n", | |
" walk = 0\n", | |
" \n", | |
" while True:\n", | |
" next_state, reward, done = env.step(state, action)\n", | |
" agent.memorizer(state, reward, done)\n", | |
" agent.save_actionseq(action_sequence, action)\n", | |
" walk += 1\n", | |
" \n", | |
" # next state and action \n", | |
" state = next_state\n", | |
" action = agent.get_action(state)\n", | |
" total_reward+=reward\n", | |
" \n", | |
" if done:\n", | |
" if episode % 100 == 0 :\n", | |
" print('finished at', state)\n", | |
" print('episode :{}, The number of step:{}\\n The sequence of action is:\\\n", | |
" {}\\nThe total reward is: {}\\n'.format(episode, walk, action_sequence, total_reward))\n", | |
" if state == env.goal:\n", | |
" sr+=1\n", | |
" agent.update()\n", | |
" agent.memory.clear()\n", | |
" break\n", | |
" \n", | |
"print('The accuracy :', sr/total_episode*100, '%')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[-0.77451118, -0.81772992, -0.87861399, -0.75259688, -0.33052417],\n", | |
" [-0.80686915, -0.85519428, 0. , -0.72970452, -0.1901708 ],\n", | |
" [-0.78586143, 0. , 0. , 0.07326729, 0.07587984],\n", | |
" [-0.57704779, -0.7415952 , 0.12508784, 0.09977261, 0.15429315],\n", | |
" [-0.33355271, -0.1954709 , 0.0564361 , 0.07972603, 0.13846906]])" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"agent.value_table" | |
] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [default]", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment