Skip to content

Instantly share code, notes, and snippets.

@denny0323
Last active January 26, 2018 04:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save denny0323/9ae110047d81001a11eefb57d8f76819 to your computer and use it in GitHub Desktop.
Save denny0323/9ae110047d81001a11eefb57d8f76819 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# On-Policy Monte-Carlo Control"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"class Env:\n",
" def __init__(self):\n",
" self.grid_width = 5\n",
" self.grid_height = self.grid_width\n",
" self.action_grid = [(-1, 0), (1, 0), (0, -1), (0, 1)] # U, D, L, R\n",
" self.gtriangle1 = [1, 2]\n",
" self.gtriangle2 = [2, 1]\n",
" self.goal = [2, 2]\n",
" \n",
" def step(self, state, action):\n",
" x, y = state\n",
" \n",
" # get next state by action\n",
" x+= action[0]\n",
" y+= action[1]\n",
" \n",
" if x < 0 :\n",
" x = 0\n",
" elif x > (self.grid_width-1) :\n",
" x = (self.grid_width-1)\n",
"\n",
" if y < 0 :\n",
" y = 0\n",
" elif y > (self.grid_width-1) :\n",
" y = (self.grid_width-1)\n",
" \n",
" next_state = [x, y]\n",
" \n",
" # reward \n",
" if next_state == self.gtriangle1 or next_state == self.gtriangle2:\n",
" reward = -1\n",
" done = True\n",
" elif next_state == self.goal:\n",
" reward = 1\n",
" done = True\n",
" else:\n",
" reward = 0\n",
" done = False\n",
" \n",
" return next_state, reward, done\n",
" \n",
" def reset(self):\n",
" return [0, 0]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"class MC_agent:\n",
" def __init__(self):\n",
" self.action_grid = [(-1, 0), (1, 0), (0, -1), (0, 1)]\n",
" self.action_text= ['U', 'D', 'L', 'R']\n",
" self.grid_width = 5\n",
" self.grid_height = self.grid_width\n",
" self.value_table = np.zeros((self.grid_width, self.grid_height))\n",
" self.e = .1\n",
" self.learning_rate = .01\n",
" self.discount_factor = .95\n",
" self.memory=[]\n",
" \n",
" def get_action(self, state):\n",
" # with prob.ε take random action\n",
" if np.random.randn() < self.e :\n",
" idx = np.random.choice(len(self.action_grid),1)[0]\n",
" else :\n",
" next_values = np.array([])\n",
" for s in self.next_states(state):\n",
" next_values= np.append(next_values, self.value_table[tuple(s)])\n",
" max_value = np.amax(next_values)\n",
" tie_Qchecker = np.where(next_values==max_value)[0]\n",
" \n",
" # if tie max value, get random\n",
" if len(tie_Qchecker) > 1:\n",
" idx = np.random.choice(tie_Qchecker, 1)[0]\n",
" else :\n",
" idx = np.argmax(next_values)\n",
" action = self.action_grid[idx]\n",
" return action\n",
" \n",
" def next_states(self, state):\n",
" x, y = state\n",
" next_S = []\n",
" for action in self.action_grid:\n",
" # calculate x_coordinate\n",
" x+=action[0]\n",
" if x < 0:\n",
" x = 0\n",
" elif x > 4:\n",
" x = 4 \n",
" # calculate x_coordinate\n",
" y+=action[1]\n",
" if x < 0:\n",
" x = 0\n",
" elif x > 4:\n",
" x = 4\n",
" next_S.append([x, y]) \n",
" return next_S \n",
" \n",
" # using First visit MC \n",
" def update(self):\n",
" G_t = 0\n",
" visit_states=[]\n",
" for sample in reversed(self.memory):\n",
" state = sample[0]\n",
" reward = sample[1]\n",
" if state not in visit_states:\n",
" visit_states.append(state)\n",
" G_t = reward + self.discount_factor*G_t\n",
" V_t = self.value_table[tuple(state)]\n",
" # update Value\n",
" self.value_table[tuple(state)] = V_t + self.learning_rate*(G_t - V_t)\n",
" \n",
" def memorizer(self, state, reward, done):\n",
" self.memory.append([state, reward, done])\n",
" \n",
" def save_actionseq(self, action_sequence, action):\n",
" idx = self.action_grid.index(action)\n",
" action_sequence.append(self.action_text[idx])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"finished at [1, 2]\n",
"episode :0, The number of step:42\n",
" The sequence of action is: ['R', 'U', 'L', 'R', 'R', 'L', 'L', 'U', 'R', 'U', 'U', 'L', 'R', 'L', 'L', 'U', 'R', 'D', 'U', 'U', 'D', 'U', 'R', 'L', 'L', 'R', 'R', 'L', 'R', 'R', 'R', 'D', 'R', 'R', 'U', 'D', 'R', 'U', 'U', 'L', 'L', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :100, The number of step:15\n",
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'L', 'D', 'U', 'L', 'D', 'R', 'U', 'U', 'D', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :200, The number of step:70\n",
" The sequence of action is: ['L', 'L', 'L', 'U', 'L', 'D', 'U', 'U', 'L', 'L', 'L', 'R', 'L', 'D', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'D', 'U', 'D', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'U', 'L', 'D', 'D', 'L', 'L', 'L', 'L', 'U', 'D', 'L', 'L', 'U', 'L', 'R', 'L', 'U', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'D', 'U', 'L', 'L', 'R', 'L', 'D', 'L', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :300, The number of step:10\n",
" The sequence of action is: ['L', 'L', 'L', 'D', 'L', 'R', 'L', 'L', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :400, The number of step:10\n",
" The sequence of action is: ['L', 'R', 'D', 'U', 'U', 'U', 'R', 'R', 'D', 'L']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :500, The number of step:9\n",
" The sequence of action is: ['L', 'L', 'U', 'L', 'L', 'D', 'L', 'R', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :600, The number of step:46\n",
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'D', 'L', 'U', 'L', 'L', 'R', 'U', 'U', 'U', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'D', 'R', 'D', 'U', 'R', 'D', 'L', 'D', 'D', 'L', 'U', 'R', 'D', 'U', 'U', 'U', 'L']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :700, The number of step:7\n",
" The sequence of action is: ['D', 'L', 'R', 'L', 'L', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :800, The number of step:49\n",
" The sequence of action is: ['L', 'L', 'U', 'R', 'L', 'R', 'L', 'L', 'L', 'R', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'R', 'L', 'R', 'U', 'L', 'U', 'U', 'U', 'L', 'L', 'D', 'U', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :900, The number of step:19\n",
" The sequence of action is: ['L', 'L', 'L', 'R', 'L', 'R', 'L', 'R', 'U', 'D', 'U', 'U', 'L', 'L', 'L', 'U', 'D', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :1000, The number of step:40\n",
" The sequence of action is: ['L', 'L', 'L', 'R', 'L', 'L', 'L', 'L', 'D', 'U', 'D', 'U', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'U', 'L', 'D', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :1100, The number of step:10\n",
" The sequence of action is: ['L', 'L', 'U', 'L', 'L', 'U', 'D', 'L', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :1200, The number of step:35\n",
" The sequence of action is: ['L', 'U', 'L', 'L', 'L', 'L', 'L', 'U', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'U', 'D', 'U', 'U', 'L', 'L', 'U', 'D', 'L', 'D', 'D', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :1300, The number of step:27\n",
" The sequence of action is: ['L', 'D', 'L', 'L', 'U', 'L', 'U', 'U', 'L', 'L', 'R', 'D', 'L', 'L', 'U', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'U', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :1400, The number of step:27\n",
" The sequence of action is: ['U', 'D', 'D', 'L', 'U', 'D', 'L', 'D', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'U', 'U', 'R', 'U', 'L', 'L', 'U', 'R', 'U', 'R', 'U', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :1500, The number of step:10\n",
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :1600, The number of step:19\n",
" The sequence of action is: ['D', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'U', 'L', 'L', 'L', 'U', 'L', 'R', 'D', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :1700, The number of step:7\n",
" The sequence of action is: ['L', 'U', 'D', 'L', 'D', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :1800, The number of step:35\n",
" The sequence of action is: ['L', 'L', 'L', 'U', 'R', 'L', 'R', 'L', 'U', 'L', 'D', 'L', 'L', 'L', 'L', 'R', 'L', 'R', 'U', 'L', 'L', 'U', 'U', 'L', 'L', 'D', 'U', 'D', 'R', 'L', 'R', 'L', 'L', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :1900, The number of step:7\n",
" The sequence of action is: ['L', 'U', 'L', 'R', 'U', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :2000, The number of step:18\n",
" The sequence of action is: ['L', 'D', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'U', 'L', 'D', 'L', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :2100, The number of step:4\n",
" The sequence of action is: ['U', 'R', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :2200, The number of step:8\n",
" The sequence of action is: ['U', 'L', 'L', 'D', 'D', 'L', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :2300, The number of step:4\n",
" The sequence of action is: ['D', 'D', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :2400, The number of step:11\n",
" The sequence of action is: ['L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :2500, The number of step:47\n",
" The sequence of action is: ['L', 'L', 'L', 'L', 'D', 'L', 'R', 'U', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'D', 'L', 'U', 'D', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'R', 'L', 'L', 'R', 'L', 'R', 'D', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :2600, The number of step:3\n",
" The sequence of action is: ['R', 'D', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :2700, The number of step:19\n",
" The sequence of action is: ['L', 'U', 'U', 'L', 'R', 'U', 'L', 'U', 'R', 'R', 'R', 'R', 'D', 'U', 'L', 'L', 'R', 'L', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :2800, The number of step:29\n",
" The sequence of action is: ['L', 'L', 'L', 'R', 'U', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'R', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :2900, The number of step:11\n",
" The sequence of action is: ['D', 'D', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :3000, The number of step:13\n",
" The sequence of action is: ['U', 'L', 'L', 'D', 'U', 'L', 'L', 'D', 'U', 'L', 'R', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :3100, The number of step:35\n",
" The sequence of action is: ['L', 'D', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'R', 'U', 'U', 'D', 'U', 'D', 'U', 'L', 'U', 'L', 'U', 'L', 'R', 'L', 'D', 'L', 'L', 'L', 'U', 'D', 'R', 'U', 'U', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :3200, The number of step:30\n",
" The sequence of action is: ['D', 'L', 'U', 'D', 'R', 'L', 'D', 'L', 'D', 'L', 'U', 'L', 'D', 'L', 'U', 'D', 'R', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :3300, The number of step:5\n",
" The sequence of action is: ['L', 'U', 'R', 'D', 'D']\n",
"The total reward is: -1\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"finished at [1, 2]\n",
"episode :3400, The number of step:8\n",
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'D', 'R', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :3500, The number of step:9\n",
" The sequence of action is: ['R', 'R', 'U', 'R', 'U', 'L', 'U', 'U', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :3600, The number of step:68\n",
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'R', 'L', 'U', 'L', 'L', 'L', 'L', 'D', 'D', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'U', 'U', 'U', 'L', 'L', 'U', 'R', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'D', 'L', 'L', 'R', 'L', 'L', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :3700, The number of step:12\n",
" The sequence of action is: ['L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'U', 'D', 'R', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :3800, The number of step:82\n",
" The sequence of action is: ['D', 'R', 'L', 'R', 'U', 'U', 'L', 'U', 'U', 'U', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'R', 'D', 'U', 'L', 'L', 'U', 'L', 'L', 'R', 'L', 'D', 'L', 'U', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'U', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'D', 'L', 'L', 'U', 'L', 'U', 'D', 'L', 'U', 'U', 'R', 'L', 'D', 'L', 'L', 'U', 'D', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :3900, The number of step:16\n",
" The sequence of action is: ['D', 'L', 'L', 'L', 'D', 'U', 'D', 'U', 'U', 'R', 'L', 'D', 'D', 'L', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :4000, The number of step:4\n",
" The sequence of action is: ['U', 'R', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :4100, The number of step:20\n",
" The sequence of action is: ['L', 'L', 'U', 'L', 'U', 'L', 'D', 'L', 'L', 'L', 'U', 'U', 'D', 'D', 'L', 'D', 'D', 'U', 'U', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :4200, The number of step:6\n",
" The sequence of action is: ['R', 'L', 'L', 'R', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :4300, The number of step:9\n",
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'L', 'D', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :4400, The number of step:9\n",
" The sequence of action is: ['D', 'L', 'D', 'D', 'L', 'R', 'L', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :4500, The number of step:23\n",
" The sequence of action is: ['L', 'R', 'L', 'D', 'L', 'L', 'R', 'U', 'L', 'R', 'U', 'L', 'U', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :4600, The number of step:42\n",
" The sequence of action is: ['D', 'L', 'U', 'L', 'U', 'L', 'L', 'U', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'R', 'U', 'L', 'R', 'L', 'L', 'L', 'U', 'R', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :4700, The number of step:20\n",
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'U', 'R', 'L', 'L', 'L', 'L', 'L', 'D', 'D', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :4800, The number of step:34\n",
" The sequence of action is: ['L', 'L', 'L', 'L', 'D', 'D', 'L', 'L', 'U', 'U', 'U', 'D', 'L', 'L', 'L', 'U', 'R', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'D', 'L', 'D', 'U', 'L', 'L', 'L', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :4900, The number of step:9\n",
" The sequence of action is: ['D', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :5000, The number of step:19\n",
" The sequence of action is: ['D', 'L', 'R', 'U', 'D', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'R', 'R', 'L', 'U', 'R', 'L', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :5100, The number of step:29\n",
" The sequence of action is: ['D', 'L', 'L', 'L', 'L', 'U', 'D', 'L', 'L', 'L', 'D', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'R', 'R', 'U', 'R', 'U', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [2, 1]\n",
"episode :5200, The number of step:12\n",
" The sequence of action is: ['L', 'L', 'D', 'D', 'D', 'D', 'D', 'L', 'L', 'U', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :5300, The number of step:45\n",
" The sequence of action is: ['L', 'L', 'L', 'R', 'L', 'R', 'D', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'D', 'L', 'L', 'D', 'U', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'U', 'L', 'D', 'D', 'L', 'L', 'D', 'L', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :5400, The number of step:39\n",
" The sequence of action is: ['L', 'D', 'L', 'U', 'L', 'L', 'L', 'R', 'U', 'U', 'L', 'R', 'L', 'L', 'L', 'R', 'L', 'L', 'U', 'D', 'L', 'L', 'L', 'L', 'D', 'D', 'L', 'U', 'L', 'L', 'D', 'D', 'L', 'L', 'L', 'U', 'L', 'U', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :5500, The number of step:7\n",
" The sequence of action is: ['U', 'L', 'R', 'U', 'U', 'D', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :5600, The number of step:19\n",
" The sequence of action is: ['U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :5700, The number of step:16\n",
" The sequence of action is: ['L', 'L', 'R', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'D', 'U', 'L', 'R', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :5800, The number of step:3\n",
" The sequence of action is: ['R', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :5900, The number of step:41\n",
" The sequence of action is: ['L', 'L', 'L', 'L', 'R', 'L', 'L', 'L', 'L', 'R', 'L', 'U', 'U', 'U', 'D', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'U', 'U', 'U', 'L', 'L', 'U', 'L', 'R', 'R', 'R', 'L', 'R', 'U', 'L', 'R', 'L', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :6000, The number of step:46\n",
" The sequence of action is: ['L', 'L', 'D', 'L', 'L', 'R', 'L', 'U', 'L', 'U', 'D', 'R', 'U', 'R', 'L', 'U', 'U', 'U', 'D', 'U', 'L', 'L', 'L', 'L', 'L', 'D', 'D', 'U', 'L', 'L', 'U', 'L', 'L', 'D', 'L', 'R', 'U', 'U', 'U', 'L', 'L', 'R', 'U', 'U', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :6100, The number of step:50\n",
" The sequence of action is: ['L', 'L', 'D', 'R', 'U', 'L', 'L', 'D', 'R', 'L', 'U', 'L', 'R', 'L', 'L', 'R', 'L', 'D', 'L', 'L', 'R', 'L', 'L', 'L', 'U', 'R', 'L', 'R', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'D', 'L', 'R', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :6200, The number of step:17\n",
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'D', 'L', 'U', 'U', 'R', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :6300, The number of step:5\n",
" The sequence of action is: ['R', 'U', 'U', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :6400, The number of step:16\n",
" The sequence of action is: ['L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'D', 'D', 'L', 'L', 'L', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :6500, The number of step:6\n",
" The sequence of action is: ['L', 'L', 'U', 'D', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :6600, The number of step:25\n",
" The sequence of action is: ['L', 'U', 'L', 'L', 'R', 'R', 'R', 'L', 'L', 'U', 'U', 'U', 'R', 'R', 'L', 'L', 'L', 'L', 'L', 'D', 'U', 'R', 'U', 'D', 'D']\n",
"The total reward is: -1\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"finished at [2, 1]\n",
"episode :6700, The number of step:7\n",
" The sequence of action is: ['L', 'L', 'L', 'D', 'L', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :6800, The number of step:37\n",
" The sequence of action is: ['L', 'L', 'U', 'R', 'D', 'U', 'R', 'R', 'L', 'R', 'R', 'U', 'D', 'D', 'L', 'D', 'U', 'D', 'R', 'R', 'L', 'L', 'L', 'D', 'U', 'R', 'R', 'R', 'L', 'L', 'L', 'D', 'U', 'D', 'D', 'U', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :6900, The number of step:17\n",
" The sequence of action is: ['L', 'L', 'L', 'U', 'L', 'L', 'L', 'R', 'L', 'U', 'L', 'L', 'D', 'D', 'L', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :7000, The number of step:24\n",
" The sequence of action is: ['D', 'D', 'L', 'D', 'D', 'L', 'L', 'L', 'L', 'U', 'L', 'R', 'D', 'D', 'R', 'U', 'R', 'R', 'L', 'L', 'D', 'U', 'L', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :7100, The number of step:96\n",
" The sequence of action is: ['L', 'R', 'U', 'U', 'U', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'R', 'U', 'U', 'U', 'U', 'U', 'L', 'L', 'L', 'L', 'R', 'D', 'U', 'U', 'U', 'L', 'L', 'L', 'R', 'U', 'U', 'L', 'D', 'D', 'L', 'D', 'L', 'D', 'D', 'L', 'U', 'D', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'L', 'U', 'U', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'R', 'U', 'L', 'L', 'L', 'U', 'L', 'L', 'D', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :7200, The number of step:13\n",
" The sequence of action is: ['U', 'L', 'L', 'L', 'L', 'D', 'U', 'L', 'R', 'U', 'U', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :7300, The number of step:6\n",
" The sequence of action is: ['L', 'R', 'U', 'U', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :7400, The number of step:11\n",
" The sequence of action is: ['L', 'L', 'L', 'R', 'D', 'U', 'L', 'L', 'D', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :7500, The number of step:39\n",
" The sequence of action is: ['R', 'U', 'L', 'R', 'L', 'L', 'L', 'R', 'D', 'L', 'L', 'L', 'R', 'U', 'L', 'R', 'L', 'L', 'L', 'U', 'D', 'R', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'R', 'L', 'U', 'R', 'R', 'R', 'D', 'L']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :7600, The number of step:37\n",
" The sequence of action is: ['R', 'R', 'L', 'R', 'L', 'U', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'R', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'L', 'D', 'L', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :7700, The number of step:19\n",
" The sequence of action is: ['L', 'L', 'R', 'U', 'L', 'L', 'L', 'D', 'U', 'L', 'R', 'L', 'L', 'D', 'L', 'L', 'L', 'R', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :7800, The number of step:13\n",
" The sequence of action is: ['L', 'U', 'U', 'R', 'L', 'L', 'R', 'L', 'L', 'R', 'U', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :7900, The number of step:34\n",
" The sequence of action is: ['D', 'L', 'D', 'L', 'L', 'D', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'L', 'U', 'L', 'D', 'L', 'L', 'L', 'R', 'D', 'R', 'L', 'U', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :8000, The number of step:6\n",
" The sequence of action is: ['L', 'R', 'D', 'U', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :8100, The number of step:28\n",
" The sequence of action is: ['L', 'U', 'U', 'D', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'D', 'U', 'D', 'L', 'L', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'L', 'D', 'L', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :8200, The number of step:7\n",
" The sequence of action is: ['L', 'L', 'D', 'D', 'L', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 2]\n",
"episode :8300, The number of step:57\n",
" The sequence of action is: ['L', 'U', 'L', 'L', 'L', 'L', 'L', 'U', 'D', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'R', 'U', 'U', 'U', 'U', 'D', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'R', 'R', 'U', 'D', 'R', 'D', 'R', 'L', 'D', 'U', 'L']\n",
"The total reward is: 1\n",
"\n",
"finished at [1, 2]\n",
"episode :8400, The number of step:25\n",
" The sequence of action is: ['L', 'D', 'U', 'D', 'D', 'L', 'U', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'D', 'D', 'U', 'L', 'L', 'R', 'U', 'U', 'R', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :8500, The number of step:48\n",
" The sequence of action is: ['L', 'R', 'R', 'R', 'R', 'U', 'R', 'U', 'R', 'R', 'D', 'D', 'D', 'U', 'U', 'R', 'D', 'L', 'D', 'D', 'D', 'R', 'U', 'D', 'U', 'D', 'U', 'L', 'D', 'R', 'U', 'R', 'R', 'D', 'U', 'R', 'D', 'U', 'R', 'L', 'R', 'R', 'L', 'D', 'L', 'U', 'L', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :8600, The number of step:25\n",
" The sequence of action is: ['U', 'L', 'D', 'D', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'U', 'U', 'U', 'R', 'R', 'L', 'R', 'L', 'R', 'L', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :8700, The number of step:30\n",
" The sequence of action is: ['L', 'R', 'L', 'L', 'L', 'L', 'L', 'D', 'L', 'R', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'D', 'U', 'L', 'U', 'U', 'L', 'L', 'D', 'L', 'L', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :8800, The number of step:9\n",
" The sequence of action is: ['L', 'L', 'L', 'U', 'D', 'L', 'L', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :8900, The number of step:67\n",
" The sequence of action is: ['U', 'D', 'L', 'U', 'R', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'R', 'L', 'U', 'U', 'L', 'L', 'L', 'U', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'D', 'R', 'L', 'L', 'L', 'U', 'R', 'L', 'D', 'L', 'L', 'R', 'L', 'L', 'L', 'L', 'D', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :9000, The number of step:5\n",
" The sequence of action is: ['R', 'D', 'U', 'D', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :9100, The number of step:68\n",
" The sequence of action is: ['L', 'L', 'L', 'R', 'U', 'U', 'U', 'U', 'U', 'L', 'L', 'R', 'U', 'U', 'U', 'U', 'U', 'L', 'L', 'R', 'D', 'U', 'U', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'U', 'L', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'U', 'L', 'L', 'U', 'D', 'L', 'R', 'U', 'U', 'L', 'D', 'U', 'D', 'R', 'U', 'D', 'U', 'U', 'U', 'U', 'R', 'U', 'R', 'L', 'R', 'U', 'L', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :9200, The number of step:13\n",
" The sequence of action is: ['R', 'U', 'U', 'U', 'L', 'U', 'U', 'D', 'L', 'L', 'D', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :9300, The number of step:7\n",
" The sequence of action is: ['L', 'L', 'L', 'L', 'D', 'R', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :9400, The number of step:28\n",
" The sequence of action is: ['U', 'L', 'U', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'L', 'L', 'L', 'D', 'L', 'L', 'R', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :9500, The number of step:19\n",
" The sequence of action is: ['U', 'L', 'D', 'L', 'L', 'D', 'L', 'L', 'L', 'L', 'D', 'L', 'U', 'L', 'D', 'L', 'L', 'R', 'U']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :9600, The number of step:10\n",
" The sequence of action is: ['R', 'D', 'L', 'L', 'L', 'L', 'D', 'L', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [1, 2]\n",
"episode :9700, The number of step:50\n",
" The sequence of action is: ['U', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'U', 'D', 'L', 'L', 'L', 'L', 'L', 'L', 'U', 'L', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'R', 'U', 'L', 'L', 'R', 'R', 'R', 'L', 'U', 'D']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :9800, The number of step:6\n",
" The sequence of action is: ['D', 'L', 'L', 'D', 'L', 'R']\n",
"The total reward is: -1\n",
"\n",
"finished at [2, 1]\n",
"episode :9900, The number of step:8\n",
" The sequence of action is: ['L', 'L', 'U', 'L', 'D', 'L', 'D', 'R']\n",
"The total reward is: -1\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The accuracy : 5.01 %\n"
]
}
],
"source": [
"if __name__ == \"__main__\":\n",
" env = Env()\n",
" agent = MC_agent()\n",
" total_episode = 10000\n",
" sr=0\n",
" \n",
" for episode in range(total_episode):\n",
" action_sequence=[]\n",
" total_reward = 0\n",
" state = env.reset()\n",
" action = agent.get_action(state)\n",
" done = False\n",
" walk = 0\n",
" \n",
" while True:\n",
" next_state, reward, done = env.step(state, action)\n",
" agent.memorizer(state, reward, done)\n",
" agent.save_actionseq(action_sequence, action)\n",
" walk += 1\n",
" \n",
" # next state and action \n",
" state = next_state\n",
" action = agent.get_action(state)\n",
" total_reward+=reward\n",
" \n",
" if done:\n",
" if episode % 100 == 0 :\n",
" print('finished at', state)\n",
" print('episode :{}, The number of step:{}\\n The sequence of action is:\\\n",
" {}\\nThe total reward is: {}\\n'.format(episode, walk, action_sequence, total_reward))\n",
" if state == env.goal:\n",
" sr+=1\n",
" agent.update()\n",
" agent.memory.clear()\n",
" break\n",
" \n",
"print('The accuracy :', sr/total_episode*100, '%')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.77451118, -0.81772992, -0.87861399, -0.75259688, -0.33052417],\n",
" [-0.80686915, -0.85519428, 0. , -0.72970452, -0.1901708 ],\n",
" [-0.78586143, 0. , 0. , 0.07326729, 0.07587984],\n",
" [-0.57704779, -0.7415952 , 0.12508784, 0.09977261, 0.15429315],\n",
" [-0.33355271, -0.1954709 , 0.0564361 , 0.07972603, 0.13846906]])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent.value_table"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment