Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save alessiot/58f23ee80bf80639a091b016436b8fbc to your computer and use it in GitHub Desktop.
Save alessiot/58f23ee80bf80639a091b016436b8fbc to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"toc": true
},
"source": [
"<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
"<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Coding-Battleship\" data-toc-modified-id=\"Coding-Battleship-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Coding Battleship</a></span></li><li><span><a href=\"#Callback-and-Plotting\" data-toc-modified-id=\"Callback-and-Plotting-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Callback and Plotting</a></span></li><li><span><a href=\"#Playing-with-One-Ship-on-a-5x5-board\" data-toc-modified-id=\"Playing-with-One-Ship-on-a-5x5-board-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Playing with One Ship on a 5x5 board</a></span></li><li><span><a href=\"#Playing-with-One-Ship-on-a-Bigger-Board\" data-toc-modified-id=\"Playing-with-One-Ship-on-a-Bigger-Board-4\"><span class=\"toc-item-num\">4&nbsp;&nbsp;</span>Playing with One Ship on a Bigger Board</a></span></li><li><span><a href=\"#Visualizing-How-the-Agent-Plays\" data-toc-modified-id=\"Visualizing-How-the-Agent-Plays-5\"><span class=\"toc-item-num\">5&nbsp;&nbsp;</span>Visualizing How the Agent Plays</a></span></li><li><span><a href=\"#Optimizing-The-Algorithm-Parameters-with-Hyperopt\" data-toc-modified-id=\"Optimizing-The-Algorithm-Parameters-with-Hyperopt-6\"><span class=\"toc-item-num\">6&nbsp;&nbsp;</span>Optimizing The Algorithm Parameters with Hyperopt</a></span></li><li><span><a href=\"#Links\" data-toc-modified-id=\"Links-7\"><span class=\"toc-item-num\">7&nbsp;&nbsp;</span>Links</a></span></li><li><span><a href=\"#Reward-scheme\" data-toc-modified-id=\"Reward-scheme-8\"><span class=\"toc-item-num\">8&nbsp;&nbsp;</span>Reward scheme</a></span></li><li><span><a href=\"#Skeleton-Battleship-Environmnt\" data-toc-modified-id=\"Skeleton-Battleship-Environmnt-9\"><span class=\"toc-item-num\">9&nbsp;&nbsp;</span>Skeleton Battleship Environmnt</a></span></li></ul></div>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Coding Battleship"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import gym\n",
"from gym import spaces\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def set_ship(ship, ships, board, ship_locs):\n",
"    \"\"\"Randomly place one ship on a square board and record its cells.\n",
"\n",
"    A start cell and an orientation are drawn at random. The ship extends\n",
"    forward (+1) from the start cell when more than one ship length of room\n",
"    is left in that direction, and backward (-1) otherwise. The draw is\n",
"    repeated until every cell of the ship lies on the board and is empty.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    ship : hashable\n",
"        Key of the ship to place (looked up in ``ships`` and ``ship_locs``).\n",
"    ships : dict\n",
"        Ship key -> ship length in cells.\n",
"    board : np.ndarray\n",
"        Square board, 0 = empty and 1 = ship cell. Modified in place.\n",
"    ship_locs : dict\n",
"        Ship key -> list of (row, col) cells. The cells of the new ship are\n",
"        appended in place; the list is created if the key is missing.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        The same ``board`` and ``ship_locs`` objects, updated.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If the ship is longer than the board side, so it can never fit.\n",
"\n",
"    Notes\n",
"    -----\n",
"    The function keeps drawing until a free placement is found, so the\n",
"    caller must make sure the board still has room for the ship.\n",
"    \"\"\"\n",
"    grid_size = board.shape[0]\n",
"    length = ships[ship]\n",
"    if length > grid_size:\n",
"        raise ValueError('ship {!r} of length {} does not fit on a board of size {}'.format(\n",
"            ship, length, grid_size))\n",
"\n",
"    while True:\n",
"        # draw order (row, column, orientation) is kept from the original code\n",
"        init_pos_i = np.random.randint(0, grid_size)\n",
"        init_pos_j = np.random.randint(0, grid_size)\n",
"        horizontal = np.random.choice(['h', 'v']) == 'h'\n",
"\n",
"        # coordinate along which the ship extends\n",
"        start = init_pos_j if horizontal else init_pos_i\n",
"        direction = 1 if grid_size - start - length > 0 else -1\n",
"        line = [start + direction*k for k in range(length)]\n",
"        if horizontal:\n",
"            pos = [(init_pos_i, j) for j in line]\n",
"        else:\n",
"            pos = [(i, init_pos_j) for i in line]\n",
"\n",
"        # Reject placements that leave the board: a negative index would\n",
"        # silently wrap around to the opposite edge of the numpy array.\n",
"        on_board = all(0 <= i < grid_size and 0 <= j < grid_size for (i, j) in pos)\n",
"        if on_board and all(board[i, j] == 0 for (i, j) in pos):\n",
"            break\n",
"\n",
"    # set ship: 1 marks a ship cell on the board\n",
"    cells = ship_locs.setdefault(ship, [])\n",
"    for (i, j) in pos:\n",
"        board[i, j] = 1\n",
"        cells.append((i, j))\n",
"\n",
"    return board, ship_locs\n",
"\n",
"def board_rendering(grid_size, board):\n",
"    \"\"\"Print a text view of a ship board.\n",
"\n",
"    Cells equal to 1 are drawn as 'S'; every other cell is drawn blank.\n",
"    Rows are separated by dashed lines and cells by ' | '.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    grid_size : int\n",
"        Number of rows and columns to print.\n",
"    board : np.ndarray\n",
"        2D array indexed as board[row, col].\n",
"    \"\"\"\n",
"    separator = '-'*(4*grid_size+2)\n",
"    for row in range(grid_size):\n",
"        print(separator)\n",
"        symbols = ['S' if board[row, col]==1 else ' ' for col in range(grid_size)]\n",
"        print(''.join(' | ' + symbol for symbol in symbols), end='')\n",
"        print(' |')\n",
"    print(separator)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"class BattleshipEnv(gym.Env):\n",
"    \"\"\"Custom single-player Battleship environment that follows the gym interface.\n",
"\n",
"    See https://github.com/openai/gym/blob/master/gym/core.py\n",
"\n",
"    The agent fires at an enemy board it cannot see. The observation is the\n",
"    scoring board encoded with ``self.cell``: 0 = not fired at yet, 1 = hit,\n",
"    -1 = miss. An action is the flat (row-major) index of the target cell.\n",
"\n",
"    Reward per step: -1 for any action, a further -2*grid_size when the\n",
"    chosen cell was already played (a random legal cell is played instead),\n",
"    +grid_size for a hit and +grid_size**2 for the hit that ends the game.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    enemy_board : np.ndarray or None\n",
"        (grid_size, grid_size) array with 0 = empty and 1 = ship cell. If\n",
"        None, the ships are placed at random with ``set_ship``, here and\n",
"        again on every ``reset``.\n",
"    ship_locs : dict or None\n",
"        Ship name -> list of (row, col) cells. Filled in place by the\n",
"        environment when the board is random.\n",
"    grid_size : int\n",
"        Side length of the square board.\n",
"    ships : dict\n",
"        Ship name -> ship length.\n",
"    \"\"\"\n",
"\n",
"    metadata = {'render.modes': ['human']}\n",
"\n",
"    def __init__(self, enemy_board, ship_locs, grid_size, ships):\n",
"\n",
"        super(BattleshipEnv, self).__init__()\n",
"\n",
"        # ships\n",
"        self.ships = ships\n",
"        # board size\n",
"        self.grid_size = grid_size\n",
"        # cell state encoding (empty, hit, miss)\n",
"        self.cell = {'E': 0, 'X': 1, 'O': -1}\n",
"\n",
"        # scoring board seen by the agent, and the cells still available for moves\n",
"        self.board = self._empty_board()\n",
"        self.legal_actions = self._all_cells()  # updated as actions are performed\n",
"\n",
"        # enemy_board must be encoded with 0: empty and 1: ship cell.\n",
"        # NOTE: despite its name, is_enemy_set is True when the enemy board is\n",
"        # generated randomly by the environment (and regenerated on reset).\n",
"        self.is_enemy_set = enemy_board is None\n",
"        self.enemy_board = enemy_board\n",
"        self.ship_locs = {} if ship_locs is None else ship_locs\n",
"        if self.is_enemy_set:\n",
"            self._place_ships_randomly()\n",
"\n",
"        # reward discount counter (not used by the current reward scheme)\n",
"        self.rdisc = 0\n",
"\n",
"        # Define action and observation space\n",
"        # They must be gym.spaces objects\n",
"        # In our case the action space is discrete: index of action\n",
"        self.action_space = spaces.Discrete(self.grid_size * self.grid_size)\n",
"        # The observation will be the state or configuration of the board.\n",
"        # dtype=int replaces np.int, which was only an alias of the builtin\n",
"        # int and has been removed from recent NumPy releases.\n",
"        self.observation_space = spaces.Box(low=-1, high=1, shape=(self.grid_size, self.grid_size),\n",
"                                            dtype=int)\n",
"\n",
"    def step(self, action):\n",
"        \"\"\"Fire at one cell and return the gym step tuple.\n",
"\n",
"        Parameters\n",
"        ----------\n",
"        action : int\n",
"            Flat (row-major) index of the target cell in the action space.\n",
"            If that cell was already played, the move is penalized and a\n",
"            random legal cell is played instead.\n",
"\n",
"        Returns\n",
"        -------\n",
"        tuple\n",
"            (observation, reward, done, info): a copy of the scoring board,\n",
"            the reward as a float, whether every ship cell has been hit,\n",
"            and an empty info dict.\n",
"\n",
"        Raises\n",
"        ------\n",
"        ValueError\n",
"            If no legal cell is left on the board (call ``reset`` first).\n",
"        \"\"\"\n",
"        # number of hits on the board before the action\n",
"        hit_cnts_pre = self.board_config(self.board)[1]\n",
"\n",
"        # action coordinates generated or predicted by the agent in the action_space\n",
"        i, j = np.unravel_index(action, (self.grid_size, self.grid_size))\n",
"\n",
"        # lose 1 point for any action\n",
"        reward = -1\n",
"        # assign a penalty for each illegal action used instead of a legal one\n",
"        if (i, j) not in self.legal_actions:\n",
"            if not self.legal_actions:\n",
"                raise ValueError('no legal actions left on the board; call reset() to start a new game')\n",
"            reward -= 2*self.grid_size\n",
"            action_idx = np.random.randint(0, len(self.legal_actions))\n",
"            i, j = self.legal_actions[action_idx]\n",
"\n",
"        # set new state after performing action (scoring board is updated)\n",
"        self.set_state((i, j))\n",
"        # update legal actions\n",
"        self.set_legal_actions((i, j))\n",
"\n",
"        # number of hits after the action - this includes the last action\n",
"        hit_cnts_post = self.board_config(self.board)[1]\n",
"\n",
"        # game completed?\n",
"        done = bool(hit_cnts_post == sum(self.ships.values()))\n",
"\n",
"        # reward for a hit: larger when it is the hit that ends the game\n",
"        if hit_cnts_post - hit_cnts_pre == 1:\n",
"            reward += (self.grid_size*self.grid_size if done else self.grid_size)\n",
"\n",
"        info = {}\n",
"\n",
"        # return a copy so that observations kept by the caller are not\n",
"        # modified by later steps\n",
"        return self.board.copy(), float(reward), done, info\n",
"\n",
"    def reset(self):\n",
"        \"\"\"Reset the environment to an initial state.\n",
"\n",
"        The scoring board and the legal actions are cleared. The enemy board\n",
"        is drawn again only if it was generated randomly in the first place.\n",
"\n",
"        Important: the observation must be a numpy array\n",
"        :return: (np.array) a copy of the empty scoring board\n",
"        \"\"\"\n",
"        self.board = self._empty_board()\n",
"        self.legal_actions = self._all_cells()\n",
"\n",
"        # generate a random board again if it was set randomly before\n",
"        if self.is_enemy_set:\n",
"            self.ship_locs = {}\n",
"            self._place_ships_randomly()\n",
"\n",
"        self.rdisc = 0\n",
"\n",
"        return self.board.copy()\n",
"\n",
"    # Render the environment to the screen\n",
"    # board (i,j)\n",
"    ## ------------>j\n",
"    ## | (0,0) | (0,1) | (0,2) | |\n",
"    ## | (1,0) | (1,1) | (1,2) | |\n",
"    ## v i\n",
"    def render(self, mode='human'):\n",
"        \"\"\"Print the scoring board: 'X' = hit, 'O' = miss, blank = not fired at.\"\"\"\n",
"        # value -> symbol, with the empty cell drawn as a blank\n",
"        symbols = {value: (key if key != 'E' else ' ') for key, value in self.cell.items()}\n",
"        separator = '-'*(4*self.grid_size+2)\n",
"        for i in range(self.grid_size):\n",
"            print(separator)\n",
"            for j in range(self.grid_size):\n",
"                print(' | ', end='')\n",
"                print(symbols[self.board[i, j]], end='')\n",
"            print(' |')\n",
"        print(separator)\n",
"\n",
"    ####### HELPER FUNCTIONS ###########\n",
"\n",
"    def _empty_board(self):\n",
"        \"\"\"Return a new scoring board with every cell in the empty state.\"\"\"\n",
"        return np.full((self.grid_size, self.grid_size), self.cell['E'], dtype=int)\n",
"\n",
"    def _all_cells(self):\n",
"        \"\"\"Return the list of all (row, col) cells in row-major order.\"\"\"\n",
"        return [(i, j) for i in range(self.grid_size) for j in range(self.grid_size)]\n",
"\n",
"    def _place_ships_randomly(self):\n",
"        \"\"\"Draw a new random enemy board, filling self.ship_locs in place.\"\"\"\n",
"        self.enemy_board = np.zeros((self.grid_size, self.grid_size), dtype=int)\n",
"        for ship in self.ships:\n",
"            self.ship_locs[ship] = []\n",
"            self.enemy_board, self.ship_locs = set_ship(ship, self.ships, self.enemy_board, self.ship_locs)\n",
"\n",
"    def board_config(self, state):\n",
"        \"\"\"Count the cells of ``state`` in each state.\n",
"\n",
"        :param state: (np.array) board encoded with self.cell\n",
"        :return: (tuple) number of empty, hit and miss cells\n",
"        \"\"\"\n",
"        empty_cnts = int(np.count_nonzero(state == self.cell['E']))\n",
"        hit_cnts = int(np.count_nonzero(state == self.cell['X']))\n",
"        miss_cnts = int(np.count_nonzero(state == self.cell['O']))\n",
"\n",
"        return empty_cnts, hit_cnts, miss_cnts\n",
"\n",
"    # set board configuration and state value after player action\n",
"    def set_state(self, action):\n",
"        \"\"\"Mark cell ``action`` = (row, col) as a hit or a miss on the scoring board.\"\"\"\n",
"        i, j = action\n",
"        if self.enemy_board[i, j] == 1:\n",
"            self.board[i, j] = self.cell['X']\n",
"        else:\n",
"            self.board[i, j] = self.cell['O']\n",
"\n",
"    # set legal actions (empty board locations)\n",
"    def set_legal_actions(self, action):\n",
"        \"\"\"Remove cell ``action`` = (row, col) from the legal actions, if present.\"\"\"\n",
"        if action in self.legal_actions:\n",
"            self.legal_actions.remove(action)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:\n",
"The TensorFlow contrib module will not be included in TensorFlow 2.0.\n",
"For more information, please see:\n",
" * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n",
" * https://github.com/tensorflow/addons\n",
" * https://github.com/tensorflow/io (for I/O related ops)\n",
"If you depend on functionality not listed there, please file an issue.\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/common/env_checker.py:202: UserWarning: Your observation has an unconventional shape (neither an image, nor a 1D vector). We recommend you to flatten the observation to have only a 1D vector\n",
" warnings.warn(\"Your observation has an unconventional shape (neither an image, nor a 1D vector). \"\n"
]
}
],
"source": [
"# Validate the environment with one ship on a 5x5 board.\n",
"# The board is either user-defined (below) or random (see the commented line).\n",
"\n",
"from stable_baselines.common.env_checker import check_env\n",
"\n",
"# ships: name -> length\n",
"ships = {'cruiser': 3}\n",
"\n",
"grid_size = 5\n",
"# pre-determined board: the cruiser occupies rows 0-2 of column 1\n",
"enemy_board = np.zeros((grid_size, grid_size), dtype='int')\n",
"enemy_board[0:3, 1] = 1\n",
"ship_locs = {'cruiser': [(0,1),(1,1),(2,1)]}\n",
"env = BattleshipEnv(enemy_board=enemy_board, ship_locs=ship_locs, grid_size=grid_size, ships=ships)\n",
"# for a random board use instead:\n",
"#env = BattleshipEnv(enemy_board=None, ship_locs={}, grid_size=grid_size, ships=ships)\n",
"# If the environment doesn't follow the interface, an error will be thrown\n",
"check_env(env, warn=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([[0, 1, 0, 0, 0],\n",
" [0, 1, 0, 0, 0],\n",
" [0, 1, 0, 0, 0],\n",
" [0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0]]), {'cruiser': [(0, 1), (1, 1), (2, 1)]})"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# display the hidden enemy board and the recorded ship cells of the environment\n",
"env.enemy_board, env.ship_locs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Episode 0\n",
"Action 1 4 2\n",
"obs= [[ 0 0 0 0 0]\n",
" [ 0 0 0 0 0]\n",
" [ 0 0 0 0 0]\n",
" [ 0 0 0 0 0]\n",
" [ 0 0 -1 0 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
"Action 2 1 0\n",
"obs= [[ 0 0 0 0 0]\n",
" [-1 0 0 0 0]\n",
" [ 0 0 0 0 0]\n",
" [ 0 0 0 0 0]\n",
" [ 0 0 -1 0 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | O | | | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
"Action 3 4 2\n",
"obs= [[ 0 0 0 0 0]\n",
" [-1 0 0 0 0]\n",
" [ 0 0 0 0 0]\n",
" [ 0 0 0 -1 0]\n",
" [ 0 0 -1 0 0]] reward= -11.0 done= False\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | O | | | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | | | | O | |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
"Action 4 3 3\n",
"obs= [[ 0 0 0 0 0]\n",
" [-1 0 0 0 0]\n",
" [ 0 0 -1 0 0]\n",
" [ 0 0 0 -1 0]\n",
" [ 0 0 -1 0 0]] reward= -11.0 done= False\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | O | | | | |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | | | | O | |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
"Action 5 1 4\n",
"obs= [[ 0 0 0 0 0]\n",
" [-1 0 0 0 -1]\n",
" [ 0 0 -1 0 0]\n",
" [ 0 0 0 -1 0]\n",
" [ 0 0 -1 0 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | O | | | | O |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | | | | O | |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
"Action 6 2 2\n",
"obs= [[ 0 0 0 0 0]\n",
" [-1 0 0 0 -1]\n",
" [ 0 0 -1 0 0]\n",
" [ 0 1 0 -1 0]\n",
" [ 0 0 -1 0 0]] reward= -6.0 done= False\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | O | | | | O |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | | X | | O | |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
"Action 7 1 0\n",
"obs= [[ 0 0 0 0 0]\n",
" [-1 0 0 0 -1]\n",
" [ 0 0 -1 0 0]\n",
" [ 0 1 0 -1 0]\n",
" [ 0 -1 -1 0 0]] reward= -11.0 done= False\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | O | | | | O |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | | X | | O | |\n",
"----------------------\n",
" | | O | O | | |\n",
"----------------------\n",
"Action 8 4 3\n",
"obs= [[ 0 0 0 0 0]\n",
" [-1 0 0 0 -1]\n",
" [ 0 0 -1 0 0]\n",
" [ 0 1 0 -1 0]\n",
" [ 0 -1 -1 -1 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | O | | | | O |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | | X | | O | |\n",
"----------------------\n",
" | | O | O | O | |\n",
"----------------------\n",
"Action 9 1 4\n",
"obs= [[ 0 0 -1 0 0]\n",
" [-1 0 0 0 -1]\n",
" [ 0 0 -1 0 0]\n",
" [ 0 1 0 -1 0]\n",
" [ 0 -1 -1 -1 0]] reward= -11.0 done= False\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | O | | | | O |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | | X | | O | |\n",
"----------------------\n",
" | | O | O | O | |\n",
"----------------------\n",
"Action 10 4 2\n",
"obs= [[ 0 0 -1 0 0]\n",
" [-1 0 0 -1 -1]\n",
" [ 0 0 -1 0 0]\n",
" [ 0 1 0 -1 0]\n",
" [ 0 -1 -1 -1 0]] reward= -11.0 done= False\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | | X | | O | |\n",
"----------------------\n",
" | | O | O | O | |\n",
"----------------------\n",
"Action 11 0 2\n",
"obs= [[ 0 0 -1 0 0]\n",
" [-1 0 0 -1 -1]\n",
" [ 0 0 -1 0 0]\n",
" [-1 1 0 -1 0]\n",
" [ 0 -1 -1 -1 0]] reward= -11.0 done= False\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | O | X | | O | |\n",
"----------------------\n",
" | | O | O | O | |\n",
"----------------------\n",
"Action 12 0 3\n",
"obs= [[ 0 0 -1 -1 0]\n",
" [-1 0 0 -1 -1]\n",
" [ 0 0 -1 0 0]\n",
" [-1 1 0 -1 0]\n",
" [ 0 -1 -1 -1 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | | | O | O | |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | O | X | | O | |\n",
"----------------------\n",
" | | O | O | O | |\n",
"----------------------\n",
"Action 13 1 4\n",
"obs= [[ 0 0 -1 -1 0]\n",
" [-1 0 0 -1 -1]\n",
" [ 0 0 -1 0 0]\n",
" [-1 1 -1 -1 0]\n",
" [ 0 -1 -1 -1 0]] reward= -11.0 done= False\n",
"----------------------\n",
" | | | O | O | |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | O | X | O | O | |\n",
"----------------------\n",
" | | O | O | O | |\n",
"----------------------\n",
"Action 14 2 0\n",
"obs= [[ 0 0 -1 -1 0]\n",
" [-1 0 0 -1 -1]\n",
" [-1 0 -1 0 0]\n",
" [-1 1 -1 -1 0]\n",
" [ 0 -1 -1 -1 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | | | O | O | |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | O | | O | | |\n",
"----------------------\n",
" | O | X | O | O | |\n",
"----------------------\n",
" | | O | O | O | |\n",
"----------------------\n",
"Action 15 2 2\n",
"obs= [[ 0 -1 -1 -1 0]\n",
" [-1 0 0 -1 -1]\n",
" [-1 0 -1 0 0]\n",
" [-1 1 -1 -1 0]\n",
" [ 0 -1 -1 -1 0]] reward= -11.0 done= False\n",
"----------------------\n",
" | | O | O | O | |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | O | | O | | |\n",
"----------------------\n",
" | O | X | O | O | |\n",
"----------------------\n",
" | | O | O | O | |\n",
"----------------------\n",
"Action 16 0 0\n",
"obs= [[-1 -1 -1 -1 0]\n",
" [-1 0 0 -1 -1]\n",
" [-1 0 -1 0 0]\n",
" [-1 1 -1 -1 0]\n",
" [ 0 -1 -1 -1 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | O | O | O | O | |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | O | | O | | |\n",
"----------------------\n",
" | O | X | O | O | |\n",
"----------------------\n",
" | | O | O | O | |\n",
"----------------------\n",
"Action 17 0 3\n",
"obs= [[-1 -1 -1 -1 -1]\n",
" [-1 0 0 -1 -1]\n",
" [-1 0 -1 0 0]\n",
" [-1 1 -1 -1 0]\n",
" [ 0 -1 -1 -1 0]] reward= -11.0 done= False\n",
"----------------------\n",
" | O | O | O | O | O |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | O | | O | | |\n",
"----------------------\n",
" | O | X | O | O | |\n",
"----------------------\n",
" | | O | O | O | |\n",
"----------------------\n",
"Action 18 2 4\n",
"obs= [[-1 -1 -1 -1 -1]\n",
" [-1 0 0 -1 -1]\n",
" [-1 0 -1 0 -1]\n",
" [-1 1 -1 -1 0]\n",
" [ 0 -1 -1 -1 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | O | O | O | O | O |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | O | | O | | O |\n",
"----------------------\n",
" | O | X | O | O | |\n",
"----------------------\n",
" | | O | O | O | |\n",
"----------------------\n",
"Action 19 4 1\n",
"obs= [[-1 -1 -1 -1 -1]\n",
" [-1 0 0 -1 -1]\n",
" [-1 0 -1 0 -1]\n",
" [-1 1 -1 -1 0]\n",
" [ 0 -1 -1 -1 -1]] reward= -11.0 done= False\n",
"----------------------\n",
" | O | O | O | O | O |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | O | | O | | O |\n",
"----------------------\n",
" | O | X | O | O | |\n",
"----------------------\n",
" | | O | O | O | O |\n",
"----------------------\n",
"Action 20 4 4\n",
"obs= [[-1 -1 -1 -1 -1]\n",
" [-1 0 0 -1 -1]\n",
" [-1 0 -1 -1 -1]\n",
" [-1 1 -1 -1 0]\n",
" [ 0 -1 -1 -1 -1]] reward= -11.0 done= False\n",
"----------------------\n",
" | O | O | O | O | O |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | O | | O | O | O |\n",
"----------------------\n",
" | O | X | O | O | |\n",
"----------------------\n",
" | | O | O | O | O |\n",
"----------------------\n",
"Action 21 1 2\n",
"obs= [[-1 -1 -1 -1 -1]\n",
" [-1 0 -1 -1 -1]\n",
" [-1 0 -1 -1 -1]\n",
" [-1 1 -1 -1 0]\n",
" [ 0 -1 -1 -1 -1]] reward= -1.0 done= False\n",
"----------------------\n",
" | O | O | O | O | O |\n",
"----------------------\n",
" | O | | O | O | O |\n",
"----------------------\n",
" | O | | O | O | O |\n",
"----------------------\n",
" | O | X | O | O | |\n",
"----------------------\n",
" | | O | O | O | O |\n",
"----------------------\n",
"Action 22 1 2\n",
"obs= [[-1 -1 -1 -1 -1]\n",
" [-1 1 -1 -1 -1]\n",
" [-1 0 -1 -1 -1]\n",
" [-1 1 -1 -1 0]\n",
" [ 0 -1 -1 -1 -1]] reward= -6.0 done= False\n",
"----------------------\n",
" | O | O | O | O | O |\n",
"----------------------\n",
" | O | X | O | O | O |\n",
"----------------------\n",
" | O | | O | O | O |\n",
"----------------------\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" | O | X | O | O | |\n",
"----------------------\n",
" | | O | O | O | O |\n",
"----------------------\n",
"Action 23 2 3\n",
"obs= [[-1 -1 -1 -1 -1]\n",
" [-1 1 -1 -1 -1]\n",
" [-1 1 -1 -1 -1]\n",
" [-1 1 -1 -1 0]\n",
" [ 0 -1 -1 -1 -1]] reward= 14.0 done= True\n",
"----------------------\n",
" | O | O | O | O | O |\n",
"----------------------\n",
" | O | X | O | O | O |\n",
"----------------------\n",
" | O | X | O | O | O |\n",
"----------------------\n",
" | O | X | O | O | |\n",
"----------------------\n",
" | | O | O | O | O |\n",
"----------------------\n",
"Goal reached! reward= 14.0\n",
"Episode 1\n",
"Action 1 1 0\n",
"obs= [[ 0 0 0 0 0]\n",
" [-1 0 0 0 0]\n",
" [ 0 0 0 0 0]\n",
" [ 0 0 0 0 0]\n",
" [ 0 0 0 0 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | O | | | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
"Action 2 1 0\n",
"obs= [[ 0 0 0 0 0]\n",
" [-1 0 0 0 -1]\n",
" [ 0 0 0 0 0]\n",
" [ 0 0 0 0 0]\n",
" [ 0 0 0 0 0]] reward= -11.0 done= False\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | O | | | | O |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
"Action 3 0 1\n",
"obs= [[ 0 -1 0 0 0]\n",
" [-1 0 0 0 -1]\n",
" [ 0 0 0 0 0]\n",
" [ 0 0 0 0 0]\n",
" [ 0 0 0 0 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | | O | | | |\n",
"----------------------\n",
" | O | | | | O |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
"Action 4 2 4\n",
"obs= [[ 0 -1 0 0 0]\n",
" [-1 0 0 0 -1]\n",
" [ 0 0 0 0 -1]\n",
" [ 0 0 0 0 0]\n",
" [ 0 0 0 0 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | | O | | | |\n",
"----------------------\n",
" | O | | | | O |\n",
"----------------------\n",
" | | | | | O |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
"Action 5 3 2\n",
"obs= [[ 0 -1 0 0 0]\n",
" [-1 0 0 0 -1]\n",
" [ 0 0 0 0 -1]\n",
" [ 0 0 -1 0 0]\n",
" [ 0 0 0 0 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | | O | | | |\n",
"----------------------\n",
" | O | | | | O |\n",
"----------------------\n",
" | | | | | O |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
"Action 6 2 3\n",
"obs= [[ 0 -1 0 0 0]\n",
" [-1 0 0 0 -1]\n",
" [ 0 0 0 -1 -1]\n",
" [ 0 0 -1 0 0]\n",
" [ 0 0 0 0 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | | O | | | |\n",
"----------------------\n",
" | O | | | | O |\n",
"----------------------\n",
" | | | | O | O |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
"Action 7 1 3\n",
"obs= [[ 0 -1 0 0 0]\n",
" [-1 0 0 -1 -1]\n",
" [ 0 0 0 -1 -1]\n",
" [ 0 0 -1 0 0]\n",
" [ 0 0 0 0 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | | O | | | |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | | | | O | O |\n",
"----------------------\n",
" | | | O | | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
"Action 8 3 3\n",
"obs= [[ 0 -1 0 0 0]\n",
" [-1 0 0 -1 -1]\n",
" [ 0 0 0 -1 -1]\n",
" [ 0 0 -1 -1 0]\n",
" [ 0 0 0 0 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | | O | | | |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | | | | O | O |\n",
"----------------------\n",
" | | | O | O | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
"Action 9 0 1\n",
"obs= [[ 0 -1 0 0 0]\n",
" [-1 0 0 -1 -1]\n",
" [-1 0 0 -1 -1]\n",
" [ 0 0 -1 -1 0]\n",
" [ 0 0 0 0 0]] reward= -11.0 done= False\n",
"----------------------\n",
" | | O | | | |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | | | O | O | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
"Action 10 1 1\n",
"obs= [[ 0 -1 0 0 0]\n",
" [-1 1 0 -1 -1]\n",
" [-1 0 0 -1 -1]\n",
" [ 0 0 -1 -1 0]\n",
" [ 0 0 0 0 0]] reward= 4.0 done= False\n",
"----------------------\n",
" | | O | | | |\n",
"----------------------\n",
" | O | X | | O | O |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | | | O | O | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
"Action 11 1 1\n",
"obs= [[ 0 -1 0 0 -1]\n",
" [-1 1 0 -1 -1]\n",
" [-1 0 0 -1 -1]\n",
" [ 0 0 -1 -1 0]\n",
" [ 0 0 0 0 0]] reward= -11.0 done= False\n",
"----------------------\n",
" | | O | | | O |\n",
"----------------------\n",
" | O | X | | O | O |\n",
"----------------------\n",
" | O | | | O | O |\n",
"----------------------\n",
" | | | O | O | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
"Action 12 0 1\n",
"obs= [[ 0 -1 0 0 -1]\n",
" [-1 1 0 -1 -1]\n",
" [-1 0 -1 -1 -1]\n",
" [ 0 0 -1 -1 0]\n",
" [ 0 0 0 0 0]] reward= -11.0 done= False\n",
"----------------------\n",
" | | O | | | O |\n",
"----------------------\n",
" | O | X | | O | O |\n",
"----------------------\n",
" | O | | O | O | O |\n",
"----------------------\n",
" | | | O | O | |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
"Action 13 3 4\n",
"obs= [[ 0 -1 0 0 -1]\n",
" [-1 1 0 -1 -1]\n",
" [-1 0 -1 -1 -1]\n",
" [ 0 0 -1 -1 -1]\n",
" [ 0 0 0 0 0]] reward= -1.0 done= False\n",
"----------------------\n",
" | | O | | | O |\n",
"----------------------\n",
" | O | X | | O | O |\n",
"----------------------\n",
" | O | | O | O | O |\n",
"----------------------\n",
" | | | O | O | O |\n",
"----------------------\n",
" | | | | | |\n",
"----------------------\n",
"Action 14 4 4\n",
"obs= [[ 0 -1 0 0 -1]\n",
" [-1 1 0 -1 -1]\n",
" [-1 0 -1 -1 -1]\n",
" [ 0 0 -1 -1 -1]\n",
" [ 0 0 0 0 -1]] reward= -1.0 done= False\n",
"----------------------\n",
" | | O | | | O |\n",
"----------------------\n",
" | O | X | | O | O |\n",
"----------------------\n",
" | O | | O | O | O |\n",
"----------------------\n",
" | | | O | O | O |\n",
"----------------------\n",
" | | | | | O |\n",
"----------------------\n",
"Action 15 3 1\n",
"obs= [[ 0 -1 0 0 -1]\n",
" [-1 1 0 -1 -1]\n",
" [-1 0 -1 -1 -1]\n",
" [ 0 1 -1 -1 -1]\n",
" [ 0 0 0 0 -1]] reward= 4.0 done= False\n",
"----------------------\n",
" | | O | | | O |\n",
"----------------------\n",
" | O | X | | O | O |\n",
"----------------------\n",
" | O | | O | O | O |\n",
"----------------------\n",
" | | X | O | O | O |\n",
"----------------------\n",
" | | | | | O |\n",
"----------------------\n",
"Action 16 1 3\n",
"obs= [[ 0 -1 -1 0 -1]\n",
" [-1 1 0 -1 -1]\n",
" [-1 0 -1 -1 -1]\n",
" [ 0 1 -1 -1 -1]\n",
" [ 0 0 0 0 -1]] reward= -11.0 done= False\n",
"----------------------\n",
" | | O | O | | O |\n",
"----------------------\n",
" | O | X | | O | O |\n",
"----------------------\n",
" | O | | O | O | O |\n",
"----------------------\n",
" | | X | O | O | O |\n",
"----------------------\n",
" | | | | | O |\n",
"----------------------\n",
"Action 17 1 3\n",
"obs= [[ 0 -1 -1 0 -1]\n",
" [-1 1 0 -1 -1]\n",
" [-1 0 -1 -1 -1]\n",
" [ 0 1 -1 -1 -1]\n",
" [ 0 0 0 -1 -1]] reward= -11.0 done= False\n",
"----------------------\n",
" | | O | O | | O |\n",
"----------------------\n",
" | O | X | | O | O |\n",
"----------------------\n",
" | O | | O | O | O |\n",
"----------------------\n",
" | | X | O | O | O |\n",
"----------------------\n",
" | | | | O | O |\n",
"----------------------\n",
"Action 18 3 1\n",
"obs= [[ 0 -1 -1 0 -1]\n",
" [-1 1 -1 -1 -1]\n",
" [-1 0 -1 -1 -1]\n",
" [ 0 1 -1 -1 -1]\n",
" [ 0 0 0 -1 -1]] reward= -11.0 done= False\n",
"----------------------\n",
" | | O | O | | O |\n",
"----------------------\n",
" | O | X | O | O | O |\n",
"----------------------\n",
" | O | | O | O | O |\n",
"----------------------\n",
" | | X | O | O | O |\n",
"----------------------\n",
" | | | | O | O |\n",
"----------------------\n",
"Action 19 1 2\n",
"obs= [[ 0 -1 -1 0 -1]\n",
" [-1 1 -1 -1 -1]\n",
" [-1 1 -1 -1 -1]\n",
" [ 0 1 -1 -1 -1]\n",
" [ 0 0 0 -1 -1]] reward= 14.0 done= True\n",
"----------------------\n",
" | | O | O | | O |\n",
"----------------------\n",
" | O | X | O | O | O |\n",
"----------------------\n",
" | O | X | O | O | O |\n",
"----------------------\n",
" | | X | O | O | O |\n",
"----------------------\n",
" | | | | O | O |\n",
"----------------------\n",
"Goal reached! reward= 14.0\n"
]
}
],
"source": [
"# Test environment\n",
"# ships\n",
"ships = {}\n",
"ships['cruiser'] = 3\n",
"\n",
"grid_size=5\n",
"env = BattleshipEnv(enemy_board=None, ship_locs={}, grid_size=grid_size, ships=ships)\n",
"\n",
"for ep in range(2):\n",
" print('Episode', ep)\n",
" obs = env.reset()\n",
" #env.render()\n",
" #print(env.enemy_board)\n",
" done = False\n",
" t = 0\n",
" while not done:\n",
" action = env.action_space.sample()\n",
" i, j = np.unravel_index(action, (grid_size,grid_size)) \n",
" print(\"Action {}\".format(t + 1), i, j)\n",
" obs, reward, done, _ = env.step(action)\n",
" print('obs=', obs, 'reward=', reward, 'done=', done)\n",
" env.render()\n",
" t += 1\n",
" if done:\n",
" print(\"Goal reached!\", \"reward=\", reward)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Callback and Plotting"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from stable_baselines import DQN, PPO2, A2C, ACKTR, TRPO\n",
"from stable_baselines.bench import Monitor\n",
"from stable_baselines.common.vec_env import DummyVecEnv\n",
"import os\n",
"\n",
"from stable_baselines.results_plotter import load_results, ts2xy\n",
"\n",
"from tensorflow.keras.backend import clear_session #not sure if we need this but it does not hurt\n",
"\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"## This callback function is legacy and needs to be replaced with object oriented functions\n",
"## to work with all policies. See next callback function\n",
"\n",
"def callback(_locals, _globals):\n",
" \"\"\"\n",
" Callback called at each step (for DQN an others) or after n steps (see ACER or PPO2)\n",
" :param _locals: (dict)\n",
" :param _globals: (dict)\n",
" \"\"\"\n",
" global n_steps, best_mean_reward\n",
" # Print stats every step_interval calls\n",
" if (n_steps + 1) % step_interval == 0:\n",
" # Evaluate policy training performance\n",
" x, y = ts2xy(load_results(log_dir), 'timesteps')\n",
" if len(x) > 0:\n",
" # NOTE: when done is True, timesteps are counted and reported to the log_dir\n",
" mean_reward = np.mean(y[-episode_interval:]) # mean reward over previous episode_interval episodes\n",
" mean_moves = np.mean(np.diff(x[-episode_interval:])) # mean moves over previous episode_interval episodes\n",
" print(x[-1], 'timesteps') # closest to step_interval step number\n",
" print(\"Best mean reward: {:.2f} - Last mean reward per episode: {:.2f} - Last mean moves per episode: {:.2f}\".format(best_mean_reward, \n",
" mean_reward, mean_moves))\n",
"\n",
" # New best model, you could save the agent here\n",
" if mean_reward > best_mean_reward:\n",
" best_mean_reward = mean_reward\n",
" # Example for saving best model\n",
" print(\"Saving new best model\")\n",
" _locals['self'].save(log_dir + 'best_model.pkl')\n",
" n_steps += 1\n",
" # Returning False will stop training early\n",
" return True"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from stable_baselines.common.callbacks import BaseCallback\n",
"\n",
"class SaveOnBestTrainingRewardCallback(BaseCallback):\n",
" \"\"\"\n",
" Callback for saving a model (the check is done every ``check_freq`` steps)\n",
" based on the training reward (in practice, we recommend using ``EvalCallback``).\n",
"\n",
" :param check_freq: (int)\n",
" :param log_dir: (str) Path to the folder where the model will be saved.\n",
" It must contains the file created by the ``Monitor`` wrapper.\n",
" :param verbose: (int)\n",
" \"\"\"\n",
" def __init__(self, check_freq: int, episode_interval: int, log_dir: str, verbose=1):\n",
" super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)\n",
" self.check_freq = check_freq\n",
" self.episode_interval = episode_interval\n",
" self.log_dir = log_dir\n",
" self.save_path = os.path.join(log_dir, 'best_model.pkl')\n",
" self.best_mean_reward = -np.inf\n",
"\n",
" def _init_callback(self) -> None:\n",
" # Create folder if needed\n",
" if self.save_path is not None:\n",
" os.makedirs(self.save_path, exist_ok=True)\n",
"\n",
" def _on_step(self) -> bool:\n",
" if self.n_calls % self.check_freq == 0:\n",
" # Evaluate policy training performance\n",
" x, y = ts2xy(load_results(self.log_dir), 'timesteps')\n",
" if len(x) > 0:\n",
" # NOTE: when done is True, timesteps are counted and reported to the log_dir\n",
" mean_reward = np.mean(y[-self.episode_interval:]) # mean reward over previous episode_interval episodes\n",
" mean_moves = np.mean(np.diff(x[-self.episode_interval:])) # mean moves over previous 100 episodes\n",
" if self.verbose > 0:\n",
" print(x[-1], 'timesteps') # closest to step_interval step number\n",
" print(\"Best mean reward: {:.2f} - Last mean reward per episode: {:.2f} - Last mean moves per episode: {:.2f}\".format(self.best_mean_reward, \n",
" mean_reward, mean_moves))\n",
"\n",
" # New best model, you could save the agent here\n",
" if mean_reward > self.best_mean_reward:\n",
" self.best_mean_reward = mean_reward\n",
" # Example for saving best model\n",
" if self.verbose > 0:\n",
" print(\"Saving new best model\")\n",
" self.model.save(self.save_path)\n",
"\n",
" return True"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def moving_average(values, window):\n",
" \"\"\"\n",
" Smooth values by doing a moving average\n",
" :param values: (numpy array)\n",
" :param window: (int)\n",
" :return: (numpy array)\n",
" \"\"\"\n",
" weights = np.repeat(1.0, window) / window\n",
" return np.convolve(values, weights, 'valid')\n",
"\n",
"\n",
"def plot_results(log_folder, window = 100, title='Learning Curve'):\n",
" \"\"\"\n",
" plot the results\n",
"\n",
" :param log_folder: (str) the save location of the results to plot\n",
" :param title: (str) the title of the task to plot\n",
" \"\"\"\n",
" \n",
" x, y = ts2xy(load_results(log_folder), 'timesteps')\n",
" y = moving_average(y, window=window)\n",
" y_moves = moving_average(np.diff(x), window = window) \n",
" # Truncate x\n",
" x = x[len(x) - len(y):]\n",
" x_moves = x[len(x) - len(y_moves):]\n",
"\n",
" title = 'Smoothed Learning Curve of Rewards (every ' + str(window) +' steps)'\n",
" fig = plt.figure(title)\n",
" plt.plot(x, y)\n",
" plt.xlabel('Number of Timesteps')\n",
" plt.ylabel('Rewards')\n",
" plt.title(title)\n",
" plt.show()\n",
"\n",
" title = 'Smoothed Learning Curve of Moves (every ' + str(window) +' steps)'\n",
" fig = plt.figure(title)\n",
" plt.plot(x_moves, y_moves)\n",
" plt.xlabel('Number of Timesteps')\n",
" plt.ylabel('Moves')\n",
" plt.title(title)\n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Playing with One Ship on a 5x5 board"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/common/tf_util.py:191: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.\n",
"\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/common/tf_util.py:200: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.\n",
"\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/common/policies.py:116: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.\n",
"\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/common/input.py:25: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.\n",
"\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/common/policies.py:561: flatten (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Use keras.layers.flatten instead.\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/layers/core.py:332: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Please use `layer.__call__` method instead.\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/common/tf_layers.py:123: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.\n",
"\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/common/distributions.py:326: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.\n",
"\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/common/distributions.py:327: The name tf.log is deprecated. Please use tf.math.log instead.\n",
"\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/a2c/a2c.py:158: The name tf.summary.scalar is deprecated. Please use tf.compat.v1.summary.scalar instead.\n",
"\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/common/tf_util.py:449: The name tf.get_collection is deprecated. Please use tf.compat.v1.get_collection instead.\n",
"\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/common/tf_util.py:449: The name tf.GraphKeys is deprecated. Please use tf.compat.v1.GraphKeys instead.\n",
"\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/clip_ops.py:301: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Use tf.where in 2.0, which has the same broadcast rule as np.where\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/a2c/a2c.py:182: The name tf.train.RMSPropOptimizer is deprecated. Please use tf.compat.v1.train.RMSPropOptimizer instead.\n",
"\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/training/rmsprop.py:119: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Call initializer instance with the dtype argument instead of passing it to the constructor\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/a2c/a2c.py:192: The name tf.global_variables_initializer is deprecated. Please use tf.compat.v1.global_variables_initializer instead.\n",
"\n",
"WARNING:tensorflow:From /Users/atambu310/anaconda3/lib/python3.7/site-packages/stable_baselines/a2c/a2c.py:194: The name tf.summary.merge_all is deprecated. Please use tf.compat.v1.summary.merge_all instead.\n",
"\n",
"9989 timesteps\n",
"Best mean reward: -inf - Last mean reward per episode: -58.99 - Last mean moves per episode: 19.24\n",
"Saving new best model\n",
"19988 timesteps\n",
"Best mean reward: -58.99 - Last mean reward per episode: -56.79 - Last mean moves per episode: 19.14\n",
"Saving new best model\n",
"29999 timesteps\n",
"Best mean reward: -56.79 - Last mean reward per episode: -54.75 - Last mean moves per episode: 18.89\n",
"Saving new best model\n",
"39992 timesteps\n",
"Best mean reward: -54.75 - Last mean reward per episode: -52.27 - Last mean moves per episode: 18.52\n",
"Saving new best model\n",
"49998 timesteps\n",
"Best mean reward: -52.27 - Last mean reward per episode: -49.64 - Last mean moves per episode: 18.19\n",
"Saving new best model\n",
"59996 timesteps\n",
"Best mean reward: -49.64 - Last mean reward per episode: -47.21 - Last mean moves per episode: 17.87\n",
"Saving new best model\n",
"69984 timesteps\n",
"Best mean reward: -47.21 - Last mean reward per episode: -45.26 - Last mean moves per episode: 17.60\n",
"Saving new best model\n",
"79998 timesteps\n",
"Best mean reward: -45.26 - Last mean reward per episode: -43.89 - Last mean moves per episode: 17.36\n",
"Saving new best model\n",
"89990 timesteps\n",
"Best mean reward: -43.89 - Last mean reward per episode: -42.83 - Last mean moves per episode: 17.20\n",
"Saving new best model\n",
"99992 timesteps\n",
"Best mean reward: -42.83 - Last mean reward per episode: -41.87 - Last mean moves per episode: 17.01\n",
"Saving new best model\n",
"109998 timesteps\n",
"Best mean reward: -41.87 - Last mean reward per episode: -41.10 - Last mean moves per episode: 16.86\n",
"Saving new best model\n",
"119985 timesteps\n",
"Best mean reward: -41.10 - Last mean reward per episode: -40.31 - Last mean moves per episode: 16.70\n",
"Saving new best model\n",
"129994 timesteps\n",
"Best mean reward: -40.31 - Last mean reward per episode: -39.70 - Last mean moves per episode: 16.57\n",
"Saving new best model\n",
"139990 timesteps\n",
"Best mean reward: -39.70 - Last mean reward per episode: -39.06 - Last mean moves per episode: 16.44\n",
"Saving new best model\n",
"149998 timesteps\n",
"Best mean reward: -39.06 - Last mean reward per episode: -38.37 - Last mean moves per episode: 16.31\n",
"Saving new best model\n",
"159993 timesteps\n",
"Best mean reward: -38.37 - Last mean reward per episode: -37.59 - Last mean moves per episode: 16.17\n",
"Saving new best model\n",
"170000 timesteps\n",
"Best mean reward: -37.59 - Last mean reward per episode: -35.67 - Last mean moves per episode: 15.86\n",
"Saving new best model\n",
"179999 timesteps\n",
"Best mean reward: -35.67 - Last mean reward per episode: -34.03 - Last mean moves per episode: 15.54\n",
"Saving new best model\n",
"189987 timesteps\n",
"Best mean reward: -34.03 - Last mean reward per episode: -32.54 - Last mean moves per episode: 15.25\n",
"Saving new best model\n",
"199988 timesteps\n",
"Best mean reward: -32.54 - Last mean reward per episode: -31.71 - Last mean moves per episode: 15.04\n",
"Saving new best model\n",
"209997 timesteps\n",
"Best mean reward: -31.71 - Last mean reward per episode: -30.81 - Last mean moves per episode: 14.83\n",
"Saving new best model\n",
"219998 timesteps\n",
"Best mean reward: -30.81 - Last mean reward per episode: -30.28 - Last mean moves per episode: 14.69\n",
"Saving new best model\n",
"229995 timesteps\n",
"Best mean reward: -30.28 - Last mean reward per episode: -29.47 - Last mean moves per episode: 14.52\n",
"Saving new best model\n",
"239992 timesteps\n",
"Best mean reward: -29.47 - Last mean reward per episode: -28.47 - Last mean moves per episode: 14.31\n",
"Saving new best model\n",
"249991 timesteps\n",
"Best mean reward: -28.47 - Last mean reward per episode: -27.78 - Last mean moves per episode: 14.15\n",
"Saving new best model\n",
"259995 timesteps\n",
"Best mean reward: -27.78 - Last mean reward per episode: -26.98 - Last mean moves per episode: 13.96\n",
"Saving new best model\n",
"269994 timesteps\n",
"Best mean reward: -26.98 - Last mean reward per episode: -26.03 - Last mean moves per episode: 13.77\n",
"Saving new best model\n",
"280000 timesteps\n",
"Best mean reward: -26.03 - Last mean reward per episode: -24.98 - Last mean moves per episode: 13.57\n",
"Saving new best model\n",
"289989 timesteps\n",
"Best mean reward: -24.98 - Last mean reward per episode: -24.37 - Last mean moves per episode: 13.41\n",
"Saving new best model\n",
"299991 timesteps\n",
"Best mean reward: -24.37 - Last mean reward per episode: -23.52 - Last mean moves per episode: 13.25\n",
"Saving new best model\n",
"309991 timesteps\n",
"Best mean reward: -23.52 - Last mean reward per episode: -22.70 - Last mean moves per episode: 13.11\n",
"Saving new best model\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"319999 timesteps\n",
"Best mean reward: -22.70 - Last mean reward per episode: -21.51 - Last mean moves per episode: 12.94\n",
"Saving new best model\n",
"330000 timesteps\n",
"Best mean reward: -21.51 - Last mean reward per episode: -20.37 - Last mean moves per episode: 12.75\n",
"Saving new best model\n",
"340000 timesteps\n",
"Best mean reward: -20.37 - Last mean reward per episode: -19.19 - Last mean moves per episode: 12.57\n",
"Saving new best model\n",
"349989 timesteps\n",
"Best mean reward: -19.19 - Last mean reward per episode: -18.13 - Last mean moves per episode: 12.39\n",
"Saving new best model\n",
"359988 timesteps\n",
"Best mean reward: -18.13 - Last mean reward per episode: -17.23 - Last mean moves per episode: 12.25\n",
"Saving new best model\n",
"369989 timesteps\n",
"Best mean reward: -17.23 - Last mean reward per episode: -16.05 - Last mean moves per episode: 12.11\n",
"Saving new best model\n",
"379995 timesteps\n",
"Best mean reward: -16.05 - Last mean reward per episode: -14.75 - Last mean moves per episode: 11.96\n",
"Saving new best model\n",
"389993 timesteps\n",
"Best mean reward: -14.75 - Last mean reward per episode: -13.17 - Last mean moves per episode: 11.78\n",
"Saving new best model\n",
"399993 timesteps\n",
"Best mean reward: -13.17 - Last mean reward per episode: -11.83 - Last mean moves per episode: 11.62\n",
"Saving new best model\n",
"409990 timesteps\n",
"Best mean reward: -11.83 - Last mean reward per episode: -9.49 - Last mean moves per episode: 11.35\n",
"Saving new best model\n",
"419998 timesteps\n",
"Best mean reward: -9.49 - Last mean reward per episode: -7.48 - Last mean moves per episode: 11.08\n",
"Saving new best model\n",
"429988 timesteps\n",
"Best mean reward: -7.48 - Last mean reward per episode: -5.50 - Last mean moves per episode: 10.85\n",
"Saving new best model\n",
"440000 timesteps\n",
"Best mean reward: -5.50 - Last mean reward per episode: -3.12 - Last mean moves per episode: 10.54\n",
"Saving new best model\n",
"449992 timesteps\n",
"Best mean reward: -3.12 - Last mean reward per episode: -0.53 - Last mean moves per episode: 10.21\n",
"Saving new best model\n",
"459997 timesteps\n",
"Best mean reward: -0.53 - Last mean reward per episode: 2.45 - Last mean moves per episode: 9.84\n",
"Saving new best model\n",
"469998 timesteps\n",
"Best mean reward: 2.45 - Last mean reward per episode: 5.42 - Last mean moves per episode: 9.46\n",
"Saving new best model\n",
"479996 timesteps\n",
"Best mean reward: 5.42 - Last mean reward per episode: 7.92 - Last mean moves per episode: 9.12\n",
"Saving new best model\n",
"489995 timesteps\n",
"Best mean reward: 7.92 - Last mean reward per episode: 10.11 - Last mean moves per episode: 8.84\n",
"Saving new best model\n",
"499997 timesteps\n",
"Best mean reward: 10.11 - Last mean reward per episode: 12.24 - Last mean moves per episode: 8.58\n",
"Saving new best model\n",
"509998 timesteps\n",
"Best mean reward: 12.24 - Last mean reward per episode: 14.10 - Last mean moves per episode: 8.35\n",
"Saving new best model\n",
"519997 timesteps\n",
"Best mean reward: 14.10 - Last mean reward per episode: 15.50 - Last mean moves per episode: 8.18\n",
"Saving new best model\n",
"529999 timesteps\n",
"Best mean reward: 15.50 - Last mean reward per episode: 16.69 - Last mean moves per episode: 8.05\n",
"Saving new best model\n",
"540000 timesteps\n",
"Best mean reward: 16.69 - Last mean reward per episode: 17.56 - Last mean moves per episode: 7.97\n",
"Saving new best model\n",
"549995 timesteps\n",
"Best mean reward: 17.56 - Last mean reward per episode: 18.30 - Last mean moves per episode: 7.89\n",
"Saving new best model\n",
"559994 timesteps\n",
"Best mean reward: 18.30 - Last mean reward per episode: 18.84 - Last mean moves per episode: 7.85\n",
"Saving new best model\n",
"569992 timesteps\n",
"Best mean reward: 18.84 - Last mean reward per episode: 19.57 - Last mean moves per episode: 7.77\n",
"Saving new best model\n",
"579991 timesteps\n",
"Best mean reward: 19.57 - Last mean reward per episode: 20.18 - Last mean moves per episode: 7.72\n",
"Saving new best model\n",
"589999 timesteps\n",
"Best mean reward: 20.18 - Last mean reward per episode: 20.43 - Last mean moves per episode: 7.70\n",
"Saving new best model\n",
"599999 timesteps\n",
"Best mean reward: 20.43 - Last mean reward per episode: 20.88 - Last mean moves per episode: 7.65\n",
"Saving new best model\n",
"609995 timesteps\n",
"Best mean reward: 20.88 - Last mean reward per episode: 21.29 - Last mean moves per episode: 7.60\n",
"Saving new best model\n",
"619988 timesteps\n",
"Best mean reward: 21.29 - Last mean reward per episode: 21.57 - Last mean moves per episode: 7.55\n",
"Saving new best model\n",
"630000 timesteps\n",
"Best mean reward: 21.57 - Last mean reward per episode: 21.90 - Last mean moves per episode: 7.52\n",
"Saving new best model\n",
"639997 timesteps\n",
"Best mean reward: 21.90 - Last mean reward per episode: 22.09 - Last mean moves per episode: 7.50\n",
"Saving new best model\n",
"649994 timesteps\n",
"Best mean reward: 22.09 - Last mean reward per episode: 22.17 - Last mean moves per episode: 7.49\n",
"Saving new best model\n",
"659998 timesteps\n",
"Best mean reward: 22.17 - Last mean reward per episode: 22.32 - Last mean moves per episode: 7.46\n",
"Saving new best model\n",
"669992 timesteps\n",
"Best mean reward: 22.32 - Last mean reward per episode: 22.50 - Last mean moves per episode: 7.44\n",
"Saving new best model\n",
"679996 timesteps\n",
"Best mean reward: 22.50 - Last mean reward per episode: 22.72 - Last mean moves per episode: 7.41\n",
"Saving new best model\n",
"689992 timesteps\n",
"Best mean reward: 22.72 - Last mean reward per episode: 22.79 - Last mean moves per episode: 7.40\n",
"Saving new best model\n",
"699999 timesteps\n",
"Best mean reward: 22.79 - Last mean reward per episode: 22.87 - Last mean moves per episode: 7.40\n",
"Saving new best model\n",
"709998 timesteps\n",
"Best mean reward: 22.87 - Last mean reward per episode: 22.77 - Last mean moves per episode: 7.40\n",
"719985 timesteps\n",
"Best mean reward: 22.87 - Last mean reward per episode: 22.65 - Last mean moves per episode: 7.40\n",
"729999 timesteps\n",
"Best mean reward: 22.87 - Last mean reward per episode: 22.57 - Last mean moves per episode: 7.42\n",
"739994 timesteps\n",
"Best mean reward: 22.87 - Last mean reward per episode: 22.58 - Last mean moves per episode: 7.42\n",
"749997 timesteps\n",
"Best mean reward: 22.87 - Last mean reward per episode: 22.50 - Last mean moves per episode: 7.42\n",
"760000 timesteps\n",
"Best mean reward: 22.87 - Last mean reward per episode: 22.52 - Last mean moves per episode: 7.43\n",
"770000 timesteps\n",
"Best mean reward: 22.87 - Last mean reward per episode: 22.73 - Last mean moves per episode: 7.42\n",
"779981 timesteps\n",
"Best mean reward: 22.87 - Last mean reward per episode: 23.10 - Last mean moves per episode: 7.38\n",
"Saving new best model\n",
"789997 timesteps\n",
"Best mean reward: 23.10 - Last mean reward per episode: 23.41 - Last mean moves per episode: 7.35\n",
"Saving new best model\n",
"799997 timesteps\n",
"Best mean reward: 23.41 - Last mean reward per episode: 23.98 - Last mean moves per episode: 7.30\n",
"Saving new best model\n",
"809994 timesteps\n",
"Best mean reward: 23.98 - Last mean reward per episode: 24.33 - Last mean moves per episode: 7.27\n",
"Saving new best model\n",
"819992 timesteps\n",
"Best mean reward: 24.33 - Last mean reward per episode: 24.73 - Last mean moves per episode: 7.24\n",
"Saving new best model\n",
"829997 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 24.89 - Last mean moves per episode: 7.22\n",
"Saving new best model\n",
"839995 timesteps\n",
"Best mean reward: 24.89 - Last mean reward per episode: 25.00 - Last mean moves per episode: 7.21\n",
"Saving new best model\n",
"849998 timesteps\n",
"Best mean reward: 25.00 - Last mean reward per episode: 25.07 - Last mean moves per episode: 7.20\n",
"Saving new best model\n",
"859982 timesteps\n",
"Best mean reward: 25.07 - Last mean reward per episode: 25.06 - Last mean moves per episode: 7.21\n",
"869996 timesteps\n",
"Best mean reward: 25.07 - Last mean reward per episode: 25.05 - Last mean moves per episode: 7.19\n",
"879999 timesteps\n",
"Best mean reward: 25.07 - Last mean reward per episode: 25.05 - Last mean moves per episode: 7.19\n",
"889997 timesteps\n",
"Best mean reward: 25.07 - Last mean reward per episode: 25.06 - Last mean moves per episode: 7.17\n",
"899993 timesteps\n",
"Best mean reward: 25.07 - Last mean reward per episode: 25.23 - Last mean moves per episode: 7.16\n",
"Saving new best model\n",
"909998 timesteps\n",
"Best mean reward: 25.23 - Last mean reward per episode: 25.23 - Last mean moves per episode: 7.16\n",
"Saving new best model\n",
"920000 timesteps\n",
"Best mean reward: 25.23 - Last mean reward per episode: 25.28 - Last mean moves per episode: 7.16\n",
"Saving new best model\n",
"930000 timesteps\n",
"Best mean reward: 25.28 - Last mean reward per episode: 25.41 - Last mean moves per episode: 7.14\n",
"Saving new best model\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"940000 timesteps\n",
"Best mean reward: 25.41 - Last mean reward per episode: 25.49 - Last mean moves per episode: 7.14\n",
"Saving new best model\n",
"949998 timesteps\n",
"Best mean reward: 25.49 - Last mean reward per episode: 25.56 - Last mean moves per episode: 7.14\n",
"Saving new best model\n",
"960000 timesteps\n",
"Best mean reward: 25.56 - Last mean reward per episode: 25.61 - Last mean moves per episode: 7.13\n",
"Saving new best model\n",
"969996 timesteps\n",
"Best mean reward: 25.61 - Last mean reward per episode: 25.75 - Last mean moves per episode: 7.13\n",
"Saving new best model\n",
"979997 timesteps\n",
"Best mean reward: 25.75 - Last mean reward per episode: 25.89 - Last mean moves per episode: 7.11\n",
"Saving new best model\n",
"989998 timesteps\n",
"Best mean reward: 25.89 - Last mean reward per episode: 25.85 - Last mean moves per episode: 7.14\n",
"1000000 timesteps\n",
"Best mean reward: 25.89 - Last mean reward per episode: 25.97 - Last mean moves per episode: 7.12\n",
"Saving new best model\n"
]
}
],
"source": [
"clear_session()\n",
"\n",
"# ships -- keep only one kind for 5x5 grid\n",
"ships = {}\n",
"ships['cruiser'] = 3\n",
"\n",
"grid_size = 5\n",
"num_timesteps = 1000000 # this is number of moves and not number of episodes\n",
"\n",
"best_mean_reward, n_steps, step_interval, episode_interval = -np.inf, 0, 10000, 10000\n",
"\n",
"# Instantiate the env\n",
"env = BattleshipEnv(enemy_board=None, ship_locs={}, grid_size=grid_size, ships=ships)\n",
"\n",
"# wrap it\n",
"log_dir = \"./gym/\"\n",
"os.makedirs(log_dir, exist_ok=True)\n",
"env = Monitor(env, filename=log_dir, allow_early_resets=True)\n",
"env = DummyVecEnv([lambda: env])\n",
"\n",
"# Train the agent - Note: best model is not save in Callback function for PPO2; save manually\n",
"model = A2C('MlpPolicy', env, verbose=0).learn(total_timesteps=num_timesteps, callback=callback)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#model.save(log_dir + 'best_model_cruiser_5x5.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plot_results(log_dir, 1000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Playing with One Ship on a Bigger Board"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9995 timesteps\n",
"Best mean reward: -inf - Last mean reward per episode: -103.43 - Last mean moves per episode: 27.14\n",
"Saving new best model\n",
"19977 timesteps\n",
"Best mean reward: -103.43 - Last mean reward per episode: -102.78 - Last mean moves per episode: 27.13\n",
"Saving new best model\n",
"29985 timesteps\n",
"Best mean reward: -102.78 - Last mean reward per episode: -101.01 - Last mean moves per episode: 27.01\n",
"Saving new best model\n",
"39979 timesteps\n",
"Best mean reward: -101.01 - Last mean reward per episode: -97.92 - Last mean moves per episode: 26.83\n",
"Saving new best model\n",
"49997 timesteps\n",
"Best mean reward: -97.92 - Last mean reward per episode: -94.76 - Last mean moves per episode: 26.63\n",
"Saving new best model\n",
"59982 timesteps\n",
"Best mean reward: -94.76 - Last mean reward per episode: -92.51 - Last mean moves per episode: 26.57\n",
"Saving new best model\n",
"69981 timesteps\n",
"Best mean reward: -92.51 - Last mean reward per episode: -89.68 - Last mean moves per episode: 26.38\n",
"Saving new best model\n",
"79976 timesteps\n",
"Best mean reward: -89.68 - Last mean reward per episode: -87.69 - Last mean moves per episode: 26.25\n",
"Saving new best model\n",
"89986 timesteps\n",
"Best mean reward: -87.69 - Last mean reward per episode: -85.75 - Last mean moves per episode: 26.09\n",
"Saving new best model\n",
"99987 timesteps\n",
"Best mean reward: -85.75 - Last mean reward per episode: -84.27 - Last mean moves per episode: 25.96\n",
"Saving new best model\n",
"109994 timesteps\n",
"Best mean reward: -84.27 - Last mean reward per episode: -83.16 - Last mean moves per episode: 25.82\n",
"Saving new best model\n",
"119999 timesteps\n",
"Best mean reward: -83.16 - Last mean reward per episode: -81.99 - Last mean moves per episode: 25.68\n",
"Saving new best model\n",
"129974 timesteps\n",
"Best mean reward: -81.99 - Last mean reward per episode: -80.97 - Last mean moves per episode: 25.57\n",
"Saving new best model\n",
"139994 timesteps\n",
"Best mean reward: -80.97 - Last mean reward per episode: -79.73 - Last mean moves per episode: 25.43\n",
"Saving new best model\n",
"149972 timesteps\n",
"Best mean reward: -79.73 - Last mean reward per episode: -78.46 - Last mean moves per episode: 25.23\n",
"Saving new best model\n",
"159991 timesteps\n",
"Best mean reward: -78.46 - Last mean reward per episode: -77.81 - Last mean moves per episode: 25.10\n",
"Saving new best model\n",
"169981 timesteps\n",
"Best mean reward: -77.81 - Last mean reward per episode: -77.07 - Last mean moves per episode: 24.98\n",
"Saving new best model\n",
"179991 timesteps\n",
"Best mean reward: -77.07 - Last mean reward per episode: -76.90 - Last mean moves per episode: 24.94\n",
"Saving new best model\n",
"189991 timesteps\n",
"Best mean reward: -76.90 - Last mean reward per episode: -76.30 - Last mean moves per episode: 24.83\n",
"Saving new best model\n",
"199997 timesteps\n",
"Best mean reward: -76.30 - Last mean reward per episode: -75.49 - Last mean moves per episode: 24.68\n",
"Saving new best model\n",
"209986 timesteps\n",
"Best mean reward: -75.49 - Last mean reward per episode: -74.85 - Last mean moves per episode: 24.56\n",
"Saving new best model\n",
"219973 timesteps\n",
"Best mean reward: -74.85 - Last mean reward per episode: -74.48 - Last mean moves per episode: 24.46\n",
"Saving new best model\n",
"229988 timesteps\n",
"Best mean reward: -74.48 - Last mean reward per episode: -74.09 - Last mean moves per episode: 24.38\n",
"Saving new best model\n",
"239987 timesteps\n",
"Best mean reward: -74.09 - Last mean reward per episode: -73.62 - Last mean moves per episode: 24.28\n",
"Saving new best model\n",
"249999 timesteps\n",
"Best mean reward: -73.62 - Last mean reward per episode: -72.37 - Last mean moves per episode: 24.11\n",
"Saving new best model\n",
"259982 timesteps\n",
"Best mean reward: -72.37 - Last mean reward per episode: -71.07 - Last mean moves per episode: 23.95\n",
"Saving new best model\n",
"269984 timesteps\n",
"Best mean reward: -71.07 - Last mean reward per episode: -69.96 - Last mean moves per episode: 23.80\n",
"Saving new best model\n",
"279969 timesteps\n",
"Best mean reward: -69.96 - Last mean reward per episode: -69.34 - Last mean moves per episode: 23.68\n",
"Saving new best model\n",
"289995 timesteps\n",
"Best mean reward: -69.34 - Last mean reward per episode: -68.49 - Last mean moves per episode: 23.53\n",
"Saving new best model\n",
"299980 timesteps\n",
"Best mean reward: -68.49 - Last mean reward per episode: -67.81 - Last mean moves per episode: 23.39\n",
"Saving new best model\n",
"309978 timesteps\n",
"Best mean reward: -67.81 - Last mean reward per episode: -67.73 - Last mean moves per episode: 23.29\n",
"Saving new best model\n",
"319982 timesteps\n",
"Best mean reward: -67.73 - Last mean reward per episode: -67.67 - Last mean moves per episode: 23.22\n",
"Saving new best model\n",
"329998 timesteps\n",
"Best mean reward: -67.67 - Last mean reward per episode: -67.32 - Last mean moves per episode: 23.10\n",
"Saving new best model\n",
"339997 timesteps\n",
"Best mean reward: -67.32 - Last mean reward per episode: -67.06 - Last mean moves per episode: 23.02\n",
"Saving new best model\n",
"349993 timesteps\n",
"Best mean reward: -67.06 - Last mean reward per episode: -66.68 - Last mean moves per episode: 22.95\n",
"Saving new best model\n",
"359991 timesteps\n",
"Best mean reward: -66.68 - Last mean reward per episode: -66.61 - Last mean moves per episode: 22.90\n",
"Saving new best model\n",
"369988 timesteps\n",
"Best mean reward: -66.61 - Last mean reward per episode: -66.90 - Last mean moves per episode: 22.87\n",
"379997 timesteps\n",
"Best mean reward: -66.61 - Last mean reward per episode: -67.15 - Last mean moves per episode: 22.87\n",
"389981 timesteps\n",
"Best mean reward: -66.61 - Last mean reward per episode: -67.01 - Last mean moves per episode: 22.87\n",
"399978 timesteps\n",
"Best mean reward: -66.61 - Last mean reward per episode: -66.97 - Last mean moves per episode: 22.86\n",
"409981 timesteps\n",
"Best mean reward: -66.61 - Last mean reward per episode: -66.45 - Last mean moves per episode: 22.80\n",
"Saving new best model\n",
"419982 timesteps\n",
"Best mean reward: -66.45 - Last mean reward per episode: -66.40 - Last mean moves per episode: 22.82\n",
"Saving new best model\n",
"429993 timesteps\n",
"Best mean reward: -66.40 - Last mean reward per episode: -66.35 - Last mean moves per episode: 22.85\n",
"Saving new best model\n",
"439995 timesteps\n",
"Best mean reward: -66.35 - Last mean reward per episode: -66.29 - Last mean moves per episode: 22.87\n",
"Saving new best model\n",
"449996 timesteps\n",
"Best mean reward: -66.29 - Last mean reward per episode: -65.80 - Last mean moves per episode: 22.85\n",
"Saving new best model\n",
"459982 timesteps\n",
"Best mean reward: -65.80 - Last mean reward per episode: -65.50 - Last mean moves per episode: 22.84\n",
"Saving new best model\n",
"469974 timesteps\n",
"Best mean reward: -65.50 - Last mean reward per episode: -65.45 - Last mean moves per episode: 22.84\n",
"Saving new best model\n",
"479977 timesteps\n",
"Best mean reward: -65.45 - Last mean reward per episode: -65.05 - Last mean moves per episode: 22.84\n",
"Saving new best model\n",
"489998 timesteps\n",
"Best mean reward: -65.05 - Last mean reward per episode: -64.33 - Last mean moves per episode: 22.78\n",
"Saving new best model\n",
"499997 timesteps\n",
"Best mean reward: -64.33 - Last mean reward per episode: -63.77 - Last mean moves per episode: 22.74\n",
"Saving new best model\n",
"509982 timesteps\n",
"Best mean reward: -63.77 - Last mean reward per episode: -63.36 - Last mean moves per episode: 22.70\n",
"Saving new best model\n",
"519986 timesteps\n",
"Best mean reward: -63.36 - Last mean reward per episode: -63.02 - Last mean moves per episode: 22.68\n",
"Saving new best model\n",
"529988 timesteps\n",
"Best mean reward: -63.02 - Last mean reward per episode: -62.68 - Last mean moves per episode: 22.66\n",
"Saving new best model\n",
"539988 timesteps\n",
"Best mean reward: -62.68 - Last mean reward per episode: -62.34 - Last mean moves per episode: 22.66\n",
"Saving new best model\n",
"549970 timesteps\n",
"Best mean reward: -62.34 - Last mean reward per episode: -61.56 - Last mean moves per episode: 22.63\n",
"Saving new best model\n",
"559995 timesteps\n",
"Best mean reward: -61.56 - Last mean reward per episode: -61.38 - Last mean moves per episode: 22.64\n",
"Saving new best model\n",
"569992 timesteps\n",
"Best mean reward: -61.38 - Last mean reward per episode: -61.43 - Last mean moves per episode: 22.64\n",
"579991 timesteps\n",
"Best mean reward: -61.38 - Last mean reward per episode: -61.19 - Last mean moves per episode: 22.63\n",
"Saving new best model\n",
"589978 timesteps\n",
"Best mean reward: -61.19 - Last mean reward per episode: -61.01 - Last mean moves per episode: 22.66\n",
"Saving new best model\n",
"599981 timesteps\n",
"Best mean reward: -61.01 - Last mean reward per episode: -60.01 - Last mean moves per episode: 22.61\n",
"Saving new best model\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"609983 timesteps\n",
"Best mean reward: -60.01 - Last mean reward per episode: -59.90 - Last mean moves per episode: 22.64\n",
"Saving new best model\n",
"619990 timesteps\n",
"Best mean reward: -59.90 - Last mean reward per episode: -59.27 - Last mean moves per episode: 22.57\n",
"Saving new best model\n",
"629988 timesteps\n",
"Best mean reward: -59.27 - Last mean reward per episode: -58.82 - Last mean moves per episode: 22.51\n",
"Saving new best model\n",
"639985 timesteps\n",
"Best mean reward: -58.82 - Last mean reward per episode: -58.70 - Last mean moves per episode: 22.48\n",
"Saving new best model\n",
"650000 timesteps\n",
"Best mean reward: -58.70 - Last mean reward per episode: -58.65 - Last mean moves per episode: 22.45\n",
"Saving new best model\n",
"659969 timesteps\n",
"Best mean reward: -58.65 - Last mean reward per episode: -58.28 - Last mean moves per episode: 22.45\n",
"Saving new best model\n",
"669995 timesteps\n",
"Best mean reward: -58.28 - Last mean reward per episode: -57.94 - Last mean moves per episode: 22.44\n",
"Saving new best model\n",
"679998 timesteps\n",
"Best mean reward: -57.94 - Last mean reward per episode: -57.73 - Last mean moves per episode: 22.45\n",
"Saving new best model\n",
"689971 timesteps\n",
"Best mean reward: -57.73 - Last mean reward per episode: -57.73 - Last mean moves per episode: 22.48\n",
"699998 timesteps\n",
"Best mean reward: -57.73 - Last mean reward per episode: -57.54 - Last mean moves per episode: 22.50\n",
"Saving new best model\n",
"709994 timesteps\n",
"Best mean reward: -57.54 - Last mean reward per episode: -57.05 - Last mean moves per episode: 22.44\n",
"Saving new best model\n",
"719996 timesteps\n",
"Best mean reward: -57.05 - Last mean reward per episode: -57.12 - Last mean moves per episode: 22.45\n",
"729996 timesteps\n",
"Best mean reward: -57.05 - Last mean reward per episode: -56.77 - Last mean moves per episode: 22.45\n",
"Saving new best model\n",
"739984 timesteps\n",
"Best mean reward: -56.77 - Last mean reward per episode: -56.59 - Last mean moves per episode: 22.46\n",
"Saving new best model\n",
"749982 timesteps\n",
"Best mean reward: -56.59 - Last mean reward per episode: -55.95 - Last mean moves per episode: 22.40\n",
"Saving new best model\n",
"759986 timesteps\n",
"Best mean reward: -55.95 - Last mean reward per episode: -55.64 - Last mean moves per episode: 22.40\n",
"Saving new best model\n",
"770000 timesteps\n",
"Best mean reward: -55.64 - Last mean reward per episode: -54.87 - Last mean moves per episode: 22.37\n",
"Saving new best model\n",
"779973 timesteps\n",
"Best mean reward: -54.87 - Last mean reward per episode: -54.60 - Last mean moves per episode: 22.32\n",
"Saving new best model\n",
"789974 timesteps\n",
"Best mean reward: -54.60 - Last mean reward per episode: -54.07 - Last mean moves per episode: 22.31\n",
"Saving new best model\n",
"799972 timesteps\n",
"Best mean reward: -54.07 - Last mean reward per episode: -53.31 - Last mean moves per episode: 22.27\n",
"Saving new best model\n",
"809974 timesteps\n",
"Best mean reward: -53.31 - Last mean reward per episode: -52.76 - Last mean moves per episode: 22.22\n",
"Saving new best model\n",
"819987 timesteps\n",
"Best mean reward: -52.76 - Last mean reward per episode: -52.52 - Last mean moves per episode: 22.22\n",
"Saving new best model\n",
"829980 timesteps\n",
"Best mean reward: -52.52 - Last mean reward per episode: -52.31 - Last mean moves per episode: 22.24\n",
"Saving new best model\n",
"839988 timesteps\n",
"Best mean reward: -52.31 - Last mean reward per episode: -51.79 - Last mean moves per episode: 22.24\n",
"Saving new best model\n",
"849977 timesteps\n",
"Best mean reward: -51.79 - Last mean reward per episode: -52.17 - Last mean moves per episode: 22.33\n",
"859993 timesteps\n",
"Best mean reward: -51.79 - Last mean reward per episode: -50.95 - Last mean moves per episode: 22.26\n",
"Saving new best model\n",
"869982 timesteps\n",
"Best mean reward: -50.95 - Last mean reward per episode: -50.38 - Last mean moves per episode: 22.25\n",
"Saving new best model\n",
"879996 timesteps\n",
"Best mean reward: -50.38 - Last mean reward per episode: -49.62 - Last mean moves per episode: 22.20\n",
"Saving new best model\n",
"889991 timesteps\n",
"Best mean reward: -49.62 - Last mean reward per episode: -49.72 - Last mean moves per episode: 22.23\n",
"899986 timesteps\n",
"Best mean reward: -49.62 - Last mean reward per episode: -49.42 - Last mean moves per episode: 22.22\n",
"Saving new best model\n",
"909995 timesteps\n",
"Best mean reward: -49.42 - Last mean reward per episode: -48.53 - Last mean moves per episode: 22.17\n",
"Saving new best model\n",
"919999 timesteps\n",
"Best mean reward: -48.53 - Last mean reward per episode: -47.84 - Last mean moves per episode: 22.11\n",
"Saving new best model\n",
"929984 timesteps\n",
"Best mean reward: -47.84 - Last mean reward per episode: -47.19 - Last mean moves per episode: 22.10\n",
"Saving new best model\n",
"939999 timesteps\n",
"Best mean reward: -47.19 - Last mean reward per episode: -46.65 - Last mean moves per episode: 22.09\n",
"Saving new best model\n",
"949986 timesteps\n",
"Best mean reward: -46.65 - Last mean reward per episode: -46.05 - Last mean moves per episode: 22.05\n",
"Saving new best model\n",
"959977 timesteps\n",
"Best mean reward: -46.05 - Last mean reward per episode: -45.21 - Last mean moves per episode: 22.02\n",
"Saving new best model\n",
"969999 timesteps\n",
"Best mean reward: -45.21 - Last mean reward per episode: -44.90 - Last mean moves per episode: 22.00\n",
"Saving new best model\n",
"979999 timesteps\n",
"Best mean reward: -44.90 - Last mean reward per episode: -44.32 - Last mean moves per episode: 21.95\n",
"Saving new best model\n",
"989995 timesteps\n",
"Best mean reward: -44.32 - Last mean reward per episode: -44.12 - Last mean moves per episode: 21.96\n",
"Saving new best model\n",
"999986 timesteps\n",
"Best mean reward: -44.12 - Last mean reward per episode: -43.76 - Last mean moves per episode: 21.99\n",
"Saving new best model\n",
"1009987 timesteps\n",
"Best mean reward: -43.76 - Last mean reward per episode: -43.26 - Last mean moves per episode: 21.96\n",
"Saving new best model\n",
"1019993 timesteps\n",
"Best mean reward: -43.26 - Last mean reward per episode: -42.69 - Last mean moves per episode: 21.92\n",
"Saving new best model\n",
"1029996 timesteps\n",
"Best mean reward: -42.69 - Last mean reward per episode: -41.97 - Last mean moves per episode: 21.86\n",
"Saving new best model\n",
"1039990 timesteps\n",
"Best mean reward: -41.97 - Last mean reward per episode: -40.98 - Last mean moves per episode: 21.75\n",
"Saving new best model\n",
"1049988 timesteps\n",
"Best mean reward: -40.98 - Last mean reward per episode: -40.19 - Last mean moves per episode: 21.65\n",
"Saving new best model\n",
"1059987 timesteps\n",
"Best mean reward: -40.19 - Last mean reward per episode: -39.28 - Last mean moves per episode: 21.58\n",
"Saving new best model\n",
"1069997 timesteps\n",
"Best mean reward: -39.28 - Last mean reward per episode: -38.45 - Last mean moves per episode: 21.48\n",
"Saving new best model\n",
"1079989 timesteps\n",
"Best mean reward: -38.45 - Last mean reward per episode: -38.14 - Last mean moves per episode: 21.45\n",
"Saving new best model\n",
"1089969 timesteps\n",
"Best mean reward: -38.14 - Last mean reward per episode: -37.69 - Last mean moves per episode: 21.40\n",
"Saving new best model\n",
"1099991 timesteps\n",
"Best mean reward: -37.69 - Last mean reward per episode: -37.41 - Last mean moves per episode: 21.35\n",
"Saving new best model\n",
"1109996 timesteps\n",
"Best mean reward: -37.41 - Last mean reward per episode: -36.95 - Last mean moves per episode: 21.28\n",
"Saving new best model\n",
"1119992 timesteps\n",
"Best mean reward: -36.95 - Last mean reward per episode: -36.35 - Last mean moves per episode: 21.21\n",
"Saving new best model\n",
"1129986 timesteps\n",
"Best mean reward: -36.35 - Last mean reward per episode: -35.38 - Last mean moves per episode: 21.10\n",
"Saving new best model\n",
"1139987 timesteps\n",
"Best mean reward: -35.38 - Last mean reward per episode: -35.54 - Last mean moves per episode: 21.10\n",
"1149984 timesteps\n",
"Best mean reward: -35.38 - Last mean reward per episode: -35.07 - Last mean moves per episode: 21.02\n",
"Saving new best model\n",
"1159993 timesteps\n",
"Best mean reward: -35.07 - Last mean reward per episode: -34.78 - Last mean moves per episode: 20.97\n",
"Saving new best model\n",
"1169983 timesteps\n",
"Best mean reward: -34.78 - Last mean reward per episode: -34.31 - Last mean moves per episode: 20.89\n",
"Saving new best model\n",
"1179987 timesteps\n",
"Best mean reward: -34.31 - Last mean reward per episode: -34.09 - Last mean moves per episode: 20.86\n",
"Saving new best model\n",
"1189995 timesteps\n",
"Best mean reward: -34.09 - Last mean reward per episode: -34.35 - Last mean moves per episode: 20.83\n",
"1199996 timesteps\n",
"Best mean reward: -34.09 - Last mean reward per episode: -33.12 - Last mean moves per episode: 20.64\n",
"Saving new best model\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1209997 timesteps\n",
"Best mean reward: -33.12 - Last mean reward per episode: -32.45 - Last mean moves per episode: 20.54\n",
"Saving new best model\n",
"1219982 timesteps\n",
"Best mean reward: -32.45 - Last mean reward per episode: -31.67 - Last mean moves per episode: 20.41\n",
"Saving new best model\n",
"1229986 timesteps\n",
"Best mean reward: -31.67 - Last mean reward per episode: -31.61 - Last mean moves per episode: 20.38\n",
"Saving new best model\n",
"1239987 timesteps\n",
"Best mean reward: -31.61 - Last mean reward per episode: -31.03 - Last mean moves per episode: 20.28\n",
"Saving new best model\n",
"1249987 timesteps\n",
"Best mean reward: -31.03 - Last mean reward per episode: -30.58 - Last mean moves per episode: 20.18\n",
"Saving new best model\n",
"1259993 timesteps\n",
"Best mean reward: -30.58 - Last mean reward per episode: -30.15 - Last mean moves per episode: 20.07\n",
"Saving new best model\n",
"1269999 timesteps\n",
"Best mean reward: -30.15 - Last mean reward per episode: -29.86 - Last mean moves per episode: 19.99\n",
"Saving new best model\n",
"1279978 timesteps\n",
"Best mean reward: -29.86 - Last mean reward per episode: -29.30 - Last mean moves per episode: 19.91\n",
"Saving new best model\n",
"1289999 timesteps\n",
"Best mean reward: -29.30 - Last mean reward per episode: -29.38 - Last mean moves per episode: 19.85\n",
"1299975 timesteps\n",
"Best mean reward: -29.30 - Last mean reward per episode: -29.58 - Last mean moves per episode: 19.82\n",
"1309981 timesteps\n",
"Best mean reward: -29.30 - Last mean reward per episode: -29.27 - Last mean moves per episode: 19.73\n",
"Saving new best model\n",
"1319997 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -30.11 - Last mean moves per episode: 19.73\n",
"1329998 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -30.51 - Last mean moves per episode: 19.72\n",
"1339989 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -30.54 - Last mean moves per episode: 19.66\n",
"1349993 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -30.66 - Last mean moves per episode: 19.59\n",
"1359998 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -31.14 - Last mean moves per episode: 19.59\n",
"1369999 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -31.52 - Last mean moves per episode: 19.57\n",
"1379983 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -30.43 - Last mean moves per episode: 19.47\n",
"1389994 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -30.40 - Last mean moves per episode: 19.46\n",
"1400000 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -30.93 - Last mean moves per episode: 19.46\n",
"1410000 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -31.39 - Last mean moves per episode: 19.47\n",
"1419992 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -31.39 - Last mean moves per episode: 19.43\n",
"1429996 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -31.85 - Last mean moves per episode: 19.48\n",
"1439990 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -32.04 - Last mean moves per episode: 19.49\n",
"1449994 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -32.73 - Last mean moves per episode: 19.58\n",
"1459985 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -32.92 - Last mean moves per episode: 19.64\n",
"1469976 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -33.26 - Last mean moves per episode: 19.67\n",
"1479997 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -33.41 - Last mean moves per episode: 19.65\n",
"1489981 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -33.91 - Last mean moves per episode: 19.70\n",
"1499988 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -33.51 - Last mean moves per episode: 19.69\n",
"1510000 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -33.91 - Last mean moves per episode: 19.74\n",
"1519995 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -34.15 - Last mean moves per episode: 19.77\n",
"1529998 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -34.16 - Last mean moves per episode: 19.78\n",
"1539988 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -34.57 - Last mean moves per episode: 19.83\n",
"1549976 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -34.84 - Last mean moves per episode: 19.89\n",
"1559999 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -34.65 - Last mean moves per episode: 19.87\n",
"1569981 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -35.64 - Last mean moves per episode: 19.95\n",
"1579991 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -36.81 - Last mean moves per episode: 20.03\n",
"1589995 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -37.79 - Last mean moves per episode: 20.10\n",
"1599999 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -38.56 - Last mean moves per episode: 20.17\n",
"1609979 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -39.42 - Last mean moves per episode: 20.26\n",
"1619998 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -39.83 - Last mean moves per episode: 20.33\n",
"1629971 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -40.46 - Last mean moves per episode: 20.37\n",
"1639986 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -40.37 - Last mean moves per episode: 20.39\n",
"1649987 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -40.76 - Last mean moves per episode: 20.40\n",
"1659998 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -40.50 - Last mean moves per episode: 20.38\n",
"1669998 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -40.82 - Last mean moves per episode: 20.41\n",
"1679993 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -40.01 - Last mean moves per episode: 20.33\n",
"1689999 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -39.16 - Last mean moves per episode: 20.25\n",
"1699991 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -38.91 - Last mean moves per episode: 20.21\n",
"1709998 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -38.43 - Last mean moves per episode: 20.16\n",
"1719990 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -37.10 - Last mean moves per episode: 20.05\n",
"1729987 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -36.15 - Last mean moves per episode: 19.96\n",
"1739994 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -35.21 - Last mean moves per episode: 19.89\n",
"1749994 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -33.31 - Last mean moves per episode: 19.71\n",
"1759999 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -32.08 - Last mean moves per episode: 19.61\n",
"1769997 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -30.44 - Last mean moves per episode: 19.48\n",
"1779996 timesteps\n",
"Best mean reward: -29.27 - Last mean reward per episode: -28.92 - Last mean moves per episode: 19.34\n",
"Saving new best model\n",
"1789984 timesteps\n",
"Best mean reward: -28.92 - Last mean reward per episode: -27.63 - Last mean moves per episode: 19.22\n",
"Saving new best model\n",
"1799983 timesteps\n",
"Best mean reward: -27.63 - Last mean reward per episode: -25.92 - Last mean moves per episode: 19.03\n",
"Saving new best model\n",
"1809980 timesteps\n",
"Best mean reward: -25.92 - Last mean reward per episode: -24.89 - Last mean moves per episode: 18.89\n",
"Saving new best model\n",
"1819995 timesteps\n",
"Best mean reward: -24.89 - Last mean reward per episode: -23.37 - Last mean moves per episode: 18.66\n",
"Saving new best model\n",
"1830000 timesteps\n",
"Best mean reward: -23.37 - Last mean reward per episode: -22.23 - Last mean moves per episode: 18.47\n",
"Saving new best model\n",
"1840000 timesteps\n",
"Best mean reward: -22.23 - Last mean reward per episode: -21.40 - Last mean moves per episode: 18.34\n",
"Saving new best model\n",
"1849996 timesteps\n",
"Best mean reward: -21.40 - Last mean reward per episode: -20.54 - Last mean moves per episode: 18.21\n",
"Saving new best model\n",
"1860000 timesteps\n",
"Best mean reward: -20.54 - Last mean reward per episode: -19.86 - Last mean moves per episode: 18.11\n",
"Saving new best model\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1869995 timesteps\n",
"Best mean reward: -19.86 - Last mean reward per episode: -19.34 - Last mean moves per episode: 18.01\n",
"Saving new best model\n",
"1879999 timesteps\n",
"Best mean reward: -19.34 - Last mean reward per episode: -18.20 - Last mean moves per episode: 17.86\n",
"Saving new best model\n",
"1889993 timesteps\n",
"Best mean reward: -18.20 - Last mean reward per episode: -17.04 - Last mean moves per episode: 17.72\n",
"Saving new best model\n",
"1899991 timesteps\n",
"Best mean reward: -17.04 - Last mean reward per episode: -16.06 - Last mean moves per episode: 17.56\n",
"Saving new best model\n",
"1909985 timesteps\n",
"Best mean reward: -16.06 - Last mean reward per episode: -14.49 - Last mean moves per episode: 17.35\n",
"Saving new best model\n",
"1919994 timesteps\n",
"Best mean reward: -14.49 - Last mean reward per episode: -14.02 - Last mean moves per episode: 17.24\n",
"Saving new best model\n",
"1929979 timesteps\n",
"Best mean reward: -14.02 - Last mean reward per episode: -13.38 - Last mean moves per episode: 17.13\n",
"Saving new best model\n",
"1939971 timesteps\n",
"Best mean reward: -13.38 - Last mean reward per episode: -12.44 - Last mean moves per episode: 16.95\n",
"Saving new best model\n",
"1949999 timesteps\n",
"Best mean reward: -12.44 - Last mean reward per episode: -11.17 - Last mean moves per episode: 16.77\n",
"Saving new best model\n",
"1959985 timesteps\n",
"Best mean reward: -11.17 - Last mean reward per episode: -10.49 - Last mean moves per episode: 16.65\n",
"Saving new best model\n",
"1969997 timesteps\n",
"Best mean reward: -10.49 - Last mean reward per episode: -9.61 - Last mean moves per episode: 16.54\n",
"Saving new best model\n",
"1979997 timesteps\n",
"Best mean reward: -9.61 - Last mean reward per episode: -9.30 - Last mean moves per episode: 16.51\n",
"Saving new best model\n",
"1989986 timesteps\n",
"Best mean reward: -9.30 - Last mean reward per episode: -8.21 - Last mean moves per episode: 16.35\n",
"Saving new best model\n",
"1999996 timesteps\n",
"Best mean reward: -8.21 - Last mean reward per episode: -7.90 - Last mean moves per episode: 16.27\n",
"Saving new best model\n",
"2009980 timesteps\n",
"Best mean reward: -7.90 - Last mean reward per episode: -6.57 - Last mean moves per episode: 16.05\n",
"Saving new best model\n",
"2019987 timesteps\n",
"Best mean reward: -6.57 - Last mean reward per episode: -5.78 - Last mean moves per episode: 15.90\n",
"Saving new best model\n",
"2029982 timesteps\n",
"Best mean reward: -5.78 - Last mean reward per episode: -4.32 - Last mean moves per episode: 15.69\n",
"Saving new best model\n",
"2039992 timesteps\n",
"Best mean reward: -4.32 - Last mean reward per episode: -3.52 - Last mean moves per episode: 15.52\n",
"Saving new best model\n",
"2049988 timesteps\n",
"Best mean reward: -3.52 - Last mean reward per episode: -2.98 - Last mean moves per episode: 15.40\n",
"Saving new best model\n",
"2059989 timesteps\n",
"Best mean reward: -2.98 - Last mean reward per episode: -2.59 - Last mean moves per episode: 15.27\n",
"Saving new best model\n",
"2069993 timesteps\n",
"Best mean reward: -2.59 - Last mean reward per episode: -2.01 - Last mean moves per episode: 15.10\n",
"Saving new best model\n",
"2079979 timesteps\n",
"Best mean reward: -2.01 - Last mean reward per episode: -1.47 - Last mean moves per episode: 14.93\n",
"Saving new best model\n",
"2089998 timesteps\n",
"Best mean reward: -1.47 - Last mean reward per episode: -1.01 - Last mean moves per episode: 14.76\n",
"Saving new best model\n",
"2099992 timesteps\n",
"Best mean reward: -1.01 - Last mean reward per episode: -0.68 - Last mean moves per episode: 14.63\n",
"Saving new best model\n",
"2109982 timesteps\n",
"Best mean reward: -0.68 - Last mean reward per episode: -0.37 - Last mean moves per episode: 14.51\n",
"Saving new best model\n",
"2119995 timesteps\n",
"Best mean reward: -0.37 - Last mean reward per episode: -0.07 - Last mean moves per episode: 14.38\n",
"Saving new best model\n",
"2129988 timesteps\n",
"Best mean reward: -0.07 - Last mean reward per episode: 0.44 - Last mean moves per episode: 14.26\n",
"Saving new best model\n",
"2139996 timesteps\n",
"Best mean reward: 0.44 - Last mean reward per episode: 1.26 - Last mean moves per episode: 14.15\n",
"Saving new best model\n",
"2150000 timesteps\n",
"Best mean reward: 1.26 - Last mean reward per episode: 1.45 - Last mean moves per episode: 14.10\n",
"Saving new best model\n",
"2159984 timesteps\n",
"Best mean reward: 1.45 - Last mean reward per episode: 1.47 - Last mean moves per episode: 14.08\n",
"Saving new best model\n",
"2170000 timesteps\n",
"Best mean reward: 1.47 - Last mean reward per episode: 1.38 - Last mean moves per episode: 14.06\n",
"2179977 timesteps\n",
"Best mean reward: 1.47 - Last mean reward per episode: 1.48 - Last mean moves per episode: 14.00\n",
"Saving new best model\n",
"2189995 timesteps\n",
"Best mean reward: 1.48 - Last mean reward per episode: 1.72 - Last mean moves per episode: 13.96\n",
"Saving new best model\n",
"2199993 timesteps\n",
"Best mean reward: 1.72 - Last mean reward per episode: 1.99 - Last mean moves per episode: 13.92\n",
"Saving new best model\n",
"2209992 timesteps\n",
"Best mean reward: 1.99 - Last mean reward per episode: 1.82 - Last mean moves per episode: 13.95\n",
"2219996 timesteps\n",
"Best mean reward: 1.99 - Last mean reward per episode: 1.42 - Last mean moves per episode: 14.01\n",
"2229995 timesteps\n",
"Best mean reward: 1.99 - Last mean reward per episode: 1.76 - Last mean moves per episode: 14.02\n",
"2239998 timesteps\n",
"Best mean reward: 1.99 - Last mean reward per episode: 2.69 - Last mean moves per episode: 13.93\n",
"Saving new best model\n",
"2249986 timesteps\n",
"Best mean reward: 2.69 - Last mean reward per episode: 3.67 - Last mean moves per episode: 13.84\n",
"Saving new best model\n",
"2259988 timesteps\n",
"Best mean reward: 3.67 - Last mean reward per episode: 4.12 - Last mean moves per episode: 13.79\n",
"Saving new best model\n",
"2269979 timesteps\n",
"Best mean reward: 4.12 - Last mean reward per episode: 4.69 - Last mean moves per episode: 13.70\n",
"Saving new best model\n",
"2279996 timesteps\n",
"Best mean reward: 4.69 - Last mean reward per episode: 5.13 - Last mean moves per episode: 13.63\n",
"Saving new best model\n",
"2289997 timesteps\n",
"Best mean reward: 5.13 - Last mean reward per episode: 5.71 - Last mean moves per episode: 13.55\n",
"Saving new best model\n",
"2299984 timesteps\n",
"Best mean reward: 5.71 - Last mean reward per episode: 6.00 - Last mean moves per episode: 13.45\n",
"Saving new best model\n",
"2309992 timesteps\n",
"Best mean reward: 6.00 - Last mean reward per episode: 6.21 - Last mean moves per episode: 13.40\n",
"Saving new best model\n",
"2319985 timesteps\n",
"Best mean reward: 6.21 - Last mean reward per episode: 6.56 - Last mean moves per episode: 13.30\n",
"Saving new best model\n",
"2329985 timesteps\n",
"Best mean reward: 6.56 - Last mean reward per episode: 6.69 - Last mean moves per episode: 13.22\n",
"Saving new best model\n",
"2339975 timesteps\n",
"Best mean reward: 6.69 - Last mean reward per episode: 7.55 - Last mean moves per episode: 13.08\n",
"Saving new best model\n",
"2349985 timesteps\n",
"Best mean reward: 7.55 - Last mean reward per episode: 8.52 - Last mean moves per episode: 12.92\n",
"Saving new best model\n",
"2359995 timesteps\n",
"Best mean reward: 8.52 - Last mean reward per episode: 8.72 - Last mean moves per episode: 12.84\n",
"Saving new best model\n",
"2369999 timesteps\n",
"Best mean reward: 8.72 - Last mean reward per episode: 8.69 - Last mean moves per episode: 12.80\n",
"2379994 timesteps\n",
"Best mean reward: 8.72 - Last mean reward per episode: 8.91 - Last mean moves per episode: 12.70\n",
"Saving new best model\n",
"2389978 timesteps\n",
"Best mean reward: 8.91 - Last mean reward per episode: 9.01 - Last mean moves per episode: 12.64\n",
"Saving new best model\n",
"2399994 timesteps\n",
"Best mean reward: 9.01 - Last mean reward per episode: 9.63 - Last mean moves per episode: 12.52\n",
"Saving new best model\n",
"2409998 timesteps\n",
"Best mean reward: 9.63 - Last mean reward per episode: 9.99 - Last mean moves per episode: 12.47\n",
"Saving new best model\n",
"2419991 timesteps\n",
"Best mean reward: 9.99 - Last mean reward per episode: 10.21 - Last mean moves per episode: 12.46\n",
"Saving new best model\n",
"2429981 timesteps\n",
"Best mean reward: 10.21 - Last mean reward per episode: 10.92 - Last mean moves per episode: 12.40\n",
"Saving new best model\n",
"2439998 timesteps\n",
"Best mean reward: 10.92 - Last mean reward per episode: 11.50 - Last mean moves per episode: 12.37\n",
"Saving new best model\n",
"2449995 timesteps\n",
"Best mean reward: 11.50 - Last mean reward per episode: 11.91 - Last mean moves per episode: 12.31\n",
"Saving new best model\n",
"2459992 timesteps\n",
"Best mean reward: 11.91 - Last mean reward per episode: 12.38 - Last mean moves per episode: 12.29\n",
"Saving new best model\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"2469994 timesteps\n",
"Best mean reward: 12.38 - Last mean reward per episode: 12.32 - Last mean moves per episode: 12.30\n",
"2479998 timesteps\n",
"Best mean reward: 12.38 - Last mean reward per episode: 12.39 - Last mean moves per episode: 12.31\n",
"Saving new best model\n",
"2490000 timesteps\n",
"Best mean reward: 12.39 - Last mean reward per episode: 12.48 - Last mean moves per episode: 12.31\n",
"Saving new best model\n",
"2499978 timesteps\n",
"Best mean reward: 12.48 - Last mean reward per episode: 12.70 - Last mean moves per episode: 12.35\n",
"Saving new best model\n",
"2509989 timesteps\n",
"Best mean reward: 12.70 - Last mean reward per episode: 13.20 - Last mean moves per episode: 12.28\n",
"Saving new best model\n",
"2520000 timesteps\n",
"Best mean reward: 13.20 - Last mean reward per episode: 13.05 - Last mean moves per episode: 12.31\n",
"2529983 timesteps\n",
"Best mean reward: 13.20 - Last mean reward per episode: 13.61 - Last mean moves per episode: 12.23\n",
"Saving new best model\n",
"2539996 timesteps\n",
"Best mean reward: 13.61 - Last mean reward per episode: 13.64 - Last mean moves per episode: 12.21\n",
"Saving new best model\n",
"2549976 timesteps\n",
"Best mean reward: 13.64 - Last mean reward per episode: 12.57 - Last mean moves per episode: 12.33\n",
"2560000 timesteps\n",
"Best mean reward: 13.64 - Last mean reward per episode: 12.93 - Last mean moves per episode: 12.26\n",
"2569992 timesteps\n",
"Best mean reward: 13.64 - Last mean reward per episode: 13.05 - Last mean moves per episode: 12.24\n",
"2579999 timesteps\n",
"Best mean reward: 13.64 - Last mean reward per episode: 12.87 - Last mean moves per episode: 12.25\n",
"2589999 timesteps\n",
"Best mean reward: 13.64 - Last mean reward per episode: 13.28 - Last mean moves per episode: 12.15\n",
"2599974 timesteps\n",
"Best mean reward: 13.64 - Last mean reward per episode: 13.88 - Last mean moves per episode: 12.08\n",
"Saving new best model\n",
"2609971 timesteps\n",
"Best mean reward: 13.88 - Last mean reward per episode: 14.43 - Last mean moves per episode: 11.98\n",
"Saving new best model\n",
"2619998 timesteps\n",
"Best mean reward: 14.43 - Last mean reward per episode: 14.44 - Last mean moves per episode: 11.96\n",
"Saving new best model\n",
"2629995 timesteps\n",
"Best mean reward: 14.44 - Last mean reward per episode: 14.69 - Last mean moves per episode: 11.93\n",
"Saving new best model\n",
"2639994 timesteps\n",
"Best mean reward: 14.69 - Last mean reward per episode: 15.04 - Last mean moves per episode: 11.89\n",
"Saving new best model\n",
"2649998 timesteps\n",
"Best mean reward: 15.04 - Last mean reward per episode: 15.16 - Last mean moves per episode: 11.88\n",
"Saving new best model\n",
"2659995 timesteps\n",
"Best mean reward: 15.16 - Last mean reward per episode: 15.90 - Last mean moves per episode: 11.80\n",
"Saving new best model\n",
"2669994 timesteps\n",
"Best mean reward: 15.90 - Last mean reward per episode: 16.74 - Last mean moves per episode: 11.70\n",
"Saving new best model\n",
"2679989 timesteps\n",
"Best mean reward: 16.74 - Last mean reward per episode: 17.04 - Last mean moves per episode: 11.66\n",
"Saving new best model\n",
"2689970 timesteps\n",
"Best mean reward: 17.04 - Last mean reward per episode: 17.34 - Last mean moves per episode: 11.66\n",
"Saving new best model\n",
"2699991 timesteps\n",
"Best mean reward: 17.34 - Last mean reward per episode: 17.62 - Last mean moves per episode: 11.62\n",
"Saving new best model\n",
"2709999 timesteps\n",
"Best mean reward: 17.62 - Last mean reward per episode: 17.62 - Last mean moves per episode: 11.63\n",
"2719990 timesteps\n",
"Best mean reward: 17.62 - Last mean reward per episode: 17.97 - Last mean moves per episode: 11.59\n",
"Saving new best model\n",
"2729993 timesteps\n",
"Best mean reward: 17.97 - Last mean reward per episode: 18.29 - Last mean moves per episode: 11.54\n",
"Saving new best model\n",
"2739982 timesteps\n",
"Best mean reward: 18.29 - Last mean reward per episode: 18.29 - Last mean moves per episode: 11.54\n",
"Saving new best model\n",
"2749996 timesteps\n",
"Best mean reward: 18.29 - Last mean reward per episode: 18.09 - Last mean moves per episode: 11.55\n",
"2759999 timesteps\n",
"Best mean reward: 18.29 - Last mean reward per episode: 18.44 - Last mean moves per episode: 11.47\n",
"Saving new best model\n",
"2769969 timesteps\n",
"Best mean reward: 18.44 - Last mean reward per episode: 18.48 - Last mean moves per episode: 11.43\n",
"Saving new best model\n",
"2779987 timesteps\n",
"Best mean reward: 18.48 - Last mean reward per episode: 18.50 - Last mean moves per episode: 11.42\n",
"Saving new best model\n",
"2789997 timesteps\n",
"Best mean reward: 18.50 - Last mean reward per episode: 18.78 - Last mean moves per episode: 11.37\n",
"Saving new best model\n",
"2799976 timesteps\n",
"Best mean reward: 18.78 - Last mean reward per episode: 18.78 - Last mean moves per episode: 11.35\n",
"Saving new best model\n",
"2809999 timesteps\n",
"Best mean reward: 18.78 - Last mean reward per episode: 18.76 - Last mean moves per episode: 11.35\n",
"2819994 timesteps\n",
"Best mean reward: 18.78 - Last mean reward per episode: 19.16 - Last mean moves per episode: 11.32\n",
"Saving new best model\n",
"2829993 timesteps\n",
"Best mean reward: 19.16 - Last mean reward per episode: 19.29 - Last mean moves per episode: 11.29\n",
"Saving new best model\n",
"2839988 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.80 - Last mean moves per episode: 11.32\n",
"2849998 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 19.01 - Last mean moves per episode: 11.28\n",
"2859993 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 19.07 - Last mean moves per episode: 11.28\n",
"2869997 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 19.21 - Last mean moves per episode: 11.26\n",
"2879984 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.66 - Last mean moves per episode: 11.33\n",
"2889971 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.86 - Last mean moves per episode: 11.29\n",
"2899996 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.91 - Last mean moves per episode: 11.27\n",
"2909981 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.69 - Last mean moves per episode: 11.28\n",
"2919990 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.91 - Last mean moves per episode: 11.23\n",
"2929992 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.77 - Last mean moves per episode: 11.20\n",
"2939998 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.39 - Last mean moves per episode: 11.24\n",
"2949999 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.61 - Last mean moves per episode: 11.21\n",
"2959993 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.26 - Last mean moves per episode: 11.21\n",
"2969977 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.24 - Last mean moves per episode: 11.18\n",
"2979987 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 17.96 - Last mean moves per episode: 11.19\n",
"2989996 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.28 - Last mean moves per episode: 11.15\n",
"2999997 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.28 - Last mean moves per episode: 11.15\n",
"3009988 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.29 - Last mean moves per episode: 11.14\n",
"3019996 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.36 - Last mean moves per episode: 11.12\n",
"3029978 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 17.92 - Last mean moves per episode: 11.17\n",
"3039998 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 17.49 - Last mean moves per episode: 11.23\n",
"3049997 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 17.82 - Last mean moves per episode: 11.18\n",
"3059999 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 17.92 - Last mean moves per episode: 11.16\n",
"3069997 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.61 - Last mean moves per episode: 11.09\n",
"3079992 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 18.66 - Last mean moves per episode: 11.08\n",
"3089987 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 19.25 - Last mean moves per episode: 11.00\n",
"3099999 timesteps\n",
"Best mean reward: 19.29 - Last mean reward per episode: 19.62 - Last mean moves per episode: 10.93\n",
"Saving new best model\n",
"3109996 timesteps\n",
"Best mean reward: 19.62 - Last mean reward per episode: 20.10 - Last mean moves per episode: 10.86\n",
"Saving new best model\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"3119975 timesteps\n",
"Best mean reward: 20.10 - Last mean reward per episode: 20.25 - Last mean moves per episode: 10.85\n",
"Saving new best model\n",
"3129971 timesteps\n",
"Best mean reward: 20.25 - Last mean reward per episode: 20.36 - Last mean moves per episode: 10.83\n",
"Saving new best model\n",
"3139984 timesteps\n",
"Best mean reward: 20.36 - Last mean reward per episode: 20.90 - Last mean moves per episode: 10.78\n",
"Saving new best model\n",
"3149999 timesteps\n",
"Best mean reward: 20.90 - Last mean reward per episode: 21.19 - Last mean moves per episode: 10.77\n",
"Saving new best model\n",
"3159997 timesteps\n",
"Best mean reward: 21.19 - Last mean reward per episode: 21.32 - Last mean moves per episode: 10.76\n",
"Saving new best model\n",
"3169999 timesteps\n",
"Best mean reward: 21.32 - Last mean reward per episode: 21.84 - Last mean moves per episode: 10.70\n",
"Saving new best model\n",
"3179996 timesteps\n",
"Best mean reward: 21.84 - Last mean reward per episode: 21.98 - Last mean moves per episode: 10.70\n",
"Saving new best model\n",
"3189977 timesteps\n",
"Best mean reward: 21.98 - Last mean reward per episode: 22.36 - Last mean moves per episode: 10.68\n",
"Saving new best model\n",
"3199994 timesteps\n",
"Best mean reward: 22.36 - Last mean reward per episode: 22.71 - Last mean moves per episode: 10.65\n",
"Saving new best model\n",
"3209992 timesteps\n",
"Best mean reward: 22.71 - Last mean reward per episode: 22.71 - Last mean moves per episode: 10.68\n",
"3220000 timesteps\n",
"Best mean reward: 22.71 - Last mean reward per episode: 23.15 - Last mean moves per episode: 10.61\n",
"Saving new best model\n",
"3229990 timesteps\n",
"Best mean reward: 23.15 - Last mean reward per episode: 23.27 - Last mean moves per episode: 10.58\n",
"Saving new best model\n",
"3239997 timesteps\n",
"Best mean reward: 23.27 - Last mean reward per episode: 23.76 - Last mean moves per episode: 10.53\n",
"Saving new best model\n",
"3249998 timesteps\n",
"Best mean reward: 23.76 - Last mean reward per episode: 24.06 - Last mean moves per episode: 10.50\n",
"Saving new best model\n",
"3259993 timesteps\n",
"Best mean reward: 24.06 - Last mean reward per episode: 24.15 - Last mean moves per episode: 10.48\n",
"Saving new best model\n",
"3269988 timesteps\n",
"Best mean reward: 24.15 - Last mean reward per episode: 24.21 - Last mean moves per episode: 10.48\n",
"Saving new best model\n",
"3279995 timesteps\n",
"Best mean reward: 24.21 - Last mean reward per episode: 24.39 - Last mean moves per episode: 10.48\n",
"Saving new best model\n",
"3289993 timesteps\n",
"Best mean reward: 24.39 - Last mean reward per episode: 24.05 - Last mean moves per episode: 10.53\n",
"3299996 timesteps\n",
"Best mean reward: 24.39 - Last mean reward per episode: 24.61 - Last mean moves per episode: 10.47\n",
"Saving new best model\n",
"3310000 timesteps\n",
"Best mean reward: 24.61 - Last mean reward per episode: 24.47 - Last mean moves per episode: 10.51\n",
"3319992 timesteps\n",
"Best mean reward: 24.61 - Last mean reward per episode: 24.43 - Last mean moves per episode: 10.54\n",
"3329970 timesteps\n",
"Best mean reward: 24.61 - Last mean reward per episode: 24.23 - Last mean moves per episode: 10.57\n",
"3339997 timesteps\n",
"Best mean reward: 24.61 - Last mean reward per episode: 24.16 - Last mean moves per episode: 10.59\n",
"3349996 timesteps\n",
"Best mean reward: 24.61 - Last mean reward per episode: 24.40 - Last mean moves per episode: 10.56\n",
"3359987 timesteps\n",
"Best mean reward: 24.61 - Last mean reward per episode: 24.47 - Last mean moves per episode: 10.57\n",
"3369971 timesteps\n",
"Best mean reward: 24.61 - Last mean reward per episode: 24.73 - Last mean moves per episode: 10.55\n",
"Saving new best model\n",
"3379995 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 24.51 - Last mean moves per episode: 10.58\n",
"3389997 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 24.64 - Last mean moves per episode: 10.57\n",
"3399996 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 24.70 - Last mean moves per episode: 10.59\n",
"3410000 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 24.30 - Last mean moves per episode: 10.66\n",
"3419991 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 24.29 - Last mean moves per episode: 10.68\n",
"3429998 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 24.41 - Last mean moves per episode: 10.69\n",
"3439994 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 24.27 - Last mean moves per episode: 10.71\n",
"3449974 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 24.25 - Last mean moves per episode: 10.75\n",
"3459993 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 24.21 - Last mean moves per episode: 10.76\n",
"3470000 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 23.94 - Last mean moves per episode: 10.83\n",
"3479991 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 23.89 - Last mean moves per episode: 10.87\n",
"3489995 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 23.81 - Last mean moves per episode: 10.89\n",
"3499992 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 23.70 - Last mean moves per episode: 10.91\n",
"3509998 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 23.79 - Last mean moves per episode: 10.90\n",
"3519989 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 23.97 - Last mean moves per episode: 10.88\n",
"3529998 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 24.12 - Last mean moves per episode: 10.86\n",
"3539990 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 24.19 - Last mean moves per episode: 10.88\n",
"3549991 timesteps\n",
"Best mean reward: 24.73 - Last mean reward per episode: 24.86 - Last mean moves per episode: 10.81\n",
"Saving new best model\n",
"3559992 timesteps\n",
"Best mean reward: 24.86 - Last mean reward per episode: 25.08 - Last mean moves per episode: 10.82\n",
"Saving new best model\n",
"3569993 timesteps\n",
"Best mean reward: 25.08 - Last mean reward per episode: 24.92 - Last mean moves per episode: 10.85\n",
"3579983 timesteps\n",
"Best mean reward: 25.08 - Last mean reward per episode: 25.41 - Last mean moves per episode: 10.78\n",
"Saving new best model\n",
"3589990 timesteps\n",
"Best mean reward: 25.41 - Last mean reward per episode: 25.58 - Last mean moves per episode: 10.76\n",
"Saving new best model\n",
"3599996 timesteps\n",
"Best mean reward: 25.58 - Last mean reward per episode: 25.46 - Last mean moves per episode: 10.81\n",
"3609977 timesteps\n",
"Best mean reward: 25.58 - Last mean reward per episode: 25.47 - Last mean moves per episode: 10.79\n",
"3619987 timesteps\n",
"Best mean reward: 25.58 - Last mean reward per episode: 25.44 - Last mean moves per episode: 10.79\n",
"3629992 timesteps\n",
"Best mean reward: 25.58 - Last mean reward per episode: 25.53 - Last mean moves per episode: 10.77\n",
"3639997 timesteps\n",
"Best mean reward: 25.58 - Last mean reward per episode: 25.77 - Last mean moves per episode: 10.74\n",
"Saving new best model\n",
"3649998 timesteps\n",
"Best mean reward: 25.77 - Last mean reward per episode: 25.80 - Last mean moves per episode: 10.75\n",
"Saving new best model\n",
"3659971 timesteps\n",
"Best mean reward: 25.80 - Last mean reward per episode: 25.91 - Last mean moves per episode: 10.74\n",
"Saving new best model\n",
"3669987 timesteps\n",
"Best mean reward: 25.91 - Last mean reward per episode: 26.14 - Last mean moves per episode: 10.69\n",
"Saving new best model\n",
"3679988 timesteps\n",
"Best mean reward: 26.14 - Last mean reward per episode: 26.37 - Last mean moves per episode: 10.66\n",
"Saving new best model\n",
"3689977 timesteps\n",
"Best mean reward: 26.37 - Last mean reward per episode: 26.46 - Last mean moves per episode: 10.65\n",
"Saving new best model\n",
"3699988 timesteps\n",
"Best mean reward: 26.46 - Last mean reward per episode: 27.09 - Last mean moves per episode: 10.55\n",
"Saving new best model\n",
"3709983 timesteps\n",
"Best mean reward: 27.09 - Last mean reward per episode: 27.57 - Last mean moves per episode: 10.49\n",
"Saving new best model\n",
"3719998 timesteps\n",
"Best mean reward: 27.57 - Last mean reward per episode: 27.94 - Last mean moves per episode: 10.42\n",
"Saving new best model\n",
"3729997 timesteps\n",
"Best mean reward: 27.94 - Last mean reward per episode: 28.14 - Last mean moves per episode: 10.42\n",
"Saving new best model\n",
"3739996 timesteps\n",
"Best mean reward: 28.14 - Last mean reward per episode: 28.10 - Last mean moves per episode: 10.45\n",
"3749991 timesteps\n",
"Best mean reward: 28.14 - Last mean reward per episode: 28.38 - Last mean moves per episode: 10.41\n",
"Saving new best model\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"3759989 timesteps\n",
"Best mean reward: 28.38 - Last mean reward per episode: 28.57 - Last mean moves per episode: 10.36\n",
"Saving new best model\n",
"3769991 timesteps\n",
"Best mean reward: 28.57 - Last mean reward per episode: 28.73 - Last mean moves per episode: 10.33\n",
"Saving new best model\n",
"3779991 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.68 - Last mean moves per episode: 10.33\n",
"3789994 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.73 - Last mean moves per episode: 10.30\n",
"Saving new best model\n",
"3800000 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.51 - Last mean moves per episode: 10.31\n",
"3810000 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.55 - Last mean moves per episode: 10.33\n",
"3819994 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.64 - Last mean moves per episode: 10.32\n",
"3829996 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.57 - Last mean moves per episode: 10.33\n",
"3839996 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.24 - Last mean moves per episode: 10.37\n",
"3850000 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.21 - Last mean moves per episode: 10.38\n",
"3859999 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.06 - Last mean moves per episode: 10.41\n",
"3869978 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.39 - Last mean moves per episode: 10.38\n",
"3880000 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 27.91 - Last mean moves per episode: 10.45\n",
"3889971 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 27.86 - Last mean moves per episode: 10.46\n",
"3899989 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.35 - Last mean moves per episode: 10.41\n",
"3909978 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.34 - Last mean moves per episode: 10.44\n",
"3919995 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.46 - Last mean moves per episode: 10.41\n",
"3929991 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.38 - Last mean moves per episode: 10.46\n",
"3939998 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.72 - Last mean moves per episode: 10.42\n",
"3949999 timesteps\n",
"Best mean reward: 28.73 - Last mean reward per episode: 28.85 - Last mean moves per episode: 10.39\n",
"Saving new best model\n",
"3959996 timesteps\n",
"Best mean reward: 28.85 - Last mean reward per episode: 29.03 - Last mean moves per episode: 10.36\n",
"Saving new best model\n",
"3969994 timesteps\n",
"Best mean reward: 29.03 - Last mean reward per episode: 28.97 - Last mean moves per episode: 10.39\n",
"3980000 timesteps\n",
"Best mean reward: 29.03 - Last mean reward per episode: 29.20 - Last mean moves per episode: 10.36\n",
"Saving new best model\n",
"3989994 timesteps\n",
"Best mean reward: 29.20 - Last mean reward per episode: 29.81 - Last mean moves per episode: 10.32\n",
"Saving new best model\n",
"3999984 timesteps\n",
"Best mean reward: 29.81 - Last mean reward per episode: 29.79 - Last mean moves per episode: 10.31\n",
"4009998 timesteps\n",
"Best mean reward: 29.81 - Last mean reward per episode: 29.85 - Last mean moves per episode: 10.27\n",
"Saving new best model\n",
"4020000 timesteps\n",
"Best mean reward: 29.85 - Last mean reward per episode: 29.58 - Last mean moves per episode: 10.30\n",
"4029993 timesteps\n",
"Best mean reward: 29.85 - Last mean reward per episode: 29.55 - Last mean moves per episode: 10.28\n",
"4039996 timesteps\n",
"Best mean reward: 29.85 - Last mean reward per episode: 29.67 - Last mean moves per episode: 10.23\n",
"4049995 timesteps\n",
"Best mean reward: 29.85 - Last mean reward per episode: 29.61 - Last mean moves per episode: 10.23\n",
"4059997 timesteps\n",
"Best mean reward: 29.85 - Last mean reward per episode: 29.64 - Last mean moves per episode: 10.24\n",
"4069992 timesteps\n",
"Best mean reward: 29.85 - Last mean reward per episode: 29.69 - Last mean moves per episode: 10.22\n",
"4079999 timesteps\n",
"Best mean reward: 29.85 - Last mean reward per episode: 29.62 - Last mean moves per episode: 10.23\n",
"4089992 timesteps\n",
"Best mean reward: 29.85 - Last mean reward per episode: 29.67 - Last mean moves per episode: 10.22\n",
"4099995 timesteps\n",
"Best mean reward: 29.85 - Last mean reward per episode: 29.61 - Last mean moves per episode: 10.22\n",
"4109996 timesteps\n",
"Best mean reward: 29.85 - Last mean reward per episode: 29.80 - Last mean moves per episode: 10.22\n",
"4119992 timesteps\n",
"Best mean reward: 29.85 - Last mean reward per episode: 30.13 - Last mean moves per episode: 10.17\n",
"Saving new best model\n",
"4129999 timesteps\n",
"Best mean reward: 30.13 - Last mean reward per episode: 30.36 - Last mean moves per episode: 10.12\n",
"Saving new best model\n",
"4140000 timesteps\n",
"Best mean reward: 30.36 - Last mean reward per episode: 30.47 - Last mean moves per episode: 10.14\n",
"Saving new best model\n",
"4150000 timesteps\n",
"Best mean reward: 30.47 - Last mean reward per episode: 30.65 - Last mean moves per episode: 10.14\n",
"Saving new best model\n",
"4159995 timesteps\n",
"Best mean reward: 30.65 - Last mean reward per episode: 30.93 - Last mean moves per episode: 10.11\n",
"Saving new best model\n",
"4169998 timesteps\n",
"Best mean reward: 30.93 - Last mean reward per episode: 30.97 - Last mean moves per episode: 10.13\n",
"Saving new best model\n",
"4179996 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.80 - Last mean moves per episode: 10.13\n",
"4190000 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.75 - Last mean moves per episode: 10.14\n",
"4200000 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.76 - Last mean moves per episode: 10.15\n",
"4209999 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.83 - Last mean moves per episode: 10.13\n",
"4219995 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.83 - Last mean moves per episode: 10.15\n",
"4229996 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.50 - Last mean moves per episode: 10.23\n",
"4239990 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.47 - Last mean moves per episode: 10.21\n",
"4249998 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.34 - Last mean moves per episode: 10.23\n",
"4259997 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.31 - Last mean moves per episode: 10.22\n",
"4269995 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.39 - Last mean moves per episode: 10.20\n",
"4279995 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.55 - Last mean moves per episode: 10.19\n",
"4290000 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.53 - Last mean moves per episode: 10.20\n",
"4299997 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.45 - Last mean moves per episode: 10.21\n",
"4309998 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.38 - Last mean moves per episode: 10.22\n",
"4320000 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.31 - Last mean moves per episode: 10.23\n",
"4329990 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.05 - Last mean moves per episode: 10.26\n",
"4339999 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 29.25 - Last mean moves per episode: 10.34\n",
"4349997 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 29.33 - Last mean moves per episode: 10.32\n",
"4359999 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 28.74 - Last mean moves per episode: 10.39\n",
"4369997 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 27.68 - Last mean moves per episode: 10.50\n",
"4379997 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 26.90 - Last mean moves per episode: 10.58\n",
"4389990 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 26.87 - Last mean moves per episode: 10.56\n",
"4399999 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 26.96 - Last mean moves per episode: 10.59\n",
"4409999 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 26.97 - Last mean moves per episode: 10.60\n",
"4420000 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 27.06 - Last mean moves per episode: 10.57\n",
"4429985 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 27.04 - Last mean moves per episode: 10.57\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"4440000 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 28.00 - Last mean moves per episode: 10.45\n",
"4449977 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 28.35 - Last mean moves per episode: 10.43\n",
"4459996 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 28.68 - Last mean moves per episode: 10.37\n",
"4469996 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 29.62 - Last mean moves per episode: 10.29\n",
"4479988 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.64 - Last mean moves per episode: 10.17\n",
"4490000 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 30.90 - Last mean moves per episode: 10.17\n",
"4499999 timesteps\n",
"Best mean reward: 30.97 - Last mean reward per episode: 31.12 - Last mean moves per episode: 10.11\n",
"Saving new best model\n",
"4509992 timesteps\n",
"Best mean reward: 31.12 - Last mean reward per episode: 31.14 - Last mean moves per episode: 10.09\n",
"Saving new best model\n",
"4519999 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 30.97 - Last mean moves per episode: 10.13\n",
"4529997 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 31.06 - Last mean moves per episode: 10.11\n",
"4540000 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 30.81 - Last mean moves per episode: 10.17\n",
"4549996 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 30.75 - Last mean moves per episode: 10.19\n",
"4559994 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 31.05 - Last mean moves per episode: 10.15\n",
"4570000 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 30.41 - Last mean moves per episode: 10.23\n",
"4579993 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 29.68 - Last mean moves per episode: 10.33\n",
"4589998 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 29.35 - Last mean moves per episode: 10.37\n",
"4600000 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 29.21 - Last mean moves per episode: 10.35\n",
"4609989 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 28.30 - Last mean moves per episode: 10.48\n",
"4619992 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 28.46 - Last mean moves per episode: 10.46\n",
"4629994 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 28.20 - Last mean moves per episode: 10.50\n",
"4639994 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 28.12 - Last mean moves per episode: 10.52\n",
"4649998 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 27.95 - Last mean moves per episode: 10.54\n",
"4659978 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 28.02 - Last mean moves per episode: 10.54\n",
"4669991 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 27.99 - Last mean moves per episode: 10.55\n",
"4679994 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 29.12 - Last mean moves per episode: 10.41\n",
"4689992 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 29.50 - Last mean moves per episode: 10.34\n",
"4699997 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 29.73 - Last mean moves per episode: 10.34\n",
"4709989 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 30.91 - Last mean moves per episode: 10.16\n",
"4719999 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 30.93 - Last mean moves per episode: 10.19\n",
"4730000 timesteps\n",
"Best mean reward: 31.14 - Last mean reward per episode: 31.48 - Last mean moves per episode: 10.12\n",
"Saving new best model\n",
"4739997 timesteps\n",
"Best mean reward: 31.48 - Last mean reward per episode: 31.72 - Last mean moves per episode: 10.09\n",
"Saving new best model\n",
"4749985 timesteps\n",
"Best mean reward: 31.72 - Last mean reward per episode: 31.90 - Last mean moves per episode: 10.09\n",
"Saving new best model\n",
"4760000 timesteps\n",
"Best mean reward: 31.90 - Last mean reward per episode: 32.03 - Last mean moves per episode: 10.06\n",
"Saving new best model\n",
"4770000 timesteps\n",
"Best mean reward: 32.03 - Last mean reward per episode: 32.21 - Last mean moves per episode: 10.07\n",
"Saving new best model\n",
"4779984 timesteps\n",
"Best mean reward: 32.21 - Last mean reward per episode: 32.49 - Last mean moves per episode: 10.03\n",
"Saving new best model\n",
"4789994 timesteps\n",
"Best mean reward: 32.49 - Last mean reward per episode: 32.55 - Last mean moves per episode: 10.04\n",
"Saving new best model\n",
"4799989 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 32.55 - Last mean moves per episode: 10.05\n",
"4809991 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 32.41 - Last mean moves per episode: 10.11\n",
"4819996 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 32.37 - Last mean moves per episode: 10.11\n",
"4829995 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 32.11 - Last mean moves per episode: 10.14\n",
"4839997 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 31.91 - Last mean moves per episode: 10.17\n",
"4849999 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 31.97 - Last mean moves per episode: 10.14\n",
"4859999 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 31.86 - Last mean moves per episode: 10.15\n",
"4869987 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 31.81 - Last mean moves per episode: 10.13\n",
"4879995 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 31.67 - Last mean moves per episode: 10.13\n",
"4889975 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 31.58 - Last mean moves per episode: 10.14\n",
"4899995 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 31.52 - Last mean moves per episode: 10.13\n",
"4909994 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 31.54 - Last mean moves per episode: 10.11\n",
"4919990 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 31.71 - Last mean moves per episode: 10.07\n",
"4929993 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 31.98 - Last mean moves per episode: 10.04\n",
"4939995 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 32.25 - Last mean moves per episode: 9.99\n",
"4949986 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 31.44 - Last mean moves per episode: 10.06\n",
"4959993 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 30.47 - Last mean moves per episode: 10.18\n",
"4969993 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 29.53 - Last mean moves per episode: 10.32\n",
"4979998 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 28.58 - Last mean moves per episode: 10.43\n",
"4989997 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 27.57 - Last mean moves per episode: 10.60\n",
"4999991 timesteps\n",
"Best mean reward: 32.55 - Last mean reward per episode: 26.83 - Last mean moves per episode: 10.69\n"
]
}
],
"source": [
"clear_session()\n",
"\n",
"# ships -- keep only one kind (cruiser) for this small grid (see grid_size below)\n",
"ships = {}\n",
"ships['cruiser'] = 3\n",
"\n",
"grid_size = 6\n",
"num_timesteps = 5000000 # this is number of moves and not number of episodes\n",
"\n",
"best_mean_reward, n_steps, step_interval, episode_interval = -np.inf, 0, 10000, 10000\n",
"\n",
"# Instantiate the env\n",
"env = BattleshipEnv(enemy_board=None, ship_locs={}, grid_size=grid_size, ships=ships)\n",
"\n",
"\n",
"# wrap it\n",
"log_dir = \"./gym/\"\n",
"os.makedirs(log_dir, exist_ok=True)\n",
"env = Monitor(env, filename=log_dir, allow_early_resets=True)\n",
"env = DummyVecEnv([lambda: env])\n",
"\n",
"model = A2C('MlpPolicy', env, verbose=0, #learning_rate=0.00007,\n",
" ).learn(total_timesteps=num_timesteps, callback=callback)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# save manually \n",
"#model.save(log_dir + 'best_model_cruiser_10x10.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plot_results(log_dir,1000)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# keep learning using a previously trained model\n",
"#model = A2C.load('./gym/best_model_cruiser_7x7.pkl')\n",
"\n",
"# ships -- keep only one kind for 5x5 grid\n",
"#ships = {}\n",
"#ships['cruiser'] = 3\n",
"\n",
"#grid_size = 7\n",
"#num_timesteps = 10000000 # this is number of moves and not number of episodes\n",
"\n",
"#best_mean_reward, n_steps, step_interval, episode_interval = -np.inf, 0, 10000, 1000\n",
"\n",
"# Instantiate the env\n",
"#env = BattleshipEnv(enemy_board=None, ship_locs={}, grid_size=grid_size, ships=ships)\n",
"\n",
"# wrap it\n",
"#log_dir = \"./gym/\"\n",
"#os.makedirs(log_dir, exist_ok=True)\n",
"#env = Monitor(env, filename=log_dir, allow_early_resets=True)\n",
"#env = DummyVecEnv([lambda: env])\n",
"\n",
"#model.set_env(env)\n",
"\n",
"#model.learn(total_timesteps=num_timesteps, callback=callback)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Visualizing How the Agent Plays"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#model_best = A2C.load('./gym/best_model_cruiser_5x5.pkl')\n",
"#model_best = A2C.load('./gym/best_model_cruiser_6x6.pkl')\n",
"model_best = A2C.load('./gym/best_model_cruiser_7x7.pkl')\n",
"#model_best = A2C.load('./gym/best_model_cruiser_10x10.pkl')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# brew install ffmpeg\n",
"# brew install gifsicle\n",
"# Shift + Command + 5 for recording. This saves .mov file\n",
"# right-click on mov file, get info for video size to use here below\n",
"# ffmpeg -i in.mov -s 448x790 -pix_fmt rgb24 -r 10 -f gif - | gifsicle --optimize=3 --delay=3 > out.gif\n",
"\n",
"from IPython.display import clear_output\n",
"import time\n",
"\n",
"ships = {}\n",
"ships['cruiser'] = 3\n",
"\n",
"grid_size=7\n",
"enemy_board = 0*np.ones((grid_size, grid_size), dtype='int')\n",
"#enemy_board[3,5] = 1\n",
"#enemy_board[4,5] = 1\n",
"#enemy_board[5,5] = 1\n",
"env = BattleshipEnv(enemy_board=None, ship_locs={}, grid_size=grid_size, ships=ships)\n",
"# give me time to setup recording\n",
"time.sleep(5)\n",
"# Play 10 episodes with the loaded model, redrawing both boards after every move\n",
"for ep in range(10):\n",
"    obs = env.reset()\n",
"    ## 2 empty boards\n",
"    done = False\n",
"    nmoves = 0\n",
"    print('episode no.', ep, '# moves:', nmoves)\n",
"    env.render()\n",
"    env.render()\n",
"    time.sleep(5)\n",
"    clear_output(wait=True)\n",
"    while not done:\n",
"        # predict() returns (action, state); the second value is the recurrent state,\n",
"        # not an observation, so it must not be assigned to obs\n",
"        action, _states = model_best.predict(obs, deterministic=True)\n",
"        obs, _, done, _ = env.step(action)\n",
"        nmoves += 1\n",
"        print('episode no.', ep, '# moves:', nmoves)\n",
"        env.render()\n",
"        board_rendering(grid_size, env.enemy_board)\n",
"        # random pause so the recording is easy to follow\n",
"        time.sleep(np.random.uniform(1,3))\n",
"        clear_output(wait=True)\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Optimizing The Algorithm Parameters with Hyperopt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## To optimize a RL model, see https://github.com/araffin/rl-baselines-zoo/tree/master/hyperparams or\n",
"## in general https://github.com/araffin/rl-baselines-zoo. This package uses optuna optimization\n",
"## but it works for the trained agents there. You can modify this package to include your case\n",
"## or just use the yml file to see what parameters to tune\n",
"\n",
"from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval\n",
"from stable_baselines.common.vec_env import DummyVecEnv\n",
"from stable_baselines import DQN, PPO2, A2C, ACKTR\n",
"from stable_baselines.bench import Monitor\n",
"\n",
"# Agent hyperparameter optimization\n",
"def objective(space):\n",
"    \"\"\"Train an A2C agent with one hyperparameter sample and score it for hyperopt.\n",
"\n",
"    :param space: (dict) hyperparameter sample; the keys read here are 'env_copies',\n",
"        'num_timesteps', 'gamma', 'n_steps', 'vf_coef', 'ent_coef', 'max_grad_norm',\n",
"        'learning_rate', 'alpha', 'epsilon' and 'lr_schedule'\n",
"    :return: (dict) {'loss': mean number of moves per evaluation episode,\n",
"        'status': STATUS_OK}\n",
"    \"\"\"\n",
"\n",
"    env_copies = space['env_copies']\n",
"    num_timesteps = space['num_timesteps']\n",
"    gamma = space['gamma']\n",
"    n_steps = space['n_steps']\n",
"    vf_coef = space['vf_coef']\n",
"    ent_coef = space['ent_coef']\n",
"    max_grad_norm = space['max_grad_norm']\n",
"    learning_rate = space['learning_rate']\n",
"    alpha = space['alpha']\n",
"    epsilon = space['epsilon']\n",
"    lr_schedule = space['lr_schedule']\n",
"\n",
"    print('space:', space)\n",
"\n",
"    # ships\n",
"    ships = {}\n",
"    ships['cruiser'] = 3\n",
"\n",
"    grid_size = 7\n",
"\n",
"    # Every vectorized slot needs its own environment instance. The previous\n",
"    # `[lambda: env]*env_copies` gave the same BattleshipEnv object to every slot,\n",
"    # so all copies stepped and reset one shared board.\n",
"    def make_env():\n",
"        return BattleshipEnv(enemy_board=None, ship_locs={}, grid_size=grid_size, ships=dict(ships))\n",
"\n",
"    env = DummyVecEnv([make_env for _ in range(env_copies)])\n",
"\n",
"    model = A2C('MlpPolicy', env, verbose=0,\n",
"                gamma=gamma,\n",
"                n_steps=n_steps,\n",
"                ent_coef=ent_coef,\n",
"                learning_rate=learning_rate,\n",
"                vf_coef=vf_coef,\n",
"                max_grad_norm=max_grad_norm,\n",
"                alpha=alpha,\n",
"                epsilon=epsilon,\n",
"                lr_schedule=lr_schedule\n",
"                ).learn(total_timesteps=num_timesteps)\n",
"\n",
"    # Evaluate the trained agent: n_episodes rounds, one episode per environment copy\n",
"    rewards_mean = []\n",
"    moves_mean = []\n",
"    n_episodes = 100\n",
"    for ep in range(n_episodes):\n",
"        reward_env = []\n",
"        moves_env = []\n",
"        for env_i in env.envs:\n",
"            obs = env_i.reset()\n",
"            done = False\n",
"            rewards_sum = 0\n",
"            moves = 0\n",
"            while not done:\n",
"                # second return value is the recurrent state, not an observation\n",
"                action, _states = model.predict(obs, deterministic=True)\n",
"                obs, reward, done, _ = env_i.step(action)\n",
"                rewards_sum += reward # total reward for this episode\n",
"                moves += 1\n",
"            reward_env.append(rewards_sum)\n",
"            moves_env.append(moves)\n",
"        rewards_mean.append(np.mean(reward_env)) # avg reward across environment copies (was np.min)\n",
"        moves_mean.append(np.mean(moves_env)) # avg number of moves across environment copies\n",
"    rewards_mean = np.mean(rewards_mean)\n",
"    moves_mean = np.mean(moves_mean)\n",
"\n",
"    print('reward', rewards_mean, 'moves', moves_mean)\n",
"\n",
"    # hyperopt will minimize objective, number of moves in this case\n",
"    return {'loss': moves_mean, 'status': STATUS_OK}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Search space for objective(); each hyperopt label matches its dict key so that\n",
"# `best` and `trials` report every parameter under the name A2C receives\n",
"space = {\n",
"    'env_copies': hp.choice('env_copies', [10]),\n",
"    'num_timesteps': hp.choice('num_timesteps', [1000000]), #np.arange(1000000, 1000001, 1000000, dtype=int)\n",
"    'gamma': hp.choice('gamma', [0.99, 0.95, 0.9]),\n",
"    'n_steps': hp.choice('n_steps', [5, 1, 10]),\n",
"    'vf_coef': hp.choice('vf_coef', [0.25, 0.1, 0.5]),\n",
"    'ent_coef': hp.choice('ent_coef', [0.01, 0.1]),\n",
"    'learning_rate': hp.choice('learning_rate', [0.0007]),\n",
"    'max_grad_norm': hp.choice('max_grad_norm', [0.5, 0.2, 0.7]),\n",
"    'alpha': hp.choice('alpha', [0.99, 0.95, 0.9]), # label was 'lam', which did not match the key\n",
"    'epsilon': hp.choice('epsilon', [1e-5, 1e-3, 1e-4]),\n",
"    'lr_schedule': hp.choice('lr_schedule', ['constant', 'linear'])\n",
"}\n",
"\n",
"\n",
"trials = Trials()\n",
"best = fmin(fn=objective,\n",
" space=space,\n",
" algo=tpe.suggest,\n",
" max_evals=30, \n",
" trials=trials, verbose=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#%debug"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"param_dist = space_eval(space, best)\n",
"param_dist"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Links\n",
"\n",
"https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/master/5_custom_gym_env.ipynb#scrollTo=rYzDXA9vJfz1\n",
"\n",
"https://stable-baselines.readthedocs.io/en/master/guide/examples.html\n",
"\n",
"https://gym.openai.com/envs/#classic_control"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Reward scheme\n",
"\n",
"For any action: \n",
"$$r=-1,$$ \n",
"but if an action is illegal (moving to a non-empty cell), a random action is drawn from the action space. \n",
"\n",
"This action is penalized by assigning:\n",
"\n",
"$$r=-2*S.$$\n",
"\n",
"where $S$ is the grid side length.\n",
"\n",
"If an action results in a hit:\n",
"$$\n",
"r = S.\n",
"$$\n",
"If all ship cells are hit (the game is completed):\n",
"$$R = S*S.$$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Skeleton Battleship Environmnt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): this skeleton reuses the name BattleshipEnv, so running this cell rebinds\n",
"# the name the cells above rely on -- presumably a full implementation is defined\n",
"# earlier in the notebook; confirm before executing this cell.\n",
"class BattleshipEnv(gym.Env):\n",
"    \"\"\"Skeleton of a custom Battleship environment that follows the gym interface.\n",
"\n",
"    See https://github.com/openai/gym/blob/master/gym/core.py for the base class.\n",
"    step(), reset() and render() are placeholders to be filled in.\n",
"    \"\"\"\n",
"\n",
"    metadata = {'render.modes': ['human']}\n",
"\n",
"    def __init__(self, enemy_board, ship_locs, grid_size, ships):\n",
"        \"\"\"\n",
"        Builds the action and observation spaces from the grid side length\n",
"        :param enemy_board: not used by this skeleton (callers in this notebook pass None)\n",
"        :param ship_locs: not used by this skeleton (callers in this notebook pass {})\n",
"        :param grid_size: (int) side length of the square board\n",
"        :param ships: not used by this skeleton; presumably maps ship name to ship size\n",
"        \"\"\"\n",
"\n",
"        super(BattleshipEnv, self).__init__()\n",
"\n",
"        # must be assigned before the spaces below read self.grid_size\n",
"        self.grid_size = grid_size\n",
"\n",
"        # Define action and observation space\n",
"        # They must be gym.spaces objects\n",
"        # In our case the action space is discrete: index of action\n",
"        self.action_space = spaces.Discrete(self.grid_size * self.grid_size)\n",
"        # The observation will be the state or configuration of the board\n",
"        # (dtype=int replaces np.int, an alias of the builtin removed from recent NumPy)\n",
"        self.observation_space = spaces.Box(low=-1, high=1, shape=(self.grid_size, self.grid_size),\n",
"                                            dtype=int)\n",
"\n",
"    # an action will be an index of action_space either from epsilon-greedy\n",
"    # or from model prediction\n",
"    def step(self, action):\n",
"        \"\"\"\n",
"        Rewards for action and sets next state\n",
"        Also, checks if game is completed (done)\n",
"        :param action: (int) index into action_space\n",
"        :return: next_state, reward, done, info\n",
"        \"\"\"\n",
"\n",
"        pass\n",
"\n",
"    def reset(self):\n",
"        \"\"\"\n",
"        Resets the state of the environment to an initial state\n",
"        :return: (np.array) state\n",
"        \"\"\"\n",
"\n",
"        pass\n",
"\n",
"    def render(self, mode='human'):\n",
"        \"\"\"\n",
"        Human readable state. In this case the scoring board\n",
"        :param mode: (str) render mode; only 'human' is declared in metadata\n",
"        \"\"\"\n",
"\n",
"        pass"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment