{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"import gym\n",
"import ppaquette_gym_doom\n",
"from ppaquette_gym_doom.wrappers import SetResolution, ToDiscrete\n",
"from gym.wrappers import SkipWrapper\n",
"from gym import wrappers\n",
"\n",
"# (see https://github.com/ppaquette/gym-doom/blob/master/ppaquette_gym_doom/doom_basic.py)\n",
"def create_env(seed=None):\n",
" env_spec = gym.spec('ppaquette/DoomBasic-v0')\n",
" env_spec.id = 'DoomBasic-v0'\n",
" env = env_spec.make()\n",
"\n",
" if seed is not None:\n",
" env.seed(seed)\n",
"\n",
" return SetResolution('200x150')(\n",
" SkipWrapper(repeat_count=4)(\n",
" ToDiscrete('minimal')(env)))\n",
"\n",
"env = create_env()\n",
"WIDTH, HEIGHT = env.screen_width, env.screen_height\n",
"\n",
"NOOP, SHOOT, RIGHT, LEFT = 0, 1, 2, 3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Collecting experiences"
]
},
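{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each environment step is recorded as a `(state, action, reward, end)` tuple; consecutive tuples are then paired into one-step transitions $(s, a, r, s', \\text{end})$. As a simplistic form of prioritization, all positive-reward transitions are kept and only a random sample of `only_n_misses` of the remaining ones is used for each fit."
]
},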
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from collections import namedtuple\n",
"import operator\n",
"\n",
"SARE = namedtuple('SAR', ['state', 'action', 'reward', 'end'])\n",
"\n",
"\n",
"def generate_sares(env, agent, episode_count=100):\n",
" reward = 0\n",
" done = False\n",
"\n",
" for i in range(episode_count):\n",
" observation = env.reset()\n",
" while True:\n",
" action = agent.act(observation, reward, done)\n",
" new_observation, reward, done, _ = env.step(action)\n",
" yield SARE(observation, action, reward, done)\n",
" \n",
" if done:\n",
" break\n",
" else:\n",
" observation = new_observation\n",
"\n",
"def episode_sares(env, agent, episode_count=100):\n",
" sares = list(generate_sares(env, agent, episode_count))\n",
" print('average reward per episode = {}'.format(\n",
" sum(r for _, _, r, _ in sares) / float(sum(e for _, _, _, e in sares))))\n",
" return sares\n",
"\n",
" \n",
"def to_experiences(sares, only_n_misses=100):\n",
" experiences = [\n",
" (previous_s, a, r, next_s, end)\n",
" for (previous_s, a, r, end), (next_s, _, _, _) in zip(sares[:-1], sares[1:])\n",
" ]\n",
"\n",
" # simplistic experience prioritization\n",
" shuffled_exps = experiences if only_n_misses is None\\\n",
" else random.choices(experiences, k=only_n_misses) + [e for e in experiences if e[2] > 0]\n",
" random.shuffle(shuffled_exps)\n",
"\n",
" prev_frames, actions, rewards, next_frames, is_ends = zip(*shuffled_exps)\n",
" prev_frames = np.asarray(prev_frames)\n",
" next_frames = np.asarray(next_frames)\n",
" actions = np.asarray(actions)\n",
" rewards = np.asarray(rewards)\n",
" is_ends = np.asarray(is_ends)\n",
" \n",
" print('Training on {}/{} positive/total out of {} 1-step experiences with actions distribution {}'.format(\n",
" np.sum(rewards>=0),\n",
" len(rewards),\n",
" len(experiences),\n",
" np.bincount(actions)))\n",
" \n",
" return (prev_frames, next_frames, actions, rewards, is_ends)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deep Q-learning"
]
},
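{
"cell_type": "markdown",
"metadata": {},
"source": [
"The targets built in this section follow the standard one-step Q-learning update, with a separate target network $Q_{\\text{target}}$ kept frozen between periodic copies of the acting network. For a transition $(s, a, r, s', \\text{end})$, with the reward clipped from above at `reward_clip`:\n",
"\n",
"$$y = \\text{clip}(r) + \\gamma \\, (1 - \\text{end}) \\, \\max_{a'} Q_{\\text{target}}(s', a')$$\n",
"\n",
"Only the component of the predicted action-value vector corresponding to the action $a$ actually taken is replaced by $y$; the other components keep the network's own predictions, so they contribute no error to the mean-squared loss."
]
},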
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"import tensorflow as tf\n",
"from keras import backend as K\n",
"\n",
"from keras.layers import Dense, Convolution2D, Flatten, Activation\n",
"from keras.models import Sequential\n",
"from keras.optimizers import Adam\n",
"\n",
"sess = tf.InteractiveSession()\n",
"K.set_session(sess)\n",
"\n",
"def create_q_model(conv1_weights=None, conv2_weights=None, dense1_weights=None, dense2_weights=None):\n",
" model = Sequential()\n",
"\n",
" model.add(Convolution2D(\n",
" 2, nb_row=6, nb_col=6, border_mode='valid', weights=conv1_weights,\n",
" input_shape=[HEIGHT, WIDTH, 3], dim_ordering='tf'))\n",
" model.add(Activation('relu'))\n",
" model.add(Convolution2D(4, nb_row=2, nb_col=2, weights=conv2_weights))\n",
" model.add(Activation('relu'))\n",
" model.add(Flatten())\n",
" model.add(Dense(64, init='normal', weights=dense1_weights))\n",
" model.add(Activation('relu'))\n",
" model.add(Dense(4, init='normal', weights=dense2_weights))\n",
" model.compile(loss='mse', optimizer=Adam())\n",
" \n",
" return model\n",
"\n",
"acting_model = create_q_model()\n",
"target_model = create_q_model()\n",
"\n",
"def copy_model(model):\n",
" conv1_weights = [w.eval() for w in model.layers[0].weights]\n",
" conv2_weights = [w.eval() for w in model.layers[2].weights]\n",
" dense1_weights = [w.eval() for w in model.layers[5].weights]\n",
" dense2_weights = [w.eval() for w in model.layers[7].weights]\n",
" return create_q_model(conv1_weights, conv2_weights, dense1_weights, dense2_weights)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def sares_to_input_targets(model, sares, gamma = .99, reward_clip=5, only_n_misses=100):\n",
" \n",
" prev_frames, next_frames, actions, rewards, is_ends = to_experiences(sares, only_n_misses)\n",
" \n",
" n_samples = len(actions)\n",
" clipped_rewards = np.clip(rewards, -np.inf, reward_clip)\n",
" \n",
" # Transcription of the Q-learning target formula\n",
" targets = clipped_rewards + gamma * (1 - is_ends) * model.predict(next_frames).max(axis=1)\n",
"\n",
" target_action_rewards = model.predict(prev_frames)\n",
" target_action_rewards[np.arange(n_samples), actions] = targets\n",
"\n",
" return prev_frames, target_action_rewards"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training"
]
},
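{
"cell_type": "markdown",
"metadata": {},
"source": [
"The behaviour policy below is $\\epsilon$-greedy with respect to the acting network:\n",
"\n",
"$$a_t = \\begin{cases} \\text{a uniformly random action} & \\text{with probability } \\epsilon \\\\ \\arg\\max_a Q(s_t, a) & \\text{otherwise} \\end{cases}$$\n",
"\n",
"Every `UPDATE_TARGET_EVERY_N_BATCHES` batches of `N_BATCHED_EPISODES` episodes, the target network is replaced by a copy of the acting network, which keeps the regression targets stable while the acting network is being fitted."
]
},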
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2017-03-04 11:44:01,389] DEPRECATION WARNING: env.spec.timestep_limit has been deprecated. Replace your call to `env.spec.timestep_limit` with `env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')`. This change was made 12/28/2016 and is included in version 0.7.0\n",
"[2017-03-04 11:44:01,390] Clearing 27 monitor files from previous run (because force=True was provided)\n",
"[2017-03-04 11:44:01,764] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000000.mp4\n",
"[2017-03-04 11:44:02,552] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000001.mp4\n",
"[2017-03-04 11:44:05,313] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000008.mp4\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"average reward per episode = -295.0\n",
"Training on 4/104 positive/total out of 499 1-step experiences with actions distribution [ 2 91 5 6]\n",
"average reward per episode = -173.8\n",
"Training on 6/106 positive/total out of 349 1-step experiences with actions distribution [ 4 96 5 1]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2017-03-04 11:44:17,532] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000027.mp4\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"average reward per episode = -301.7\n",
"Training on 3/103 positive/total out of 507 1-step experiences with actions distribution [ 4 95 1 3]\n",
"average reward per episode = -186.5\n",
"Training on 5/105 positive/total out of 369 1-step experiences with actions distribution [ 2 99 2 2]\n",
"average reward per episode = -344.1\n",
"Training on 3/103 positive/total out of 572 1-step experiences with actions distribution [ 3 95 2 3]\n",
"average reward per episode = -283.2\n",
"Training on 4/104 positive/total out of 480 1-step experiences with actions distribution [ 4 96 2 2]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2017-03-04 11:44:42,014] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000064.mp4\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"average reward per episode = -212.8\n",
"Training on 5/104 positive/total out of 393 1-step experiences with actions distribution [ 2 97 3 2]\n",
"average reward per episode = -238.4\n",
"Training on 6/105 positive/total out of 432 1-step experiences with actions distribution [ 4 96 1 4]\n",
"average reward per episode = -282.7\n",
"Training on 4/104 positive/total out of 478 1-step experiences with actions distribution [ 1 99 2 2]\n",
"average reward per episode = -243.1\n",
"Training on 5/105 positive/total out of 437 1-step experiences with actions distribution [ 0 100 2 3]\n",
"average reward per episode = -326.9\n",
"Training on 4/103 positive/total out of 546 1-step experiences with actions distribution [ 1 98 2 2]\n",
"average reward per episode = -173.7\n",
"Training on 7/105 positive/total out of 349 1-step experiences with actions distribution [ 4 98 2 1]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2017-03-04 11:45:20,178] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000125.mp4\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"average reward per episode = -328.8\n",
"Training on 4/104 positive/total out of 547 1-step experiences with actions distribution [ 2 96 3 3]\n",
"average reward per episode = -319.3\n",
"Training on 4/103 positive/total out of 533 1-step experiences with actions distribution [ 1 97 2 3]\n",
"average reward per episode = -284.6\n",
"Training on 4/104 positive/total out of 483 1-step experiences with actions distribution [ 1 99 0 4]\n",
"average reward per episode = -250.8\n",
"Training on 5/104 positive/total out of 432 1-step experiences with actions distribution [ 2 99 2 1]\n",
"average reward per episode = -362.6\n",
"Training on 2/102 positive/total out of 567 1-step experiences with actions distribution [ 2 96 4]\n",
"average reward per episode = -117.1\n",
"Training on 7/107 positive/total out of 282 1-step experiences with actions distribution [ 0 98 2 7]\n",
"average reward per episode = -343.6\n",
"Training on 2/102 positive/total out of 555 1-step experiences with actions distribution [ 3 96 2 1]\n",
"average reward per episode = -226.5\n",
"Training on 5/105 positive/total out of 412 1-step experiences with actions distribution [ 2 98 2 3]\n",
"average reward per episode = -333.1\n",
"Training on 3/103 positive/total out of 540 1-step experiences with actions distribution [ 2 92 5 4]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2017-03-04 11:46:19,153] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000216.mp4\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"average reward per episode = -250.3\n",
"Training on 6/105 positive/total out of 447 1-step experiences with actions distribution [ 2 93 8 2]\n",
"average reward per episode = -281.4\n",
"Training on 3/103 positive/total out of 480 1-step experiences with actions distribution [ 6 93 3 1]\n",
"average reward per episode = -297.9\n",
"Training on 4/104 positive/total out of 502 1-step experiences with actions distribution [ 4 87 10 3]\n",
"average reward per episode = -415.4\n",
"Training on 1/101 positive/total out of 630 1-step experiences with actions distribution [ 1 92 7 1]\n",
"average reward per episode = -394.4\n",
"Training on 1/101 positive/total out of 615 1-step experiences with actions distribution [ 5 93 0 3]\n",
"average reward per episode = -200.4\n",
"Training on 7/105 positive/total out of 373 1-step experiences with actions distribution [ 5 91 3 6]\n",
"average reward per episode = -87.6\n",
"Training on 7/106 positive/total out of 237 1-step experiences with actions distribution [ 4 101 0 1]\n",
"average reward per episode = -255.3\n",
"Training on 5/104 positive/total out of 440 1-step experiences with actions distribution [ 3 95 3 3]\n",
"average reward per episode = -301.7\n",
"Training on 3/102 positive/total out of 492 1-step experiences with actions distribution [ 0 98 2 2]\n",
"average reward per episode = -170.0\n",
"Training on 6/106 positive/total out of 342 1-step experiences with actions distribution [ 0 99 3 4]\n",
"average reward per episode = -294.0\n",
"Training on 4/104 positive/total out of 513 1-step experiences with actions distribution [ 3 93 4 4]\n",
"average reward per episode = -343.5\n",
"Training on 4/103 positive/total out of 557 1-step experiences with actions distribution [ 6 94 1 2]\n",
"average reward per episode = -171.3\n",
"Training on 8/105 positive/total out of 347 1-step experiences with actions distribution [ 5 91 6 3]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2017-03-04 11:47:45,261] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000343.mp4\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"average reward per episode = -155.1\n",
"Training on 11/107 positive/total out of 340 1-step experiences with actions distribution [ 2 102 2 1]\n",
"average reward per episode = -388.8\n",
"Training on 2/101 positive/total out of 604 1-step experiences with actions distribution [ 2 95 2 2]\n",
"average reward per episode = -403.0\n",
"Training on 1/101 positive/total out of 628 1-step experiences with actions distribution [ 4 91 2 4]\n",
"average reward per episode = -331.6\n",
"Training on 2/102 positive/total out of 537 1-step experiences with actions distribution [ 2 97 1 2]\n",
"average reward per episode = -425.8\n",
"Training on 4/102 positive/total out of 661 1-step experiences with actions distribution [ 4 94 3 1]\n",
"average reward per episode = -205.2\n",
"Training on 8/106 positive/total out of 396 1-step experiences with actions distribution [ 5 98 1 2]\n",
"average reward per episode = -157.5\n",
"Training on 5/105 positive/total out of 324 1-step experiences with actions distribution [ 3 96 2 4]\n",
"average reward per episode = -328.3\n",
"Training on 4/103 positive/total out of 547 1-step experiences with actions distribution [ 2 95 2 4]\n",
"average reward per episode = -378.3\n",
"Training on 3/103 positive/total out of 608 1-step experiences with actions distribution [ 6 94 1 2]\n",
"average reward per episode = -328.8\n",
"Training on 4/104 positive/total out of 550 1-step experiences with actions distribution [ 3 97 3 1]\n",
"average reward per episode = -111.7\n",
"Training on 7/106 positive/total out of 273 1-step experiences with actions distribution [ 3 96 5 2]\n",
"average reward per episode = -227.4\n",
"Training on 5/105 positive/total out of 414 1-step experiences with actions distribution [ 0 96 4 5]\n",
"average reward per episode = -102.0\n",
"Training on 12/106 positive/total out of 258 1-step experiences with actions distribution [ 2 100 4]\n",
"average reward per episode = -255.5\n",
"Training on 4/104 positive/total out of 440 1-step experiences with actions distribution [ 4 94 3 3]\n",
"average reward per episode = -304.3\n",
"Training on 4/104 positive/total out of 514 1-step experiences with actions distribution [ 5 95 0 4]\n",
"average reward per episode = -330.7\n",
"Training on 4/104 positive/total out of 550 1-step experiences with actions distribution [ 4 92 6 2]\n",
"average reward per episode = -180.4\n",
"Training on 6/105 positive/total out of 360 1-step experiences with actions distribution [ 1 98 2 4]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2017-03-04 11:49:41,426] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000512.mp4\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"average reward per episode = -396.8\n",
"Training on 4/103 positive/total out of 634 1-step experiences with actions distribution [ 1 100 2]\n",
"average reward per episode = -230.3\n",
"Training on 6/104 positive/total out of 419 1-step experiences with actions distribution [ 1 97 4 2]\n",
"average reward per episode = -331.2\n",
"Training on 3/103 positive/total out of 536 1-step experiences with actions distribution [ 3 94 4 2]\n",
"average reward per episode = -232.8\n",
"Training on 5/104 positive/total out of 421 1-step experiences with actions distribution [ 2 99 3]\n",
"average reward per episode = -93.8\n",
"Training on 6/106 positive/total out of 247 1-step experiences with actions distribution [ 2 98 3 3]\n",
"average reward per episode = -82.3\n",
"Training on 13/107 positive/total out of 244 1-step experiences with actions distribution [ 2 104 0 1]\n",
"average reward per episode = -81.9\n",
"Training on 8/107 positive/total out of 246 1-step experiences with actions distribution [ 2 100 3 2]\n",
"average reward per episode = -286.0\n",
"Training on 3/103 positive/total out of 485 1-step experiences with actions distribution [ 1 98 2 2]\n",
"average reward per episode = -206.0\n",
"Training on 7/106 positive/total out of 398 1-step experiences with actions distribution [ 0 102 1 3]\n",
"average reward per episode = -149.5\n",
"Training on 6/106 positive/total out of 330 1-step experiences with actions distribution [ 5 93 5 3]\n",
"average reward per episode = -276.5\n",
"Training on 5/104 positive/total out of 469 1-step experiences with actions distribution [ 1 101 0 2]\n",
"average reward per episode = -196.6\n",
"Training on 7/105 positive/total out of 367 1-step experiences with actions distribution [ 3 98 1 3]\n",
"average reward per episode = -192.5\n",
"Training on 9/107 positive/total out of 395 1-step experiences with actions distribution [ 2 100 3 2]\n",
"average reward per episode = -286.9\n",
"Training on 6/104 positive/total out of 486 1-step experiences with actions distribution [ 2 101 1]\n",
"average reward per episode = -303.1\n",
"Training on 6/104 positive/total out of 511 1-step experiences with actions distribution [ 3 94 2 5]\n",
"average reward per episode = -338.8\n",
"Training on 5/103 positive/total out of 551 1-step experiences with actions distribution [ 2 95 4 2]\n",
"average reward per episode = -229.3\n",
"Training on 5/105 positive/total out of 433 1-step experiences with actions distribution [ 4 95 1 5]\n",
"average reward per episode = -149.4\n",
"Training on 12/106 positive/total out of 331 1-step experiences with actions distribution [ 3 96 1 6]\n",
"average reward per episode = -216.5\n",
"Training on 5/104 positive/total out of 398 1-step experiences with actions distribution [ 2 97 2 3]\n",
"average reward per episode = -311.6\n",
"Training on 4/104 positive/total out of 541 1-step experiences with actions distribution [ 4 94 4 2]\n",
"average reward per episode = -309.3\n",
"Training on 3/102 positive/total out of 504 1-step experiences with actions distribution [ 2 97 2 1]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2017-03-04 11:52:07,127] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000729.mp4\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"average reward per episode = -343.0\n",
"Training on 3/103 positive/total out of 555 1-step experiences with actions distribution [ 3 91 2 7]\n",
"average reward per episode = -357.0\n",
"Training on 0/100 positive/total out of 699 1-step experiences with actions distribution [93 2 3 2]\n",
"average reward per episode = -316.3\n",
"Training on 2/102 positive/total out of 659 1-step experiences with actions distribution [92 4 4 2]\n",
"average reward per episode = -358.5\n",
"Training on 0/100 positive/total out of 699 1-step experiences with actions distribution [95 0 1 4]\n"
]
}
],
"source": [
"import random\n",
"\n",
"class EpsilonGreedyQAgent(object):\n",
" def __init__(self, model, epsilon=.1):\n",
" self.model = model\n",
" self.epsilon = epsilon\n",
"\n",
" def act(self, observation, reward, done):\n",
" if random.uniform(0, 1) <= self.epsilon:\n",
" return random.choice([NOOP, SHOOT, LEFT, RIGHT])\n",
" else:\n",
" return self.model.predict(observation[np.newaxis])[0].argmax()\n",
"\n",
"\n",
"N_BATCHES = 100\n",
"N_BATCHED_EPISODES = 10\n",
"UPDATE_TARGET_EVERY_N_BACTHES = 2\n",
"MINI_BATCH_SIZE = 32\n",
"REWARD_CLIP = 5\n",
"ONLY_N_MISSES = 200\n",
"\n",
"env = create_env()\n",
"env = wrappers.Monitor(env, directory='tmp/q_learning', force=True, mode='training')\n",
"\n",
"for _ in range(N_BATCHES):\n",
" for _ in range(UPDATE_TARGET_EVERY_N_BACTHES):\n",
" sares = episode_sares(env, EpsilonGreedyQAgent(acting_model, epsilon=.1), N_BATCHED_EPISODES)\n",
" prev_frames, target_action_rewards = sares_to_input_targets(target_model, sares, reward_clip=REWARD_CLIP, only_n_misses=ONLY_N_MISSES)\n",
" acting_model.fit(x=prev_frames, y=target_action_rewards, batch_size=MINI_BATCH_SIZE, nb_epoch=1, verbose=0)\n",
" \n",
" target_model = copy_model(acting_model)\n",
"\n",
"\n",
"# final greedy episodes\n",
"sares = episode_sares(env, EpsilonGreedyQAgent(acting_model, epsilon=0), episode_count=1000)\n",
"\n",
"plt.plot(np.cumsum(list(map(operator.attrgetter('reward'), sares))));\n",
"plt.xlabel('steps'); plt.ylabel('Cumulated rewards');\n",
"\n",
"env.close()\n",
"gym.upload('tmp/q_learning', api_key='sk_bNZUvCfkTfabQCoKoKbjFA')"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import json\n",
"model_name = 'DoomBasic-v0_res=200x150_skip=4_discrete=minimal_fc64'\n",
"\n",
"acting_model.save(model_name + '.h5')\n",
"\n",
"with open(model_name + '.json', 'w+') as f:\n",
" json.dump(acting_model.to_json(), f)"
]
}
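,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (not run here) of how the saved `.h5` file could be reloaded in a later session, assuming the same Keras version; `load_model` restores the architecture, weights and optimizer state."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from keras.models import load_model\n",
"\n",
"# rebuild the full model (architecture + weights + optimizer state) from the .h5 file\n",
"restored_model = load_model(model_name + '.h5')\n",
"\n",
"# sanity check: greedy action for a blank frame (hypothetical input)\n",
"restored_model.predict(np.zeros((1, HEIGHT, WIDTH, 3))).argmax()"
]
}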
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}