rlandingin/fast-multi-core-tpu-mnist-training-bug.ipynb

## fast-multi-core-tpu-mnist-training-bug.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "accelerator": "TPU",
    "colab": {
      "name": "fast-multi-core-tpu-mnist-training-bug.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "toc_visible": true,
      "machine_shape": "hm",
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.7"
    },
    "varInspector": {
      "cols": {
        "lenName": 16,
        "lenType": 16,
        "lenVar": 40
      },
      "kernels_config": {
        "python": {
          "delete_cmd_postfix": "",
          "delete_cmd_prefix": "del ",
          "library": "var_list.py",
          "varRefreshCmd": "print(var_dic_list())"
        },
        "r": {
          "delete_cmd_postfix": ") ",
          "delete_cmd_prefix": "rm(",
          "library": "var_list.r",
          "varRefreshCmd": "cat(var_dic_list()) "
        }
      },
      "types_to_exclude": [
        "module",
        "function",
        "builtin_function_or_method",
        "instance",
        "_Feature"
      ],
      "window_display": false
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/rlandingin/714f637b927313d0d6898b721ff0bc35/fast-multi-core-tpu-mnist-training-bug.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "dn7xM36f7FxM",
        "outputId": "78a0d5a3-d055-4906-c985-74c05a23744d"
      },
      "source": [
        "!pip install -Uqq cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.7-cp36-cp36m-linux_x86_64.whl"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "\u001b[K     |████████████████████████████████| 133.6MB 27kB/s \n",
            "\u001b[K     |████████████████████████████████| 61kB 3.7MB/s \n",
            "\u001b[?25h"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "zK0jJDxT7FxP",
        "outputId": "2fc4e0c5-4845-49fa-f375-f0f807173f4d"
      },
      "source": [
        "# Install (or upgrade to) the latest fastai release from PyPI.\n",
        "# For the development version, install git+https://github.com/fastai/fastai.git instead.\n",
        "!pip install -qq --upgrade fastai"
      ],
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "\u001b[K     |████████████████████████████████| 194kB 6.6MB/s \n",
            "\u001b[K     |████████████████████████████████| 61kB 6.4MB/s \n",
            "\u001b[?25h"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "SRRtia2_7FxQ",
        "outputId": "4519728d-61ad-419d-a725-29d68bb22756"
      },
      "source": [
        "# Install (or upgrade) fastai_xla_extensions directly from its GitHub repository.\n",
        "!pip install -qq --upgrade git+https://github.com/butchland/fastai_xla_extensions.git"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "  Building wheel for fastai-xla-extensions (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "MY92w9-m7FxQ",
        "outputId": "65708d5b-a8ad-43c1-e091-1aa44e1df834"
      },
      "source": [
        "!pip install -Uqq git+https://github.com/butchland/my_timesaver_utils.git"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "  Building wheel for my-timesaver-utils (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "KXmksi5i7FxR",
        "outputId": "10beaa3a-bbdd-4aea-9f43-143f18ec2620"
      },
      "source": [
        "# NOTE(review): this pipes a remote script straight into bash. Judging by the host name it is the\n",
        "# Colab setup script of an older fast.ai course, and its recorded output only shows it updating\n",
        "# fastai, which an earlier cell already does - confirm whether this cell is still needed.\n",
        "# -f (--fail) makes curl emit nothing on an HTTP error, so an error page is never handed to bash.\n",
        "!curl -sf https://course19.fast.ai/setup/colab | bash"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Updating fastai...\n",
            "Done.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "epCpujMw7FxS",
        "outputId": "fd81577f-4f69-4864-a0ea-bf1a491e68e0"
      },
      "source": [
        "!pip freeze | grep torch\n",
        "!pip freeze | grep fast"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "torch==1.7.0+cu101\n",
            "torch-xla==1.7\n",
            "torchsummary==1.5.1\n",
            "torchtext==0.3.1\n",
            "torchvision==0.8.1+cu101\n",
            "fastai==2.2.5\n",
            "fastai-xla-extensions==0.0.8\n",
            "fastcore==1.3.19\n",
            "fastdtw==0.3.4\n",
            "fastprogress==1.0.0\n",
            "fastrlock==0.5\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GBSZSW_rcojV",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "6df3c7ff-18a7-43e7-e14b-371758a5ad93"
      },
      "source": [
        "# All imports live in this cell so a fresh kernel can run the notebook top to bottom.\n",
        "from fastai.vision.all import *\n",
        "from fastai_xla_extensions.all import *\n",
        "# Multiprocessing launcher used by the `xmp.spawn` call later in the notebook.\n",
        "import torch_xla.distributed.xla_multiprocessing as xmp"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "WARNING:root:TPU has started up successfully with version pytorch-1.7\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "d_D86ji8TR7t"
      },
      "source": [
        "path = untar_data(URLs.MNIST)\n",
        "# path = untar_data(URLs.MNIST_TINY)"
      ],
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "m-8sge7vTXX-"
      },
      "source": [
        "# Describe how to turn the MNIST folder into (image, category) samples:\n",
        "# `parent_label` supplies the target and `GrandparentSplitter` picks the train/valid split\n",
        "# from the 'training' / 'testing' directory names.\n",
        "data = DataBlock(\n",
        "    blocks=(ImageBlock, CategoryBlock),\n",
        "    get_items=get_image_files,\n",
        "    splitter=GrandparentSplitter(train_name='training', valid_name='testing'),\n",
        "    get_y=parent_label,\n",
        "    item_tfms=Resize(28),\n",
        "    batch_tfms=[],\n",
        ")"
      ],
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "PsLQM9HiTymq"
      },
      "source": [
        "dls = data.dataloaders(path, bs=64)"
      ],
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-DwvUtHoCUhZ",
        "outputId": "efbe1f59-2046-4ca7-d294-9ba0a19a804a"
      },
      "source": [
        "len(dls.train.items), len(dls.train)"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(60000, 937)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0Lfg2BUmT5Hr"
      },
      "source": [
        "learner = cnn_learner(dls, resnet18, metrics=accuracy, concat_pool=False)"
      ],
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 221
        },
        "id": "THxBBaYLYmM6",
        "outputId": "0d882859-6649-47f4-e6bd-f961c7867e10"
      },
      "source": [
        "learner.xla_fit(5, lr=3e-2)"
      ],
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "start fit\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: left;\">\n",
              "      <th>epoch</th>\n",
              "      <th>train_loss</th>\n",
              "      <th>valid_loss</th>\n",
              "      <th>accuracy</th>\n",
              "      <th>time</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <td>0</td>\n",
              "      <td>0.194564</td>\n",
              "      <td>0.071696</td>\n",
              "      <td>0.977500</td>\n",
              "      <td>00:31</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>1</td>\n",
              "      <td>0.113245</td>\n",
              "      <td>0.135002</td>\n",
              "      <td>0.958900</td>\n",
              "      <td>00:29</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>2</td>\n",
              "      <td>0.072109</td>\n",
              "      <td>0.084309</td>\n",
              "      <td>0.972900</td>\n",
              "      <td>00:29</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>3</td>\n",
              "      <td>0.096696</td>\n",
              "      <td>0.226016</td>\n",
              "      <td>0.932500</td>\n",
              "      <td>00:29</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>4</td>\n",
              "      <td>0.083080</td>\n",
              "      <td>0.159650</td>\n",
              "      <td>0.954700</td>\n",
              "      <td>00:29</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>"
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {
            "tags": []
          }
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "dAVij2ZpQZFa"
      },
      "source": [
        "learner.unfreeze()"
      ],
      "execution_count": 8,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BElcfjk5njZL"
      },
      "source": [
        "n_epoch = 2    # epochs for the multi-core fit_one_cycle run\n",
        "num_cores = 8  # number of processes handed to xmp.spawn (nprocs)\n",
        "\n",
        "# Arguments forwarded to `xla_run_method` in every spawned process.\n",
        "ctrl_args = learner.pre_xla_fit()\n",
        "learner_args, add_args = learner.pack_learner_args()\n",
        "fit_args = {'lr_max': slice(1e-4), 'n_epoch': n_epoch}"
      ],
      "execution_count": 9,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "nmZpxWherAKU"
      },
      "source": [
        "import torch_xla.distributed.xla_multiprocessing as xmp"
      ],
      "execution_count": 10,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "pMkHmHiFqbIE",
        "outputId": "eeae564f-2b92-4e77-b609-df7842398907"
      },
      "source": [
        "# Launch `Learner.fit_one_cycle` through `xla_run_method` on `num_cores` forked processes.\n",
        "# NOTE: in the recorded run every TPU process fails here with\n",
        "# 'index 3 is out of bounds for dimension 0 with size 3' (see the output below).\n",
        "xmp.spawn(\n",
        "    xla_run_method,\n",
        "    args=(Learner.fit_one_cycle, learner_args, add_args, fit_args, ctrl_args),\n",
        "    nprocs=num_cores,\n",
        "    start_method='fork',\n",
        ")"
      ],
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "start fit\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "        <style>\n",
              "            /* Turns off some styling */\n",
              "            progress {\n",
              "                /* gets rid of default border in Firefox and Opera. */\n",
              "                border: none;\n",
              "                /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
              "                background-size: auto;\n",
              "            }\n",
              "            .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
              "                background: #F44336;\n",
              "            }\n",
              "        </style>\n",
              "      <progress value='1' class='' max='2' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      50.00% [1/2 00:40<00:40]\n",
              "    </div>\n",
              "    \n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: left;\">\n",
              "      <th>epoch</th>\n",
              "      <th>train_loss</th>\n",
              "      <th>valid_loss</th>\n",
              "      <th>accuracy</th>\n",
              "      <th>time</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <td>0</td>\n",
              "      <td>0.058910</td>\n",
              "      <td>0.029479</td>\n",
              "      <td>0.990200</td>\n",
              "      <td>00:40</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table><p>\n",
              "\n",
              "    <div>\n",
              "        <style>\n",
              "            /* Turns off some styling */\n",
              "            progress {\n",
              "                /* gets rid of default border in Firefox and Opera. */\n",
              "                border: none;\n",
              "                /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
              "                background-size: auto;\n",
              "            }\n",
              "            .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
              "                background: #F44336;\n",
              "            }\n",
              "        </style>\n",
              "      <progress value='59' class='' max='118' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      50.00% [59/118 00:16<00:16 0.0564]\n",
              "    </div>\n",
              "    "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "Exception in device=TPU:7: index 3 is out of bounds for dimension 0 with size 3\n",
            "Exception in device=TPU:2: index 3 is out of bounds for dimension 0 with size 3\n",
            "Exception in device=TPU:3: index 3 is out of bounds for dimension 0 with size 3\n",
            "Exception in device=TPU:5: index 3 is out of bounds for dimension 0 with size 3\n",
            "Exception in device=TPU:4: index 3 is out of bounds for dimension 0 with size 3\n",
            "Exception in device=TPU:1: index 3 is out of bounds for dimension 0 with size 3\n",
            "Traceback (most recent call last):\n",
            "Traceback (most recent call last):\n",
            "Exception in device=TPU:6: index 3 is out of bounds for dimension 0 with size 3\n",
            "Traceback (most recent call last):\n",
            "Traceback (most recent call last):\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
            "    _start_fn(index, pf_cfg, fn, args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
            "    _start_fn(index, pf_cfg, fn, args)\n",
            "Traceback (most recent call last):\n",
            "Exception in device=TPU:0: index 3 is out of bounds for dimension 0 with size 3\n",
            "Traceback (most recent call last):\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
            "    _start_fn(index, pf_cfg, fn, args)\n",
            "Traceback (most recent call last):\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
            "    _start_fn(index, pf_cfg, fn, args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
            "    _start_fn(index, pf_cfg, fn, args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
            "    fn(gindex, *args)\n",
            "Traceback (most recent call last):\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
            "    fn(gindex, *args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
            "    _start_fn(index, pf_cfg, fn, args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
            "    fn(gindex, *args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
            "    fn(gindex, *args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
            "    _start_fn(index, pf_cfg, fn, args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
            "    fn(gindex, *args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
            "    fit_method(learner, **fit_args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
            "    fit_method(learner, **fit_args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
            "    fn(gindex, *args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
            "    fit_method(learner, **fit_args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
            "    _start_fn(index, pf_cfg, fn, args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
            "    fit_method(learner, **fit_args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
            "    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
            "    fit_method(learner, **fit_args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
            "    fit_method(learner, **fit_args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
            "    fn(gindex, *args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
            "    self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
            "    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
            "    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
            "    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
            "    fn(gindex, *args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
            "    fit_method(learner, **fit_args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
            "    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
            "    self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
            "    self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
            "    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
            "    self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
            "    fit_method(learner, **fit_args)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
            "    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
            "    self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
            "    self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
            "    self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
            "    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
            "    self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
            "    self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
            "    self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
            "    self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
            "    self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
            "    self._do_epoch_train()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
            "    self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
            "    self._do_epoch_train()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
            "    self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
            "    self._do_epoch_train()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
            "    self._with_events(self.all_batches, 'train', CancelTrainException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
            "    self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
            "    self._do_epoch_train()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
            "    self._with_events(self.all_batches, 'train', CancelTrainException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
            "    self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
            "    self._with_events(self.all_batches, 'train', CancelTrainException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
            "    self._do_epoch_train()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
            "    self._with_events(self.all_batches, 'train', CancelTrainException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
            "    self._do_epoch_train()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
            "    for o in enumerate(self.dl): self.one_batch(*o)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
            "    self._do_epoch_train()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
            "    self._with_events(self.all_batches, 'train', CancelTrainException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
            "    for o in enumerate(self.dl): self.one_batch(*o)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
            "    self._do_epoch_train()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
            "    self._with_events(self.all_batches, 'train', CancelTrainException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
            "    self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
            "    self._with_events(self.all_batches, 'train', CancelTrainException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
            "    self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
            "    for o in enumerate(self.dl): self.one_batch(*o)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
            "    for o in enumerate(self.dl): self.one_batch(*o)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
            "    self._with_events(self.all_batches, 'train', CancelTrainException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
            "    def __call__(self, event_name): L(event_name).map(self._call_one)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
            "    def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
            "    self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
            "    def __call__(self, event_name): L(event_name).map(self._call_one)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
            "    self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
            "    for o in enumerate(self.dl): self.one_batch(*o)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
            "    for o in enumerate(self.dl): self.one_batch(*o)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
            "    for o in enumerate(self.dl): self.one_batch(*o)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
            "    for o in enumerate(self.dl): self.one_batch(*o)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
            "    def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
            "IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
            "    self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
            "    self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
            "    self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
            "IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
            "    def __call__(self, event_name): L(event_name).map(self._call_one)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
            "    def __call__(self, event_name): L(event_name).map(self._call_one)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
            "    self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
            "    def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
            "    def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
            "    def __call__(self, event_name): L(event_name).map(self._call_one)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
            "    try: self(f'before_{event_type}');  f()\n",
            "IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
            "IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
            "    def __call__(self, event_name): L(event_name).map(self._call_one)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
            "    def __call__(self, event_name): L(event_name).map(self._call_one)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
            "    def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
            "    def __call__(self, event_name): L(event_name).map(self._call_one)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
            "    def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
            "    def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
            "    def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
            "IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
            "Exception in thread Thread-8:\n",
            "Traceback (most recent call last):\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 872, in _try_get_data\n",
            "    data = self._data_queue.get(timeout=timeout)\n",
            "  File \"/usr/lib/python3.6/multiprocessing/queues.py\", line 113, in get\n",
            "    return _ForkingPickler.loads(res)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch/multiprocessing/reductions.py\", line 282, in rebuild_storage_fd\n",
            "    fd = df.detach()\n",
            "  File \"/usr/lib/python3.6/multiprocessing/resource_sharer.py\", line 57, in detach\n",
            "    with _resource_sharer.get_connection(self._id) as conn:\n",
            "  File \"/usr/lib/python3.6/multiprocessing/resource_sharer.py\", line 87, in get_connection\n",
            "    c = Client(address, authkey=process.current_process().authkey)\n",
            "  File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 493, in Client\n",
            "    answer_challenge(c, authkey)\n",
            "  File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 732, in answer_challenge\n",
            "    message = connection.recv_bytes(256)         # reject large message\n",
            "  File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 216, in recv_bytes\n",
            "    buf = self._recv_bytes(maxlength)\n",
            "  File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 407, in _recv_bytes\n",
            "    buf = self._recv(4)\n",
            "  File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 379, in _recv\n",
            "    chunk = read(handle, remaining)\n",
            "ConnectionResetError: [Errno 104] Connection reset by peer\n",
            "\n",
            "The above exception was the direct cause of the following exception:\n",
            "\n",
            "Traceback (most recent call last):\n",
            "  File \"/usr/lib/python3.6/threading.py\", line 916, in _bootstrap_inner\n",
            "    self.run()\n",
            "  File \"/usr/lib/python3.6/threading.py\", line 864, in run\n",
            "    self._target(*self._args, **self._kwargs)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/parallel_loader.py\", line 141, in _loader_worker\n",
            "    _, data = next(data_iter)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/data/load.py\", line 101, in __iter__\n",
            "    for b in _loaders[self.fake_l.num_workers==0](self.fake_l):\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 435, in __next__\n",
            "    data = self._next_data()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 1068, in _next_data\n",
            "    idx, data = self._get_data()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 1034, in _get_data\n",
            "    success, data = self._try_get_data()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 885, in _try_get_data\n",
            "    raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e\n",
            "RuntimeError: DataLoader worker (pid(s) 28531, 28543, 28576, 28675, 28687) exited unexpectedly\n",
            "IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
            "IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
            "IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
            "\n",
            "Exception in thread Thread-8:\n",
            "Traceback (most recent call last):\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 872, in _try_get_data\n",
            "    data = self._data_queue.get(timeout=timeout)\n",
            "  File \"/usr/lib/python3.6/multiprocessing/queues.py\", line 113, in get\n",
            "    return _ForkingPickler.loads(res)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch/multiprocessing/reductions.py\", line 282, in rebuild_storage_fd\n",
            "    fd = df.detach()\n",
            "  File \"/usr/lib/python3.6/multiprocessing/resource_sharer.py\", line 57, in detach\n",
            "    with _resource_sharer.get_connection(self._id) as conn:\n",
            "  File \"/usr/lib/python3.6/multiprocessing/resource_sharer.py\", line 87, in get_connection\n",
            "    c = Client(address, authkey=process.current_process().authkey)\n",
            "  File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 487, in Client\n",
            "    c = SocketClient(address)\n",
            "  File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 614, in SocketClient\n",
            "    s.connect(address)\n",
            "ConnectionRefusedError: [Errno 111] Connection refused\n",
            "\n",
            "The above exception was the direct cause of the following exception:\n",
            "\n",
            "Traceback (most recent call last):\n",
            "  File \"/usr/lib/python3.6/threading.py\", line 916, in _bootstrap_inner\n",
            "    self.run()\n",
            "  File \"/usr/lib/python3.6/threading.py\", line 864, in run\n",
            "    self._target(*self._args, **self._kwargs)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/parallel_loader.py\", line 141, in _loader_worker\n",
            "    _, data = next(data_iter)\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/fastai/data/load.py\", line 101, in __iter__\n",
            "    for b in _loaders[self.fake_l.num_workers==0](self.fake_l):\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 435, in __next__\n",
            "    data = self._next_data()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 1068, in _next_data\n",
            "    idx, data = self._get_data()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 1034, in _get_data\n",
            "    success, data = self._try_get_data()\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 885, in _try_get_data\n",
            "    raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e\n",
            "RuntimeError: DataLoader worker (pid(s) 28571, 28579, 28596, 28619, 28665, 28677, 28684, 28692, 28703, 28738, 28768, 28777, 28785, 28797, 28805) exited unexpectedly\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "error",
          "ename": "Exception",
          "evalue": "ignored",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mException\u001b[0m                                 Traceback (most recent call last)",
            "\u001b[0;32m<ipython-input-11-7ef8d746365d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      2\u001b[0m           \u001b[0margs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mLearner\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_one_cycle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlearner_args\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0madd_args\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfit_args\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mctrl_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m           \u001b[0mnprocs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnum_cores\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m               start_method='fork')\n\u001b[0m",
            "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\u001b[0m in \u001b[0;36mspawn\u001b[0;34m(fn, args, nprocs, join, daemon, start_method)\u001b[0m\n\u001b[1;32m    393\u001b[0m         \u001b[0mjoin\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    394\u001b[0m         \u001b[0mdaemon\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdaemon\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 395\u001b[0;31m         start_method=start_method)\n\u001b[0m\u001b[1;32m    396\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    397\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/multiprocessing/spawn.py\u001b[0m in \u001b[0;36mstart_processes\u001b[0;34m(fn, args, nprocs, join, daemon, start_method)\u001b[0m\n\u001b[1;32m    155\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    156\u001b[0m     \u001b[0;31m# Loop on join until it returns True or raises an exception.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 157\u001b[0;31m     \u001b[0;32mwhile\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mcontext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    158\u001b[0m         \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    159\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/multiprocessing/spawn.py\u001b[0m in \u001b[0;36mjoin\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    110\u001b[0m                 raise Exception(\n\u001b[1;32m    111\u001b[0m                     \u001b[0;34m\"process %d terminated with exit code %d\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 112\u001b[0;31m                     \u001b[0;34m(\u001b[0m\u001b[0merror_index\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexitcode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    113\u001b[0m                 )\n\u001b[1;32m    114\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;31mException\u001b[0m: process 7 terminated with exit code 17"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "66CdLIQQQdE6"
      },
      "source": [
        "# learner.xla_fit_one_cycle(2, lr_max=slice(1e-4), num_cores=8)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "LcPCdD_Qgwtu"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}