Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save rlandingin/714f637b927313d0d6898b721ff0bc35 to your computer and use it in GitHub Desktop.
Save rlandingin/714f637b927313d0d6898b721ff0bc35 to your computer and use it in GitHub Desktop.
fast-multi-core-tpu-mnist-training-bug.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"accelerator": "TPU",
"colab": {
"name": "fast-multi-core-tpu-mnist-training-bug.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true,
"machine_shape": "hm",
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/rlandingin/714f637b927313d0d6898b721ff0bc35/fast-multi-core-tpu-mnist-training-bug.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dn7xM36f7FxM",
"outputId": "78a0d5a3-d055-4906-c985-74c05a23744d"
},
"source": [
"!pip install -Uqq cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.7-cp36-cp36m-linux_x86_64.whl"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"\u001b[K |████████████████████████████████| 133.6MB 27kB/s \n",
"\u001b[K |████████████████████████████████| 61kB 3.7MB/s \n",
"\u001b[?25h"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "zK0jJDxT7FxP",
"outputId": "2fc4e0c5-4845-49fa-f375-f0f807173f4d"
},
"source": [
"# !pip install -Uqq git+https://github.com/fastai/fastai.git \n",
"!pip install -Uqq fastai"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"\u001b[K |████████████████████████████████| 194kB 6.6MB/s \n",
"\u001b[K |████████████████████████████████| 61kB 6.4MB/s \n",
"\u001b[?25h"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "SRRtia2_7FxQ",
"outputId": "4519728d-61ad-419d-a725-29d68bb22756"
},
"source": [
"!pip install -Uqq git+https://github.com/butchland/fastai_xla_extensions.git"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
" Building wheel for fastai-xla-extensions (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "MY92w9-m7FxQ",
"outputId": "65708d5b-a8ad-43c1-e091-1aa44e1df834"
},
"source": [
"!pip install -Uqq git+https://github.com/butchland/my_timesaver_utils.git"
],
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"text": [
" Building wheel for my-timesaver-utils (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "KXmksi5i7FxR",
"outputId": "10beaa3a-bbdd-4aea-9f43-143f18ec2620"
},
"source": [
"!curl -s https://course19.fast.ai/setup/colab | bash"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": [
"Updating fastai...\n",
"Done.\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "epCpujMw7FxS",
"outputId": "fd81577f-4f69-4864-a0ea-bf1a491e68e0"
},
"source": [
"!pip freeze | grep torch\n",
"!pip freeze | grep fast"
],
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": [
"torch==1.7.0+cu101\n",
"torch-xla==1.7\n",
"torchsummary==1.5.1\n",
"torchtext==0.3.1\n",
"torchvision==0.8.1+cu101\n",
"fastai==2.2.5\n",
"fastai-xla-extensions==0.0.8\n",
"fastcore==1.3.19\n",
"fastdtw==0.3.4\n",
"fastprogress==1.0.0\n",
"fastrlock==0.5\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "GBSZSW_rcojV",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "6df3c7ff-18a7-43e7-e14b-371758a5ad93"
},
"source": [
"from fastai.vision.all import *\n",
"from fastai_xla_extensions.all import *\n",
"# import torch_xla.core.xla_model as xm\n",
"# import torch_xla"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"WARNING:root:TPU has started up successfully with version pytorch-1.7\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "d_D86ji8TR7t"
},
"source": [
"path = untar_data(URLs.MNIST)\n",
"# path = untar_data(URLs.MNIST_TINY)"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "m-8sge7vTXX-"
},
"source": [
"data = DataBlock(\n",
" blocks=(ImageBlock, CategoryBlock),\n",
" get_items=get_image_files,\n",
" get_y=parent_label,\n",
" splitter=GrandparentSplitter(valid_name='testing', train_name='training'),\n",
" item_tfms=Resize(28),\n",
" batch_tfms=[]\n",
")\n"
],
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "PsLQM9HiTymq"
},
"source": [
"dls = data.dataloaders(path, bs=64)"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-DwvUtHoCUhZ",
"outputId": "efbe1f59-2046-4ca7-d294-9ba0a19a804a"
},
"source": [
"len(dls.train.items), len(dls.train)"
],
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(60000, 937)"
]
},
"metadata": {
"tags": []
},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "0Lfg2BUmT5Hr"
},
"source": [
"learner = cnn_learner(dls, resnet18, metrics=accuracy, concat_pool=False)"
],
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 221
},
"id": "THxBBaYLYmM6",
"outputId": "0d882859-6649-47f4-e6bd-f961c7867e10"
},
"source": [
"learner.xla_fit(5, lr=3e-2)"
],
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"text": [
"start fit\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>epoch</th>\n",
" <th>train_loss</th>\n",
" <th>valid_loss</th>\n",
" <th>accuracy</th>\n",
" <th>time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>0.194564</td>\n",
" <td>0.071696</td>\n",
" <td>0.977500</td>\n",
" <td>00:31</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0.113245</td>\n",
" <td>0.135002</td>\n",
" <td>0.958900</td>\n",
" <td>00:29</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0.072109</td>\n",
" <td>0.084309</td>\n",
" <td>0.972900</td>\n",
" <td>00:29</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0.096696</td>\n",
" <td>0.226016</td>\n",
" <td>0.932500</td>\n",
" <td>00:29</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0.083080</td>\n",
" <td>0.159650</td>\n",
" <td>0.954700</td>\n",
" <td>00:29</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "dAVij2ZpQZFa"
},
"source": [
"learner.unfreeze()"
],
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "BElcfjk5njZL"
},
"source": [
"n_epoch = 2\n",
"num_cores = 8\n",
"\n",
"ctrl_args = learner.pre_xla_fit()\n",
"learner_args, add_args = learner.pack_learner_args()\n",
"fit_args={'lr_max':slice(1e-4)}\n",
"fit_args['n_epoch'] = n_epoch"
],
"execution_count": 9,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "nmZpxWherAKU"
},
"source": [
"import torch_xla.distributed.xla_multiprocessing as xmp"
],
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "pMkHmHiFqbIE",
"outputId": "eeae564f-2b92-4e77-b609-df7842398907"
},
"source": [
"xmp.spawn(xla_run_method,\n",
" args=(Learner.fit_one_cycle, learner_args, add_args, fit_args, ctrl_args),\n",
" nprocs=num_cores,\n",
" start_method='fork')"
],
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"text": [
"start fit\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <div>\n",
" <style>\n",
" /* Turns off some styling */\n",
" progress {\n",
" /* gets rid of default border in Firefox and Opera. */\n",
" border: none;\n",
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
" background-size: auto;\n",
" }\n",
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
" background: #F44336;\n",
" }\n",
" </style>\n",
" <progress value='1' class='' max='2' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" 50.00% [1/2 00:40<00:40]\n",
" </div>\n",
" \n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>epoch</th>\n",
" <th>train_loss</th>\n",
" <th>valid_loss</th>\n",
" <th>accuracy</th>\n",
" <th>time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>0.058910</td>\n",
" <td>0.029479</td>\n",
" <td>0.990200</td>\n",
" <td>00:40</td>\n",
" </tr>\n",
" </tbody>\n",
"</table><p>\n",
"\n",
" <div>\n",
" <style>\n",
" /* Turns off some styling */\n",
" progress {\n",
" /* gets rid of default border in Firefox and Opera. */\n",
" border: none;\n",
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
" background-size: auto;\n",
" }\n",
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
" background: #F44336;\n",
" }\n",
" </style>\n",
" <progress value='59' class='' max='118' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" 50.00% [59/118 00:16<00:16 0.0564]\n",
" </div>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"Exception in device=TPU:7: index 3 is out of bounds for dimension 0 with size 3\n",
"Exception in device=TPU:2: index 3 is out of bounds for dimension 0 with size 3\n",
"Exception in device=TPU:3: index 3 is out of bounds for dimension 0 with size 3\n",
"Exception in device=TPU:5: index 3 is out of bounds for dimension 0 with size 3\n",
"Exception in device=TPU:4: index 3 is out of bounds for dimension 0 with size 3\n",
"Exception in device=TPU:1: index 3 is out of bounds for dimension 0 with size 3\n",
"Traceback (most recent call last):\n",
"Traceback (most recent call last):\n",
"Exception in device=TPU:6: index 3 is out of bounds for dimension 0 with size 3\n",
"Traceback (most recent call last):\n",
"Traceback (most recent call last):\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
" _start_fn(index, pf_cfg, fn, args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
" _start_fn(index, pf_cfg, fn, args)\n",
"Traceback (most recent call last):\n",
"Exception in device=TPU:0: index 3 is out of bounds for dimension 0 with size 3\n",
"Traceback (most recent call last):\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
" _start_fn(index, pf_cfg, fn, args)\n",
"Traceback (most recent call last):\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
" _start_fn(index, pf_cfg, fn, args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
" _start_fn(index, pf_cfg, fn, args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
" fn(gindex, *args)\n",
"Traceback (most recent call last):\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
" fn(gindex, *args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
" _start_fn(index, pf_cfg, fn, args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
" fn(gindex, *args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
" fn(gindex, *args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
" _start_fn(index, pf_cfg, fn, args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
" fn(gindex, *args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
" fit_method(learner, **fit_args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
" fit_method(learner, **fit_args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
" fn(gindex, *args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
" fit_method(learner, **fit_args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n",
" _start_fn(index, pf_cfg, fn, args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
" fit_method(learner, **fit_args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
" fit_method(learner, **fit_args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
" fit_method(learner, **fit_args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
" fn(gindex, *args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n",
" fn(gindex, *args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
" fit_method(learner, **fit_args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n",
" fit_method(learner, **fit_args)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n",
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n",
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
" self._do_epoch_train()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
" self._do_epoch_train()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
" self._do_epoch_train()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
" self._with_events(self.all_batches, 'train', CancelTrainException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
" self._do_epoch_train()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
" self._with_events(self.all_batches, 'train', CancelTrainException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n",
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
" self._with_events(self.all_batches, 'train', CancelTrainException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
" self._do_epoch_train()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
" self._with_events(self.all_batches, 'train', CancelTrainException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
" self._do_epoch_train()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
" for o in enumerate(self.dl): self.one_batch(*o)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
" self._do_epoch_train()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
" self._with_events(self.all_batches, 'train', CancelTrainException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
" for o in enumerate(self.dl): self.one_batch(*o)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n",
" self._do_epoch_train()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
" self._with_events(self.all_batches, 'train', CancelTrainException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
" self._with_events(self.all_batches, 'train', CancelTrainException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
" for o in enumerate(self.dl): self.one_batch(*o)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
" for o in enumerate(self.dl): self.one_batch(*o)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n",
" self._with_events(self.all_batches, 'train', CancelTrainException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
" def __call__(self, event_name): L(event_name).map(self._call_one)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
" def __call__(self, event_name): L(event_name).map(self._call_one)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
" for o in enumerate(self.dl): self.one_batch(*o)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
" for o in enumerate(self.dl): self.one_batch(*o)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
" for o in enumerate(self.dl): self.one_batch(*o)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n",
" for o in enumerate(self.dl): self.one_batch(*o)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
" def __call__(self, event_name): L(event_name).map(self._call_one)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
" def __call__(self, event_name): L(event_name).map(self._call_one)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n",
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
" def __call__(self, event_name): L(event_name).map(self._call_one)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n",
" try: self(f'before_{event_type}'); f()\n",
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
" def __call__(self, event_name): L(event_name).map(self._call_one)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
" def __call__(self, event_name): L(event_name).map(self._call_one)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n",
" def __call__(self, event_name): L(event_name).map(self._call_one)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n",
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n",
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
"Exception in thread Thread-8:\n",
"Traceback (most recent call last):\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 872, in _try_get_data\n",
" data = self._data_queue.get(timeout=timeout)\n",
" File \"/usr/lib/python3.6/multiprocessing/queues.py\", line 113, in get\n",
" return _ForkingPickler.loads(res)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch/multiprocessing/reductions.py\", line 282, in rebuild_storage_fd\n",
" fd = df.detach()\n",
" File \"/usr/lib/python3.6/multiprocessing/resource_sharer.py\", line 57, in detach\n",
" with _resource_sharer.get_connection(self._id) as conn:\n",
" File \"/usr/lib/python3.6/multiprocessing/resource_sharer.py\", line 87, in get_connection\n",
" c = Client(address, authkey=process.current_process().authkey)\n",
" File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 493, in Client\n",
" answer_challenge(c, authkey)\n",
" File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 732, in answer_challenge\n",
" message = connection.recv_bytes(256) # reject large message\n",
" File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 216, in recv_bytes\n",
" buf = self._recv_bytes(maxlength)\n",
" File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 407, in _recv_bytes\n",
" buf = self._recv(4)\n",
" File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 379, in _recv\n",
" chunk = read(handle, remaining)\n",
"ConnectionResetError: [Errno 104] Connection reset by peer\n",
"\n",
"The above exception was the direct cause of the following exception:\n",
"\n",
"Traceback (most recent call last):\n",
" File \"/usr/lib/python3.6/threading.py\", line 916, in _bootstrap_inner\n",
" self.run()\n",
" File \"/usr/lib/python3.6/threading.py\", line 864, in run\n",
" self._target(*self._args, **self._kwargs)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/parallel_loader.py\", line 141, in _loader_worker\n",
" _, data = next(data_iter)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/data/load.py\", line 101, in __iter__\n",
" for b in _loaders[self.fake_l.num_workers==0](self.fake_l):\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 435, in __next__\n",
" data = self._next_data()\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 1068, in _next_data\n",
" idx, data = self._get_data()\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 1034, in _get_data\n",
" success, data = self._try_get_data()\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 885, in _try_get_data\n",
" raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e\n",
"RuntimeError: DataLoader worker (pid(s) 28531, 28543, 28576, 28675, 28687) exited unexpectedly\n",
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n",
"\n",
"Exception in thread Thread-8:\n",
"Traceback (most recent call last):\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 872, in _try_get_data\n",
" data = self._data_queue.get(timeout=timeout)\n",
" File \"/usr/lib/python3.6/multiprocessing/queues.py\", line 113, in get\n",
" return _ForkingPickler.loads(res)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch/multiprocessing/reductions.py\", line 282, in rebuild_storage_fd\n",
" fd = df.detach()\n",
" File \"/usr/lib/python3.6/multiprocessing/resource_sharer.py\", line 57, in detach\n",
" with _resource_sharer.get_connection(self._id) as conn:\n",
" File \"/usr/lib/python3.6/multiprocessing/resource_sharer.py\", line 87, in get_connection\n",
" c = Client(address, authkey=process.current_process().authkey)\n",
" File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 487, in Client\n",
" c = SocketClient(address)\n",
" File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 614, in SocketClient\n",
" s.connect(address)\n",
"ConnectionRefusedError: [Errno 111] Connection refused\n",
"\n",
"The above exception was the direct cause of the following exception:\n",
"\n",
"Traceback (most recent call last):\n",
" File \"/usr/lib/python3.6/threading.py\", line 916, in _bootstrap_inner\n",
" self.run()\n",
" File \"/usr/lib/python3.6/threading.py\", line 864, in run\n",
" self._target(*self._args, **self._kwargs)\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/parallel_loader.py\", line 141, in _loader_worker\n",
" _, data = next(data_iter)\n",
" File \"/usr/local/lib/python3.6/dist-packages/fastai/data/load.py\", line 101, in __iter__\n",
" for b in _loaders[self.fake_l.num_workers==0](self.fake_l):\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 435, in __next__\n",
" data = self._next_data()\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 1068, in _next_data\n",
" idx, data = self._get_data()\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 1034, in _get_data\n",
" success, data = self._try_get_data()\n",
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 885, in _try_get_data\n",
" raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e\n",
"RuntimeError: DataLoader worker (pid(s) 28571, 28579, 28596, 28619, 28665, 28677, 28684, 28692, 28703, 28738, 28768, 28777, 28785, 28797, 28805) exited unexpectedly\n"
],
"name": "stderr"
},
{
"output_type": "error",
"ename": "Exception",
"evalue": "ignored",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mException\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-11-7ef8d746365d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mLearner\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_one_cycle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlearner_args\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0madd_args\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfit_args\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mctrl_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mnprocs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnum_cores\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m start_method='fork')\n\u001b[0m",
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\u001b[0m in \u001b[0;36mspawn\u001b[0;34m(fn, args, nprocs, join, daemon, start_method)\u001b[0m\n\u001b[1;32m 393\u001b[0m \u001b[0mjoin\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 394\u001b[0m \u001b[0mdaemon\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdaemon\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 395\u001b[0;31m start_method=start_method)\n\u001b[0m\u001b[1;32m 396\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 397\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/multiprocessing/spawn.py\u001b[0m in \u001b[0;36mstart_processes\u001b[0;34m(fn, args, nprocs, join, daemon, start_method)\u001b[0m\n\u001b[1;32m 155\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[0;31m# Loop on join until it returns True or raises an exception.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 157\u001b[0;31m \u001b[0;32mwhile\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mcontext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 158\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/multiprocessing/spawn.py\u001b[0m in \u001b[0;36mjoin\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 110\u001b[0m raise Exception(\n\u001b[1;32m 111\u001b[0m \u001b[0;34m\"process %d terminated with exit code %d\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 112\u001b[0;31m \u001b[0;34m(\u001b[0m\u001b[0merror_index\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexitcode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 113\u001b[0m )\n\u001b[1;32m 114\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mException\u001b[0m: process 7 terminated with exit code 17"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "66CdLIQQQdE6"
},
"source": [
"# learner.xla_fit_one_cycle(2, lr_max=slice(1e-4), num_cores=8)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "LcPCdD_Qgwtu"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment