Created
February 15, 2021 13:06
-
-
Save rlandingin/714f637b927313d0d6898b721ff0bc35 to your computer and use it in GitHub Desktop.
fast-multi-core-tpu-mnist-training-bug.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"accelerator": "TPU", | |
"colab": { | |
"name": "fast-multi-core-tpu-mnist-training-bug.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"toc_visible": true, | |
"machine_shape": "hm", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.7" | |
}, | |
"varInspector": { | |
"cols": { | |
"lenName": 16, | |
"lenType": 16, | |
"lenVar": 40 | |
}, | |
"kernels_config": { | |
"python": { | |
"delete_cmd_postfix": "", | |
"delete_cmd_prefix": "del ", | |
"library": "var_list.py", | |
"varRefreshCmd": "print(var_dic_list())" | |
}, | |
"r": { | |
"delete_cmd_postfix": ") ", | |
"delete_cmd_prefix": "rm(", | |
"library": "var_list.r", | |
"varRefreshCmd": "cat(var_dic_list()) " | |
} | |
}, | |
"types_to_exclude": [ | |
"module", | |
"function", | |
"builtin_function_or_method", | |
"instance", | |
"_Feature" | |
], | |
"window_display": false | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/rlandingin/714f637b927313d0d6898b721ff0bc35/fast-multi-core-tpu-mnist-training-bug.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "dn7xM36f7FxM", | |
"outputId": "78a0d5a3-d055-4906-c985-74c05a23744d" | |
}, | |
"source": [ | |
"!pip install -Uqq cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.7-cp36-cp36m-linux_x86_64.whl" | |
], | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"\u001b[K |████████████████████████████████| 133.6MB 27kB/s \n", | |
"\u001b[K |████████████████████████████████| 61kB 3.7MB/s \n", | |
"\u001b[?25h" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "zK0jJDxT7FxP", | |
"outputId": "2fc4e0c5-4845-49fa-f375-f0f807173f4d" | |
}, | |
"source": [ | |
"# !pip install -Uqq git+https://github.com/fastai/fastai.git \n", | |
"!pip install -Uqq fastai" | |
], | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"\u001b[K |████████████████████████████████| 194kB 6.6MB/s \n", | |
"\u001b[K |████████████████████████████████| 61kB 6.4MB/s \n", | |
"\u001b[?25h" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "SRRtia2_7FxQ", | |
"outputId": "4519728d-61ad-419d-a725-29d68bb22756" | |
}, | |
"source": [ | |
"!pip install -Uqq git+https://github.com/butchland/fastai_xla_extensions.git" | |
], | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
" Building wheel for fastai-xla-extensions (setup.py) ... \u001b[?25l\u001b[?25hdone\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "MY92w9-m7FxQ", | |
"outputId": "65708d5b-a8ad-43c1-e091-1aa44e1df834" | |
}, | |
"source": [ | |
"!pip install -Uqq git+https://github.com/butchland/my_timesaver_utils.git" | |
], | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
" Building wheel for my-timesaver-utils (setup.py) ... \u001b[?25l\u001b[?25hdone\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "KXmksi5i7FxR", | |
"outputId": "10beaa3a-bbdd-4aea-9f43-143f18ec2620" | |
}, | |
"source": [ | |
"!curl -s https://course19.fast.ai/setup/colab | bash" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Updating fastai...\n", | |
"Done.\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "epCpujMw7FxS", | |
"outputId": "fd81577f-4f69-4864-a0ea-bf1a491e68e0" | |
}, | |
"source": [ | |
"!pip freeze | grep torch\n", | |
"!pip freeze | grep fast" | |
], | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"torch==1.7.0+cu101\n", | |
"torch-xla==1.7\n", | |
"torchsummary==1.5.1\n", | |
"torchtext==0.3.1\n", | |
"torchvision==0.8.1+cu101\n", | |
"fastai==2.2.5\n", | |
"fastai-xla-extensions==0.0.8\n", | |
"fastcore==1.3.19\n", | |
"fastdtw==0.3.4\n", | |
"fastprogress==1.0.0\n", | |
"fastrlock==0.5\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "GBSZSW_rcojV", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "6df3c7ff-18a7-43e7-e14b-371758a5ad93" | |
}, | |
"source": [ | |
"from fastai.vision.all import *\n", | |
"from fastai_xla_extensions.all import *\n", | |
"# import torch_xla.core.xla_model as xm\n", | |
"# import torch_xla" | |
], | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"WARNING:root:TPU has started up successfully with version pytorch-1.7\n" | |
], | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "d_D86ji8TR7t" | |
}, | |
"source": [ | |
"path = untar_data(URLs.MNIST)\n", | |
"# path = untar_data(URLs.MNIST_TINY)" | |
], | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "m-8sge7vTXX-" | |
}, | |
"source": [ | |
"data = DataBlock(\n", | |
" blocks=(ImageBlock, CategoryBlock),\n", | |
" get_items=get_image_files,\n", | |
" get_y=parent_label,\n", | |
" splitter=GrandparentSplitter(valid_name='testing', train_name='training'),\n", | |
" item_tfms=Resize(28),\n", | |
" batch_tfms=[]\n", | |
")\n" | |
], | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "PsLQM9HiTymq" | |
}, | |
"source": [ | |
"dls = data.dataloaders(path, bs=64)" | |
], | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "-DwvUtHoCUhZ", | |
"outputId": "efbe1f59-2046-4ca7-d294-9ba0a19a804a" | |
}, | |
"source": [ | |
"len(dls.train.items), len(dls.train)" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(60000, 937)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 5 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "0Lfg2BUmT5Hr" | |
}, | |
"source": [ | |
"learner = cnn_learner(dls, resnet18, metrics=accuracy, concat_pool=False)" | |
], | |
"execution_count": 6, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 221 | |
}, | |
"id": "THxBBaYLYmM6", | |
"outputId": "0d882859-6649-47f4-e6bd-f961c7867e10" | |
}, | |
"source": [ | |
"learner.xla_fit(5, lr=3e-2)" | |
], | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"start fit\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: left;\">\n", | |
" <th>epoch</th>\n", | |
" <th>train_loss</th>\n", | |
" <th>valid_loss</th>\n", | |
" <th>accuracy</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td>0</td>\n", | |
" <td>0.194564</td>\n", | |
" <td>0.071696</td>\n", | |
" <td>0.977500</td>\n", | |
" <td>00:31</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>1</td>\n", | |
" <td>0.113245</td>\n", | |
" <td>0.135002</td>\n", | |
" <td>0.958900</td>\n", | |
" <td>00:29</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>2</td>\n", | |
" <td>0.072109</td>\n", | |
" <td>0.084309</td>\n", | |
" <td>0.972900</td>\n", | |
" <td>00:29</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>3</td>\n", | |
" <td>0.096696</td>\n", | |
" <td>0.226016</td>\n", | |
" <td>0.932500</td>\n", | |
" <td>00:29</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>4</td>\n", | |
" <td>0.083080</td>\n", | |
" <td>0.159650</td>\n", | |
" <td>0.954700</td>\n", | |
" <td>00:29</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "dAVij2ZpQZFa" | |
}, | |
"source": [ | |
"learner.unfreeze()" | |
], | |
"execution_count": 8, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "BElcfjk5njZL" | |
}, | |
"source": [ | |
"n_epoch = 2\n", | |
"num_cores = 8\n", | |
"\n", | |
"ctrl_args = learner.pre_xla_fit()\n", | |
"learner_args, add_args = learner.pack_learner_args()\n", | |
"fit_args={'lr_max':slice(1e-4)}\n", | |
"fit_args['n_epoch'] = n_epoch" | |
], | |
"execution_count": 9, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "nmZpxWherAKU" | |
}, | |
"source": [ | |
"import torch_xla.distributed.xla_multiprocessing as xmp" | |
], | |
"execution_count": 10, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 1000 | |
}, | |
"id": "pMkHmHiFqbIE", | |
"outputId": "eeae564f-2b92-4e77-b609-df7842398907" | |
}, | |
"source": [ | |
"xmp.spawn(xla_run_method,\n", | |
" args=(Learner.fit_one_cycle, learner_args, add_args, fit_args, ctrl_args),\n", | |
" nprocs=num_cores,\n", | |
" start_method='fork')" | |
], | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"start fit\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"\n", | |
" <div>\n", | |
" <style>\n", | |
" /* Turns off some styling */\n", | |
" progress {\n", | |
" /* gets rid of default border in Firefox and Opera. */\n", | |
" border: none;\n", | |
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n", | |
" background-size: auto;\n", | |
" }\n", | |
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n", | |
" background: #F44336;\n", | |
" }\n", | |
" </style>\n", | |
" <progress value='1' class='' max='2' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", | |
" 50.00% [1/2 00:40<00:40]\n", | |
" </div>\n", | |
" \n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: left;\">\n", | |
" <th>epoch</th>\n", | |
" <th>train_loss</th>\n", | |
" <th>valid_loss</th>\n", | |
" <th>accuracy</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td>0</td>\n", | |
" <td>0.058910</td>\n", | |
" <td>0.029479</td>\n", | |
" <td>0.990200</td>\n", | |
" <td>00:40</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table><p>\n", | |
"\n", | |
" <div>\n", | |
" <style>\n", | |
" /* Turns off some styling */\n", | |
" progress {\n", | |
" /* gets rid of default border in Firefox and Opera. */\n", | |
" border: none;\n", | |
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n", | |
" background-size: auto;\n", | |
" }\n", | |
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n", | |
" background: #F44336;\n", | |
" }\n", | |
" </style>\n", | |
" <progress value='59' class='' max='118' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", | |
" 50.00% [59/118 00:16<00:16 0.0564]\n", | |
" </div>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
}, | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Exception in device=TPU:7: index 3 is out of bounds for dimension 0 with size 3\n", | |
"Exception in device=TPU:2: index 3 is out of bounds for dimension 0 with size 3\n", | |
"Exception in device=TPU:3: index 3 is out of bounds for dimension 0 with size 3\n", | |
"Exception in device=TPU:5: index 3 is out of bounds for dimension 0 with size 3\n", | |
"Exception in device=TPU:4: index 3 is out of bounds for dimension 0 with size 3\n", | |
"Exception in device=TPU:1: index 3 is out of bounds for dimension 0 with size 3\n", | |
"Traceback (most recent call last):\n", | |
"Traceback (most recent call last):\n", | |
"Exception in device=TPU:6: index 3 is out of bounds for dimension 0 with size 3\n", | |
"Traceback (most recent call last):\n", | |
"Traceback (most recent call last):\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n", | |
" _start_fn(index, pf_cfg, fn, args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n", | |
" _start_fn(index, pf_cfg, fn, args)\n", | |
"Traceback (most recent call last):\n", | |
"Exception in device=TPU:0: index 3 is out of bounds for dimension 0 with size 3\n", | |
"Traceback (most recent call last):\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n", | |
" _start_fn(index, pf_cfg, fn, args)\n", | |
"Traceback (most recent call last):\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n", | |
" _start_fn(index, pf_cfg, fn, args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n", | |
" _start_fn(index, pf_cfg, fn, args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n", | |
" fn(gindex, *args)\n", | |
"Traceback (most recent call last):\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n", | |
" fn(gindex, *args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n", | |
" _start_fn(index, pf_cfg, fn, args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n", | |
" fn(gindex, *args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n", | |
" fn(gindex, *args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n", | |
" _start_fn(index, pf_cfg, fn, args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n", | |
" fn(gindex, *args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n", | |
" fit_method(learner, **fit_args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n", | |
" fit_method(learner, **fit_args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n", | |
" fn(gindex, *args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n", | |
" fit_method(learner, **fit_args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 330, in _mp_start_fn\n", | |
" _start_fn(index, pf_cfg, fn, args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n", | |
" fit_method(learner, **fit_args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n", | |
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n", | |
" fit_method(learner, **fit_args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n", | |
" fit_method(learner, **fit_args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n", | |
" fn(gindex, *args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n", | |
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n", | |
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n", | |
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n", | |
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\", line 324, in _start_fn\n", | |
" fn(gindex, *args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n", | |
" fit_method(learner, **fit_args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n", | |
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n", | |
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n", | |
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n", | |
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n", | |
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai_xla_extensions/multi_core/learner.py\", line 56, in xla_run_method\n", | |
" fit_method(learner, **fit_args)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n", | |
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n", | |
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n", | |
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n", | |
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py\", line 112, in fit_one_cycle\n", | |
" self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n", | |
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n", | |
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n", | |
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n", | |
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 211, in fit\n", | |
" self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n", | |
" self._do_epoch_train()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n", | |
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n", | |
" self._do_epoch_train()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n", | |
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n", | |
" self._do_epoch_train()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n", | |
" self._with_events(self.all_batches, 'train', CancelTrainException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n", | |
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n", | |
" self._do_epoch_train()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n", | |
" self._with_events(self.all_batches, 'train', CancelTrainException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 202, in _do_fit\n", | |
" self._with_events(self._do_epoch, 'epoch', CancelEpochException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n", | |
" self._with_events(self.all_batches, 'train', CancelTrainException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n", | |
" self._do_epoch_train()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n", | |
" self._with_events(self.all_batches, 'train', CancelTrainException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n", | |
" self._do_epoch_train()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n", | |
" for o in enumerate(self.dl): self.one_batch(*o)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n", | |
" self._do_epoch_train()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n", | |
" self._with_events(self.all_batches, 'train', CancelTrainException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n", | |
" for o in enumerate(self.dl): self.one_batch(*o)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 196, in _do_epoch\n", | |
" self._do_epoch_train()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n", | |
" self._with_events(self.all_batches, 'train', CancelTrainException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n", | |
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n", | |
" self._with_events(self.all_batches, 'train', CancelTrainException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n", | |
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n", | |
" for o in enumerate(self.dl): self.one_batch(*o)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n", | |
" for o in enumerate(self.dl): self.one_batch(*o)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 188, in _do_epoch_train\n", | |
" self._with_events(self.all_batches, 'train', CancelTrainException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n", | |
" def __call__(self, event_name): L(event_name).map(self._call_one)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n", | |
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n", | |
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n", | |
" def __call__(self, event_name): L(event_name).map(self._call_one)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n", | |
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n", | |
" for o in enumerate(self.dl): self.one_batch(*o)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n", | |
" for o in enumerate(self.dl): self.one_batch(*o)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n", | |
" for o in enumerate(self.dl): self.one_batch(*o)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 166, in all_batches\n", | |
" for o in enumerate(self.dl): self.one_batch(*o)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n", | |
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n", | |
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n", | |
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n", | |
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n", | |
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n", | |
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n", | |
" def __call__(self, event_name): L(event_name).map(self._call_one)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n", | |
" def __call__(self, event_name): L(event_name).map(self._call_one)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 184, in one_batch\n", | |
" self._with_events(self._do_one_batch, 'batch', CancelBatchException)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n", | |
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n", | |
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n", | |
" def __call__(self, event_name): L(event_name).map(self._call_one)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 160, in _with_events\n", | |
" try: self(f'before_{event_type}'); f()\n", | |
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n", | |
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n", | |
" def __call__(self, event_name): L(event_name).map(self._call_one)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n", | |
" def __call__(self, event_name): L(event_name).map(self._call_one)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n", | |
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/learner.py\", line 141, in __call__\n", | |
" def __call__(self, event_name): L(event_name).map(self._call_one)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n", | |
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n", | |
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastcore/foundation.py\", line 154, in map\n", | |
" def map(self, f, *args, gen=False, **kwargs): return self._new(map_ex(self, f, *args, gen=gen, **kwargs))\n", | |
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n", | |
"Exception in thread Thread-8:\n", | |
"Traceback (most recent call last):\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 872, in _try_get_data\n", | |
" data = self._data_queue.get(timeout=timeout)\n", | |
" File \"/usr/lib/python3.6/multiprocessing/queues.py\", line 113, in get\n", | |
" return _ForkingPickler.loads(res)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch/multiprocessing/reductions.py\", line 282, in rebuild_storage_fd\n", | |
" fd = df.detach()\n", | |
" File \"/usr/lib/python3.6/multiprocessing/resource_sharer.py\", line 57, in detach\n", | |
" with _resource_sharer.get_connection(self._id) as conn:\n", | |
" File \"/usr/lib/python3.6/multiprocessing/resource_sharer.py\", line 87, in get_connection\n", | |
" c = Client(address, authkey=process.current_process().authkey)\n", | |
" File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 493, in Client\n", | |
" answer_challenge(c, authkey)\n", | |
" File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 732, in answer_challenge\n", | |
" message = connection.recv_bytes(256) # reject large message\n", | |
" File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 216, in recv_bytes\n", | |
" buf = self._recv_bytes(maxlength)\n", | |
" File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 407, in _recv_bytes\n", | |
" buf = self._recv(4)\n", | |
" File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 379, in _recv\n", | |
" chunk = read(handle, remaining)\n", | |
"ConnectionResetError: [Errno 104] Connection reset by peer\n", | |
"\n", | |
"The above exception was the direct cause of the following exception:\n", | |
"\n", | |
"Traceback (most recent call last):\n", | |
" File \"/usr/lib/python3.6/threading.py\", line 916, in _bootstrap_inner\n", | |
" self.run()\n", | |
" File \"/usr/lib/python3.6/threading.py\", line 864, in run\n", | |
" self._target(*self._args, **self._kwargs)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/parallel_loader.py\", line 141, in _loader_worker\n", | |
" _, data = next(data_iter)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/data/load.py\", line 101, in __iter__\n", | |
" for b in _loaders[self.fake_l.num_workers==0](self.fake_l):\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 435, in __next__\n", | |
" data = self._next_data()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 1068, in _next_data\n", | |
" idx, data = self._get_data()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 1034, in _get_data\n", | |
" success, data = self._try_get_data()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 885, in _try_get_data\n", | |
" raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e\n", | |
"RuntimeError: DataLoader worker (pid(s) 28531, 28543, 28576, 28675, 28687) exited unexpectedly\n", | |
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n", | |
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n", | |
"IndexError: index 3 is out of bounds for dimension 0 with size 3\n", | |
"\n", | |
"Exception in thread Thread-8:\n", | |
"Traceback (most recent call last):\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 872, in _try_get_data\n", | |
" data = self._data_queue.get(timeout=timeout)\n", | |
" File \"/usr/lib/python3.6/multiprocessing/queues.py\", line 113, in get\n", | |
" return _ForkingPickler.loads(res)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch/multiprocessing/reductions.py\", line 282, in rebuild_storage_fd\n", | |
" fd = df.detach()\n", | |
" File \"/usr/lib/python3.6/multiprocessing/resource_sharer.py\", line 57, in detach\n", | |
" with _resource_sharer.get_connection(self._id) as conn:\n", | |
" File \"/usr/lib/python3.6/multiprocessing/resource_sharer.py\", line 87, in get_connection\n", | |
" c = Client(address, authkey=process.current_process().authkey)\n", | |
" File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 487, in Client\n", | |
" c = SocketClient(address)\n", | |
" File \"/usr/lib/python3.6/multiprocessing/connection.py\", line 614, in SocketClient\n", | |
" s.connect(address)\n", | |
"ConnectionRefusedError: [Errno 111] Connection refused\n", | |
"\n", | |
"The above exception was the direct cause of the following exception:\n", | |
"\n", | |
"Traceback (most recent call last):\n", | |
" File \"/usr/lib/python3.6/threading.py\", line 916, in _bootstrap_inner\n", | |
" self.run()\n", | |
" File \"/usr/lib/python3.6/threading.py\", line 864, in run\n", | |
" self._target(*self._args, **self._kwargs)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/parallel_loader.py\", line 141, in _loader_worker\n", | |
" _, data = next(data_iter)\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/fastai/data/load.py\", line 101, in __iter__\n", | |
" for b in _loaders[self.fake_l.num_workers==0](self.fake_l):\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 435, in __next__\n", | |
" data = self._next_data()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 1068, in _next_data\n", | |
" idx, data = self._get_data()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 1034, in _get_data\n", | |
" success, data = self._try_get_data()\n", | |
" File \"/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py\", line 885, in _try_get_data\n", | |
" raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e\n", | |
"RuntimeError: DataLoader worker (pid(s) 28571, 28579, 28596, 28619, 28665, 28677, 28684, 28692, 28703, 28738, 28768, 28777, 28785, 28797, 28805) exited unexpectedly\n" | |
], | |
"name": "stderr" | |
}, | |
{ | |
"output_type": "error", | |
"ename": "Exception", | |
"evalue": "ignored", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mException\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-11-7ef8d746365d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mLearner\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_one_cycle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlearner_args\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0madd_args\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfit_args\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mctrl_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mnprocs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnum_cores\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m start_method='fork')\n\u001b[0m", | |
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch_xla/distributed/xla_multiprocessing.py\u001b[0m in \u001b[0;36mspawn\u001b[0;34m(fn, args, nprocs, join, daemon, start_method)\u001b[0m\n\u001b[1;32m 393\u001b[0m \u001b[0mjoin\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 394\u001b[0m \u001b[0mdaemon\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdaemon\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 395\u001b[0;31m start_method=start_method)\n\u001b[0m\u001b[1;32m 396\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 397\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/multiprocessing/spawn.py\u001b[0m in \u001b[0;36mstart_processes\u001b[0;34m(fn, args, nprocs, join, daemon, start_method)\u001b[0m\n\u001b[1;32m 155\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[0;31m# Loop on join until it returns True or raises an exception.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 157\u001b[0;31m \u001b[0;32mwhile\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mcontext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 158\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/multiprocessing/spawn.py\u001b[0m in \u001b[0;36mjoin\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 110\u001b[0m raise Exception(\n\u001b[1;32m 111\u001b[0m \u001b[0;34m\"process %d terminated with exit code %d\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 112\u001b[0;31m \u001b[0;34m(\u001b[0m\u001b[0merror_index\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexitcode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 113\u001b[0m )\n\u001b[1;32m 114\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mException\u001b[0m: process 7 terminated with exit code 17" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "66CdLIQQQdE6" | |
}, | |
"source": [ | |
"# learner.xla_fit_one_cycle(2, lr_max=slice(1e-4), num_cores=8)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "LcPCdD_Qgwtu" | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment