Last active
May 9, 2023 09:10
-
-
Save muellerzr/72155ad00fd83c20dab9173a5ce8b79b to your computer and use it in GitHub Desktop.
Intel XPU issue T4 Colab
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/muellerzr/72155ad00fd83c20dab9173a5ce8b79b/scratchpad.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!nvidia-smi" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "hfi0P8MKwPdp", | |
"outputId": "0d58d853-805c-4509-9c17-955289e50ba0" | |
}, | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Tue May 9 09:04:25 2023 \n", | |
"+-----------------------------------------------------------------------------+\n", | |
"| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 |\n", | |
"|-------------------------------+----------------------+----------------------+\n", | |
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", | |
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", | |
"| | | MIG M. |\n", | |
"|===============================+======================+======================|\n", | |
"| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", | |
"| N/A 54C P8 10W / 70W | 0MiB / 15360MiB | 0% Default |\n", | |
"| | | N/A |\n", | |
"+-------------------------------+----------------------+----------------------+\n", | |
" \n", | |
"+-----------------------------------------------------------------------------+\n", | |
"| Processes: |\n", | |
"| GPU GI CI PID Type Process name GPU Memory |\n", | |
"| ID ID Usage |\n", | |
"|=============================================================================|\n", | |
"| No running processes found |\n", | |
"+-----------------------------------------------------------------------------+\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "lIYdn1woOS1n" | |
}, | |
"outputs": [], | |
"source": [ | |
"!git clone https://github.com/abhilash1910/accelerate\n", | |
"%cd accelerate\n", | |
"!pip install -e .[testing]\n", | |
"!pip install pytest" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!wget https://gist.githubusercontent.com/muellerzr/d1e03af04d9e01cf118869b047ab492d/raw/3ac01a30eff4d64e8d9693b4600660ac160a42ec/checkpointing.py -O examples/by_feature/checkpointing.py" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Z35ua98fwWiw", | |
"outputId": "9f948d79-4709-4462-cd6b-b9f75ac83ea5" | |
}, | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"--2023-05-09 09:06:33-- https://gist.githubusercontent.com/muellerzr/d1e03af04d9e01cf118869b047ab492d/raw/3ac01a30eff4d64e8d9693b4600660ac160a42ec/checkpointing.py\n", | |
"Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", | |
"Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.108.133|:443... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 13437 (13K) [text/plain]\n", | |
"Saving to: ‘examples/by_feature/checkpointing.py’\n", | |
"\n", | |
"\r examples/ 0%[ ] 0 --.-KB/s \rexamples/by_feature 100%[===================>] 13.12K --.-KB/s in 0s \n", | |
"\n", | |
"2023-05-09 09:06:33 (103 MB/s) - ‘examples/by_feature/checkpointing.py’ saved [13437/13437]\n", | |
"\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pytest -sv tests/test_examples.py::FeatureExamplesTests::test_checkpointing_by_steps" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "3J8ackkjwyf4", | |
"outputId": "d3d16448-320b-4f1c-c1c1-d8915b6dd440" | |
}, | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"\u001b[1m============================= test session starts ==============================\u001b[0m\n", | |
"platform linux -- Python 3.10.11, pytest-7.2.2, pluggy-1.0.0 -- /usr/bin/python3\n", | |
"cachedir: .pytest_cache\n", | |
"rootdir: /content/accelerate\n", | |
"plugins: xdist-3.2.1, subtests-0.10.0, anyio-3.6.2\n", | |
"\u001b[1mcollecting ... \u001b[0m2023-05-09 09:09:06.165794: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", | |
"collected 1 item \u001b[0m\n", | |
"\n", | |
"tests/test_examples.py::FeatureExamplesTests::test_checkpointing_by_steps \u001b[31mFAILED\u001b[0m\n", | |
"\n", | |
"=================================== FAILURES ===================================\n", | |
"\u001b[31m\u001b[1m_______________ FeatureExamplesTests.test_checkpointing_by_steps _______________\u001b[0m\n", | |
"\n", | |
"command = ['accelerate', 'launch', '--config_file', '/tmp/tmp_s5b6aj5/default_config.yml', 'examples/by_feature/checkpointing.py', '--checkpointing_steps', ...]\n", | |
"return_stdout = False\n", | |
"\n", | |
" \u001b[94mdef\u001b[39;49;00m \u001b[92mrun_command\u001b[39;49;00m(command: List[\u001b[96mstr\u001b[39;49;00m], return_stdout=\u001b[94mFalse\u001b[39;49;00m):\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m \u001b[39;49;00m\u001b[33m\"\"\"\u001b[39;49;00m\n", | |
" \u001b[33m Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture\u001b[39;49;00m\n", | |
" \u001b[33m if an error occured while running `command`\u001b[39;49;00m\n", | |
" \u001b[33m \"\"\"\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mtry\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n", | |
"> output = subprocess.check_output(command, stderr=subprocess.STDOUT)\u001b[90m\u001b[39;49;00m\n", | |
"\n", | |
"\u001b[1m\u001b[31msrc/accelerate/test_utils/testing.py\u001b[0m:401: \n", | |
"_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n", | |
"\n", | |
"timeout = None\n", | |
"popenargs = (['accelerate', 'launch', '--config_file', '/tmp/tmp_s5b6aj5/default_config.yml', 'examples/by_feature/checkpointing.py', '--checkpointing_steps', ...],)\n", | |
"kwargs = {'stderr': -2}\n", | |
"\n", | |
" \u001b[94mdef\u001b[39;49;00m \u001b[92mcheck_output\u001b[39;49;00m(*popenargs, timeout=\u001b[94mNone\u001b[39;49;00m, **kwargs):\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m \u001b[39;49;00m\u001b[33mr\u001b[39;49;00m\u001b[33m\"\"\"Run command with arguments and return its output.\u001b[39;49;00m\n", | |
" \u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m If the exit code was non-zero it raises a CalledProcessError. The\u001b[39;49;00m\n", | |
" \u001b[33m CalledProcessError object will have the return code in the returncode\u001b[39;49;00m\n", | |
" \u001b[33m attribute and output in the output attribute.\u001b[39;49;00m\n", | |
" \u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m The arguments are the same as for the Popen constructor. Example:\u001b[39;49;00m\n", | |
" \u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m >>> check_output([\"ls\", \"-l\", \"/dev/null\"])\u001b[39;49;00m\n", | |
" \u001b[33m b'crw-rw-rw- 1 root root 1, 3 Oct 18 2007 /dev/null\\n'\u001b[39;49;00m\n", | |
" \u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m The stdout argument is not allowed as it is used internally.\u001b[39;49;00m\n", | |
" \u001b[33m To capture standard error in the result, use stderr=STDOUT.\u001b[39;49;00m\n", | |
" \u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m >>> check_output([\"/bin/sh\", \"-c\",\u001b[39;49;00m\n", | |
" \u001b[33m ... \"ls -l non_existent_file ; exit 0\"],\u001b[39;49;00m\n", | |
" \u001b[33m ... stderr=STDOUT)\u001b[39;49;00m\n", | |
" \u001b[33m b'ls: non_existent_file: No such file or directory\\n'\u001b[39;49;00m\n", | |
" \u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m There is an additional optional argument, \"input\", allowing you to\u001b[39;49;00m\n", | |
" \u001b[33m pass a string to the subprocess's stdin. If you use this argument\u001b[39;49;00m\n", | |
" \u001b[33m you may not also use the Popen constructor's \"stdin\" argument, as\u001b[39;49;00m\n", | |
" \u001b[33m it too will be used internally. Example:\u001b[39;49;00m\n", | |
" \u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m >>> check_output([\"sed\", \"-e\", \"s/foo/bar/\"],\u001b[39;49;00m\n", | |
" \u001b[33m ... input=b\"when in the course of fooman events\\n\")\u001b[39;49;00m\n", | |
" \u001b[33m b'when in the course of barman events\\n'\u001b[39;49;00m\n", | |
" \u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m By default, all communication is in bytes, and therefore any \"input\"\u001b[39;49;00m\n", | |
" \u001b[33m should be bytes, and the return value will be bytes. If in text mode,\u001b[39;49;00m\n", | |
" \u001b[33m any \"input\" should be a string, and the return value will be a string\u001b[39;49;00m\n", | |
" \u001b[33m decoded according to locale encoding, or by \"encoding\" if set. Text mode\u001b[39;49;00m\n", | |
" \u001b[33m is triggered by setting any of text, encoding, errors or universal_newlines.\u001b[39;49;00m\n", | |
" \u001b[33m \"\"\"\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mif\u001b[39;49;00m \u001b[33m'\u001b[39;49;00m\u001b[33mstdout\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m \u001b[95min\u001b[39;49;00m kwargs:\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mraise\u001b[39;49;00m \u001b[96mValueError\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mstdout argument not allowed, it will be overridden.\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mif\u001b[39;49;00m \u001b[33m'\u001b[39;49;00m\u001b[33minput\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m \u001b[95min\u001b[39;49;00m kwargs \u001b[95mand\u001b[39;49;00m kwargs[\u001b[33m'\u001b[39;49;00m\u001b[33minput\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m] \u001b[95mis\u001b[39;49;00m \u001b[94mNone\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m# Explicitly passing input=None was previously equivalent to passing an\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m# empty string. That is maintained here for backwards compatibility.\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mif\u001b[39;49;00m kwargs.get(\u001b[33m'\u001b[39;49;00m\u001b[33muniversal_newlines\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[95mor\u001b[39;49;00m kwargs.get(\u001b[33m'\u001b[39;49;00m\u001b[33mtext\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[95mor\u001b[39;49;00m kwargs.get(\u001b[33m'\u001b[39;49;00m\u001b[33mencoding\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \\\n", | |
" \u001b[95mor\u001b[39;49;00m kwargs.get(\u001b[33m'\u001b[39;49;00m\u001b[33merrors\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m):\u001b[90m\u001b[39;49;00m\n", | |
" empty = \u001b[33m'\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94melse\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n", | |
" empty = \u001b[33mb\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" kwargs[\u001b[33m'\u001b[39;49;00m\u001b[33minput\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m] = empty\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m\u001b[39;49;00m\n", | |
"> \u001b[94mreturn\u001b[39;49;00m run(*popenargs, stdout=PIPE, timeout=timeout, check=\u001b[94mTrue\u001b[39;49;00m,\u001b[90m\u001b[39;49;00m\n", | |
" **kwargs).stdout\u001b[90m\u001b[39;49;00m\n", | |
"\n", | |
"\u001b[1m\u001b[31m/usr/lib/python3.10/subprocess.py\u001b[0m:421: \n", | |
"_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n", | |
"\n", | |
"input = None, capture_output = False, timeout = None, check = True\n", | |
"popenargs = (['accelerate', 'launch', '--config_file', '/tmp/tmp_s5b6aj5/default_config.yml', 'examples/by_feature/checkpointing.py', '--checkpointing_steps', ...],)\n", | |
"kwargs = {'stderr': -2, 'stdout': -1}\n", | |
"process = <Popen: returncode: 1 args: ['accelerate', 'launch', '--config_file', '/tmp/...>\n", | |
"stdout = b'2023-05-09 09:09:11.875999: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find T...py\\', \\'--checkpointing_steps\\', \\'1\\', \\n\\'--output_dir\\', \\'/tmp/tmpg9wnngc2\\']\\' returned non-zero exit status 1.\\n'\n", | |
"stderr = None, retcode = 1\n", | |
"\n", | |
" \u001b[94mdef\u001b[39;49;00m \u001b[92mrun\u001b[39;49;00m(*popenargs,\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[96minput\u001b[39;49;00m=\u001b[94mNone\u001b[39;49;00m, capture_output=\u001b[94mFalse\u001b[39;49;00m, timeout=\u001b[94mNone\u001b[39;49;00m, check=\u001b[94mFalse\u001b[39;49;00m, **kwargs):\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m \u001b[39;49;00m\u001b[33m\"\"\"Run command with arguments and return a CompletedProcess instance.\u001b[39;49;00m\n", | |
" \u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m The returned instance will have attributes args, returncode, stdout and\u001b[39;49;00m\n", | |
" \u001b[33m stderr. By default, stdout and stderr are not captured, and those attributes\u001b[39;49;00m\n", | |
" \u001b[33m will be None. Pass stdout=PIPE and/or stderr=PIPE in order to capture them,\u001b[39;49;00m\n", | |
" \u001b[33m or pass capture_output=True to capture both.\u001b[39;49;00m\n", | |
" \u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m If check is True and the exit code was non-zero, it raises a\u001b[39;49;00m\n", | |
" \u001b[33m CalledProcessError. The CalledProcessError object will have the return code\u001b[39;49;00m\n", | |
" \u001b[33m in the returncode attribute, and output & stderr attributes if those streams\u001b[39;49;00m\n", | |
" \u001b[33m were captured.\u001b[39;49;00m\n", | |
" \u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m If timeout is given, and the process takes too long, a TimeoutExpired\u001b[39;49;00m\n", | |
" \u001b[33m exception will be raised.\u001b[39;49;00m\n", | |
" \u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m There is an optional argument \"input\", allowing you to\u001b[39;49;00m\n", | |
" \u001b[33m pass bytes or a string to the subprocess's stdin. If you use this argument\u001b[39;49;00m\n", | |
" \u001b[33m you may not also use the Popen constructor's \"stdin\" argument, as\u001b[39;49;00m\n", | |
" \u001b[33m it will be used internally.\u001b[39;49;00m\n", | |
" \u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m By default, all communication is in bytes, and therefore any \"input\" should\u001b[39;49;00m\n", | |
" \u001b[33m be bytes, and the stdout and stderr will be bytes. If in text mode, any\u001b[39;49;00m\n", | |
" \u001b[33m \"input\" should be a string, and stdout and stderr will be strings decoded\u001b[39;49;00m\n", | |
" \u001b[33m according to locale encoding, or by \"encoding\" if set. Text mode is\u001b[39;49;00m\n", | |
" \u001b[33m triggered by setting any of text, encoding, errors or universal_newlines.\u001b[39;49;00m\n", | |
" \u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m The other arguments are the same as for the Popen constructor.\u001b[39;49;00m\n", | |
" \u001b[33m \"\"\"\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mif\u001b[39;49;00m \u001b[96minput\u001b[39;49;00m \u001b[95mis\u001b[39;49;00m \u001b[95mnot\u001b[39;49;00m \u001b[94mNone\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mif\u001b[39;49;00m kwargs.get(\u001b[33m'\u001b[39;49;00m\u001b[33mstdin\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[95mis\u001b[39;49;00m \u001b[95mnot\u001b[39;49;00m \u001b[94mNone\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mraise\u001b[39;49;00m \u001b[96mValueError\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mstdin and input arguments may not both be used.\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\u001b[90m\u001b[39;49;00m\n", | |
" kwargs[\u001b[33m'\u001b[39;49;00m\u001b[33mstdin\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m] = PIPE\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mif\u001b[39;49;00m capture_output:\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mif\u001b[39;49;00m kwargs.get(\u001b[33m'\u001b[39;49;00m\u001b[33mstdout\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[95mis\u001b[39;49;00m \u001b[95mnot\u001b[39;49;00m \u001b[94mNone\u001b[39;49;00m \u001b[95mor\u001b[39;49;00m kwargs.get(\u001b[33m'\u001b[39;49;00m\u001b[33mstderr\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[95mis\u001b[39;49;00m \u001b[95mnot\u001b[39;49;00m \u001b[94mNone\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mraise\u001b[39;49;00m \u001b[96mValueError\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mstdout and stderr arguments may not be used \u001b[39;49;00m\u001b[33m'\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[33m'\u001b[39;49;00m\u001b[33mwith capture_output.\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\u001b[90m\u001b[39;49;00m\n", | |
" kwargs[\u001b[33m'\u001b[39;49;00m\u001b[33mstdout\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m] = PIPE\u001b[90m\u001b[39;49;00m\n", | |
" kwargs[\u001b[33m'\u001b[39;49;00m\u001b[33mstderr\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m] = PIPE\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mwith\u001b[39;49;00m Popen(*popenargs, **kwargs) \u001b[94mas\u001b[39;49;00m process:\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mtry\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n", | |
" stdout, stderr = process.communicate(\u001b[96minput\u001b[39;49;00m, timeout=timeout)\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mexcept\u001b[39;49;00m TimeoutExpired \u001b[94mas\u001b[39;49;00m exc:\u001b[90m\u001b[39;49;00m\n", | |
" process.kill()\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mif\u001b[39;49;00m _mswindows:\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m# Windows accumulates the output in a single blocking\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m# read() call run on child threads, with the timeout\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m# being done in a join() on those threads. communicate()\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m# _after_ kill() is required to collect that and add it\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m# to the exception.\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" exc.stdout, exc.stderr = process.communicate()\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94melse\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m# POSIX _communicate already populated the output so\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m# far into the TimeoutExpired exception.\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" process.wait()\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mraise\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mexcept\u001b[39;49;00m: \u001b[90m# Including KeyboardInterrupt, communicate handled that.\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" process.kill()\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m# We don't call process.wait() as .__exit__ does that for us.\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mraise\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" retcode = process.poll()\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mif\u001b[39;49;00m check \u001b[95mand\u001b[39;49;00m retcode:\u001b[90m\u001b[39;49;00m\n", | |
"> \u001b[94mraise\u001b[39;49;00m CalledProcessError(retcode, process.args,\u001b[90m\u001b[39;49;00m\n", | |
" output=stdout, stderr=stderr)\u001b[90m\u001b[39;49;00m\n", | |
"\u001b[1m\u001b[31mE subprocess.CalledProcessError: Command '['accelerate', 'launch', '--config_file', '/tmp/tmp_s5b6aj5/default_config.yml', 'examples/by_feature/checkpointing.py', '--checkpointing_steps', '1', '--output_dir', '/tmp/tmpg9wnngc2']' returned non-zero exit status 1.\u001b[0m\n", | |
"\n", | |
"\u001b[1m\u001b[31m/usr/lib/python3.10/subprocess.py\u001b[0m:526: CalledProcessError\n", | |
"\n", | |
"\u001b[33mThe above exception was the direct cause of the following exception:\u001b[0m\n", | |
"\n", | |
"self = <test_examples.FeatureExamplesTests testMethod=test_checkpointing_by_steps>\n", | |
"\n", | |
" \u001b[94mdef\u001b[39;49;00m \u001b[92mtest_checkpointing_by_steps\u001b[39;49;00m(\u001b[96mself\u001b[39;49;00m):\u001b[90m\u001b[39;49;00m\n", | |
" testargs = \u001b[33mf\u001b[39;49;00m\u001b[33m\"\"\"\u001b[39;49;00m\u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m examples/by_feature/checkpointing.py\u001b[39;49;00m\u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m --checkpointing_steps 1\u001b[39;49;00m\u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m --output_dir \u001b[39;49;00m\u001b[33m{\u001b[39;49;00m\u001b[96mself\u001b[39;49;00m.tmpdir\u001b[33m}\u001b[39;49;00m\u001b[33m\u001b[39;49;00m\n", | |
" \u001b[33m \u001b[39;49;00m\u001b[33m\"\"\"\u001b[39;49;00m.split()\u001b[90m\u001b[39;49;00m\n", | |
"> _ = run_command(\u001b[96mself\u001b[39;49;00m._launch_args + testargs)\u001b[90m\u001b[39;49;00m\n", | |
"\n", | |
"\u001b[1m\u001b[31mtests/test_examples.py\u001b[0m:158: \n", | |
"_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n", | |
"\n", | |
"command = ['accelerate', 'launch', '--config_file', '/tmp/tmp_s5b6aj5/default_config.yml', 'examples/by_feature/checkpointing.py', '--checkpointing_steps', ...]\n", | |
"return_stdout = False\n", | |
"\n", | |
" \u001b[94mdef\u001b[39;49;00m \u001b[92mrun_command\u001b[39;49;00m(command: List[\u001b[96mstr\u001b[39;49;00m], return_stdout=\u001b[94mFalse\u001b[39;49;00m):\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[90m \u001b[39;49;00m\u001b[33m\"\"\"\u001b[39;49;00m\n", | |
" \u001b[33m Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture\u001b[39;49;00m\n", | |
" \u001b[33m if an error occured while running `command`\u001b[39;49;00m\n", | |
" \u001b[33m \"\"\"\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mtry\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n", | |
" output = subprocess.check_output(command, stderr=subprocess.STDOUT)\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mif\u001b[39;49;00m return_stdout:\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mif\u001b[39;49;00m \u001b[96mhasattr\u001b[39;49;00m(output, \u001b[33m\"\u001b[39;49;00m\u001b[33mdecode\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m):\u001b[90m\u001b[39;49;00m\n", | |
" output = output.decode(\u001b[33m\"\u001b[39;49;00m\u001b[33mutf-8\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mreturn\u001b[39;49;00m output\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[94mexcept\u001b[39;49;00m subprocess.CalledProcessError \u001b[94mas\u001b[39;49;00m e:\u001b[90m\u001b[39;49;00m\n", | |
"> \u001b[94mraise\u001b[39;49;00m SubprocessCallException(\u001b[90m\u001b[39;49;00m\n", | |
" \u001b[33mf\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\u001b[33mCommand `\u001b[39;49;00m\u001b[33m{\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m\u001b[33m \u001b[39;49;00m\u001b[33m'\u001b[39;49;00m.join(command)\u001b[33m}\u001b[39;49;00m\u001b[33m` failed with the following error:\u001b[39;49;00m\u001b[33m\\n\u001b[39;49;00m\u001b[33m\\n\u001b[39;49;00m\u001b[33m{\u001b[39;49;00me.output.decode()\u001b[33m}\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
" ) \u001b[94mfrom\u001b[39;49;00m \u001b[04m\u001b[96me\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n", | |
"\u001b[1m\u001b[31mE accelerate.test_utils.testing.SubprocessCallException: Command `accelerate launch --config_file /tmp/tmp_s5b6aj5/default_config.yml examples/by_feature/checkpointing.py --checkpointing_steps 1 --output_dir /tmp/tmpg9wnngc2` failed with the following error:\u001b[0m\n", | |
"\u001b[1m\u001b[31mE \u001b[0m\n", | |
"\u001b[1m\u001b[31mE 2023-05-09 09:09:11.875999: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\u001b[0m\n", | |
"\u001b[1m\u001b[31mE [09:09:12] WARNING The following values were not passed to launch.py:900\u001b[0m\n", | |
"\u001b[1m\u001b[31mE `accelerate launch` and had defaults used \u001b[0m\n", | |
"\u001b[1m\u001b[31mE instead: \u001b[0m\n", | |
"\u001b[1m\u001b[31mE `--num_cpu_threads_per_process` was \u001b[0m\n", | |
"\u001b[1m\u001b[31mE set to `4` to improve out-of-box performance \u001b[0m\n", | |
"\u001b[1m\u001b[31mE when training on CPUs \u001b[0m\n", | |
"\u001b[1m\u001b[31mE To avoid this warning pass in values for each \u001b[0m\n", | |
"\u001b[1m\u001b[31mE of the problematic parameters or run \u001b[0m\n", | |
"\u001b[1m\u001b[31mE `accelerate config`. \u001b[0m\n", | |
"\u001b[1m\u001b[31mE 2023-05-09 09:09:16.937243: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\u001b[0m\n", | |
"\u001b[1m\u001b[31mE ╭───────────────────── Traceback (most recent call last) ──────────────────────╮\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ /content/accelerate/examples/by_feature/checkpointing.py:316 in <module> │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 313 │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 314 │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 315 if __name__ == \"__main__\": │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ ❱ 316 │ main() │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 317 │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ /content/accelerate/examples/by_feature/checkpointing.py:312 in main │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 309 │ ) │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 310 │ args = parser.parse_args() │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 311 │ config = {\"lr\": 2e-5, \"num_epochs\": 3, \"seed\": 42, \"batch_size\": 1 │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ ❱ 312 │ training_function(config, args) │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 313 │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 314 │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 315 if __name__ == \"__main__\": │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ /content/accelerate/examples/by_feature/checkpointing.py:130 in │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ training_function │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 127 │ │ config[\"num_epochs\"] = 2 │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 128 │ # Initialize accelerator │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 129 │ accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ ❱ 130 │ assert accelerator.device.type == \"cuda\", f'Device: {accelerator.d │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 131 │ # Sample hyper-parameters for learning rate, batch size, seed and │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 132 │ lr = config[\"lr\"] │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 133 │ num_epochs = int(config[\"num_epochs\"]) │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE ╰──────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", | |
"\u001b[1m\u001b[31mE AssertionError: Device: cpu, type: cpu\u001b[0m\n", | |
"\u001b[1m\u001b[31mE ╭───────────────────── Traceback (most recent call last) ──────────────────────╮\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ /usr/local/bin/accelerate:8 in <module> │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 5 from accelerate.commands.accelerate_cli import main │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 6 if __name__ == '__main__': │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 7 │ sys.argv[0] = re.sub(r'(-script\\.pyw|\\.exe)?$', '', sys.argv[0]) │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ ❱ 8 │ sys.exit(main()) │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 9 │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ /content/accelerate/src/accelerate/commands/accelerate_cli.py:45 in main │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 42 │ │ exit(1) │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 43 │ │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 44 │ # Run │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ ❱ 45 │ args.func(args) │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 46 │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 47 │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 48 if __name__ == \"__main__\": │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ /content/accelerate/src/accelerate/commands/launch.py:928 in launch_command │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 925 │ elif defaults is not None and defaults.compute_environment == Comp │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 926 │ │ sagemaker_launcher(defaults, args) │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 927 │ else: │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ ❱ 928 │ │ simple_launcher(args) │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 929 │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 930 │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 931 def main(): │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ /content/accelerate/src/accelerate/commands/launch.py:588 in simple_launcher │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 585 │ process.wait() │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 586 │ if process.returncode != 0: │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 587 │ │ if not args.quiet: │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ ❱ 588 │ │ │ raise subprocess.CalledProcessError(returncode=process.ret │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 589 │ │ else: │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 590 │ │ │ sys.exit(1) │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE │ 591 │\u001b[0m\n", | |
"\u001b[1m\u001b[31mE ╰──────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", | |
"\u001b[1m\u001b[31mE CalledProcessError: Command '['/usr/bin/python3', \u001b[0m\n", | |
"\u001b[1m\u001b[31mE 'examples/by_feature/checkpointing.py', '--checkpointing_steps', '1', \u001b[0m\n", | |
"\u001b[1m\u001b[31mE '--output_dir', '/tmp/tmpg9wnngc2']' returned non-zero exit status 1.\u001b[0m\n", | |
"\n", | |
"\u001b[1m\u001b[31msrc/accelerate/test_utils/testing.py\u001b[0m:407: SubprocessCallException\n", | |
"\u001b[36m\u001b[1m=========================== short test summary info ============================\u001b[0m\n", | |
"\u001b[31mFAILED\u001b[0m tests/test_examples.py::\u001b[1mFeatureExamplesTests::test_checkpointing_by_steps\u001b[0m - accelerate.test_utils.testing.SubprocessCallException: Command `accelerate ...\n", | |
"\u001b[31m============================== \u001b[31m\u001b[1m1 failed\u001b[0m\u001b[31m in 22.06s\u001b[0m\u001b[31m ==============================\u001b[0m\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "kQUP_EBRw-wc" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"colab": { | |
"name": "scratchpad", | |
"provenance": [], | |
"machine_shape": "hm", | |
"gpuType": "T4", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"name": "python3" | |
}, | |
"accelerator": "GPU", | |
"gpuClass": "standard" | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment