Skip to content

Instantly share code, notes, and snippets.

@muellerzr
Last active May 9, 2023 09:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save muellerzr/72155ad00fd83c20dab9173a5ce8b79b to your computer and use it in GitHub Desktop.
Save muellerzr/72155ad00fd83c20dab9173a5ce8b79b to your computer and use it in GitHub Desktop.
Intel XPU issue T4 Colab
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/muellerzr/72155ad00fd83c20dab9173a5ce8b79b/scratchpad.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"!nvidia-smi"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hfi0P8MKwPdp",
"outputId": "0d58d853-805c-4509-9c17-955289e50ba0"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Tue May 9 09:04:25 2023 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 54C P8 10W / 70W | 0MiB / 15360MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lIYdn1woOS1n"
},
"outputs": [],
"source": [
"# Clone the fork under test and work from inside the checkout.\n",
"!git clone https://github.com/abhilash1910/accelerate\n",
"%cd accelerate\n",
"# %pip (not !pip) installs into the interpreter this kernel is running.\n",
"# The extras spec is quoted so the shell cannot glob-expand `[testing]`.\n",
"%pip install -e \".[testing]\" pytest"
]
},
{
"cell_type": "code",
"source": [
"!wget https://gist.githubusercontent.com/muellerzr/d1e03af04d9e01cf118869b047ab492d/raw/3ac01a30eff4d64e8d9693b4600660ac160a42ec/checkpointing.py -O examples/by_feature/checkpointing.py"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Z35ua98fwWiw",
"outputId": "9f948d79-4709-4462-cd6b-b9f75ac83ea5"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"--2023-05-09 09:06:33-- https://gist.githubusercontent.com/muellerzr/d1e03af04d9e01cf118869b047ab492d/raw/3ac01a30eff4d64e8d9693b4600660ac160a42ec/checkpointing.py\n",
"Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 13437 (13K) [text/plain]\n",
"Saving to: ‘examples/by_feature/checkpointing.py’\n",
"\n",
"\r examples/ 0%[ ] 0 --.-KB/s \rexamples/by_feature 100%[===================>] 13.12K --.-KB/s in 0s \n",
"\n",
"2023-05-09 09:06:33 (103 MB/s) - ‘examples/by_feature/checkpointing.py’ saved [13437/13437]\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pytest -sv tests/test_examples.py::FeatureExamplesTests::test_checkpointing_by_steps"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3J8ackkjwyf4",
"outputId": "d3d16448-320b-4f1c-c1c1-d8915b6dd440"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[1m============================= test session starts ==============================\u001b[0m\n",
"platform linux -- Python 3.10.11, pytest-7.2.2, pluggy-1.0.0 -- /usr/bin/python3\n",
"cachedir: .pytest_cache\n",
"rootdir: /content/accelerate\n",
"plugins: xdist-3.2.1, subtests-0.10.0, anyio-3.6.2\n",
"\u001b[1mcollecting ... \u001b[0m2023-05-09 09:09:06.165794: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"collected 1 item \u001b[0m\n",
"\n",
"tests/test_examples.py::FeatureExamplesTests::test_checkpointing_by_steps \u001b[31mFAILED\u001b[0m\n",
"\n",
"=================================== FAILURES ===================================\n",
"\u001b[31m\u001b[1m_______________ FeatureExamplesTests.test_checkpointing_by_steps _______________\u001b[0m\n",
"\n",
"command = ['accelerate', 'launch', '--config_file', '/tmp/tmp_s5b6aj5/default_config.yml', 'examples/by_feature/checkpointing.py', '--checkpointing_steps', ...]\n",
"return_stdout = False\n",
"\n",
" \u001b[94mdef\u001b[39;49;00m \u001b[92mrun_command\u001b[39;49;00m(command: List[\u001b[96mstr\u001b[39;49;00m], return_stdout=\u001b[94mFalse\u001b[39;49;00m):\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m \u001b[39;49;00m\u001b[33m\"\"\"\u001b[39;49;00m\n",
" \u001b[33m Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture\u001b[39;49;00m\n",
" \u001b[33m if an error occured while running `command`\u001b[39;49;00m\n",
" \u001b[33m \"\"\"\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mtry\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n",
"> output = subprocess.check_output(command, stderr=subprocess.STDOUT)\u001b[90m\u001b[39;49;00m\n",
"\n",
"\u001b[1m\u001b[31msrc/accelerate/test_utils/testing.py\u001b[0m:401: \n",
"_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n",
"\n",
"timeout = None\n",
"popenargs = (['accelerate', 'launch', '--config_file', '/tmp/tmp_s5b6aj5/default_config.yml', 'examples/by_feature/checkpointing.py', '--checkpointing_steps', ...],)\n",
"kwargs = {'stderr': -2}\n",
"\n",
" \u001b[94mdef\u001b[39;49;00m \u001b[92mcheck_output\u001b[39;49;00m(*popenargs, timeout=\u001b[94mNone\u001b[39;49;00m, **kwargs):\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m \u001b[39;49;00m\u001b[33mr\u001b[39;49;00m\u001b[33m\"\"\"Run command with arguments and return its output.\u001b[39;49;00m\n",
" \u001b[33m\u001b[39;49;00m\n",
" \u001b[33m If the exit code was non-zero it raises a CalledProcessError. The\u001b[39;49;00m\n",
" \u001b[33m CalledProcessError object will have the return code in the returncode\u001b[39;49;00m\n",
" \u001b[33m attribute and output in the output attribute.\u001b[39;49;00m\n",
" \u001b[33m\u001b[39;49;00m\n",
" \u001b[33m The arguments are the same as for the Popen constructor. Example:\u001b[39;49;00m\n",
" \u001b[33m\u001b[39;49;00m\n",
" \u001b[33m >>> check_output([\"ls\", \"-l\", \"/dev/null\"])\u001b[39;49;00m\n",
" \u001b[33m b'crw-rw-rw- 1 root root 1, 3 Oct 18 2007 /dev/null\\n'\u001b[39;49;00m\n",
" \u001b[33m\u001b[39;49;00m\n",
" \u001b[33m The stdout argument is not allowed as it is used internally.\u001b[39;49;00m\n",
" \u001b[33m To capture standard error in the result, use stderr=STDOUT.\u001b[39;49;00m\n",
" \u001b[33m\u001b[39;49;00m\n",
" \u001b[33m >>> check_output([\"/bin/sh\", \"-c\",\u001b[39;49;00m\n",
" \u001b[33m ... \"ls -l non_existent_file ; exit 0\"],\u001b[39;49;00m\n",
" \u001b[33m ... stderr=STDOUT)\u001b[39;49;00m\n",
" \u001b[33m b'ls: non_existent_file: No such file or directory\\n'\u001b[39;49;00m\n",
" \u001b[33m\u001b[39;49;00m\n",
" \u001b[33m There is an additional optional argument, \"input\", allowing you to\u001b[39;49;00m\n",
" \u001b[33m pass a string to the subprocess's stdin. If you use this argument\u001b[39;49;00m\n",
" \u001b[33m you may not also use the Popen constructor's \"stdin\" argument, as\u001b[39;49;00m\n",
" \u001b[33m it too will be used internally. Example:\u001b[39;49;00m\n",
" \u001b[33m\u001b[39;49;00m\n",
" \u001b[33m >>> check_output([\"sed\", \"-e\", \"s/foo/bar/\"],\u001b[39;49;00m\n",
" \u001b[33m ... input=b\"when in the course of fooman events\\n\")\u001b[39;49;00m\n",
" \u001b[33m b'when in the course of barman events\\n'\u001b[39;49;00m\n",
" \u001b[33m\u001b[39;49;00m\n",
" \u001b[33m By default, all communication is in bytes, and therefore any \"input\"\u001b[39;49;00m\n",
" \u001b[33m should be bytes, and the return value will be bytes. If in text mode,\u001b[39;49;00m\n",
" \u001b[33m any \"input\" should be a string, and the return value will be a string\u001b[39;49;00m\n",
" \u001b[33m decoded according to locale encoding, or by \"encoding\" if set. Text mode\u001b[39;49;00m\n",
" \u001b[33m is triggered by setting any of text, encoding, errors or universal_newlines.\u001b[39;49;00m\n",
" \u001b[33m \"\"\"\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mif\u001b[39;49;00m \u001b[33m'\u001b[39;49;00m\u001b[33mstdout\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m \u001b[95min\u001b[39;49;00m kwargs:\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mraise\u001b[39;49;00m \u001b[96mValueError\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mstdout argument not allowed, it will be overridden.\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m\u001b[39;49;00m\n",
" \u001b[94mif\u001b[39;49;00m \u001b[33m'\u001b[39;49;00m\u001b[33minput\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m \u001b[95min\u001b[39;49;00m kwargs \u001b[95mand\u001b[39;49;00m kwargs[\u001b[33m'\u001b[39;49;00m\u001b[33minput\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m] \u001b[95mis\u001b[39;49;00m \u001b[94mNone\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m# Explicitly passing input=None was previously equivalent to passing an\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m# empty string. That is maintained here for backwards compatibility.\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mif\u001b[39;49;00m kwargs.get(\u001b[33m'\u001b[39;49;00m\u001b[33muniversal_newlines\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[95mor\u001b[39;49;00m kwargs.get(\u001b[33m'\u001b[39;49;00m\u001b[33mtext\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[95mor\u001b[39;49;00m kwargs.get(\u001b[33m'\u001b[39;49;00m\u001b[33mencoding\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \\\n",
" \u001b[95mor\u001b[39;49;00m kwargs.get(\u001b[33m'\u001b[39;49;00m\u001b[33merrors\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m):\u001b[90m\u001b[39;49;00m\n",
" empty = \u001b[33m'\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[94melse\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n",
" empty = \u001b[33mb\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" kwargs[\u001b[33m'\u001b[39;49;00m\u001b[33minput\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m] = empty\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m\u001b[39;49;00m\n",
"> \u001b[94mreturn\u001b[39;49;00m run(*popenargs, stdout=PIPE, timeout=timeout, check=\u001b[94mTrue\u001b[39;49;00m,\u001b[90m\u001b[39;49;00m\n",
" **kwargs).stdout\u001b[90m\u001b[39;49;00m\n",
"\n",
"\u001b[1m\u001b[31m/usr/lib/python3.10/subprocess.py\u001b[0m:421: \n",
"_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n",
"\n",
"input = None, capture_output = False, timeout = None, check = True\n",
"popenargs = (['accelerate', 'launch', '--config_file', '/tmp/tmp_s5b6aj5/default_config.yml', 'examples/by_feature/checkpointing.py', '--checkpointing_steps', ...],)\n",
"kwargs = {'stderr': -2, 'stdout': -1}\n",
"process = <Popen: returncode: 1 args: ['accelerate', 'launch', '--config_file', '/tmp/...>\n",
"stdout = b'2023-05-09 09:09:11.875999: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find T...py\\', \\'--checkpointing_steps\\', \\'1\\', \\n\\'--output_dir\\', \\'/tmp/tmpg9wnngc2\\']\\' returned non-zero exit status 1.\\n'\n",
"stderr = None, retcode = 1\n",
"\n",
" \u001b[94mdef\u001b[39;49;00m \u001b[92mrun\u001b[39;49;00m(*popenargs,\u001b[90m\u001b[39;49;00m\n",
" \u001b[96minput\u001b[39;49;00m=\u001b[94mNone\u001b[39;49;00m, capture_output=\u001b[94mFalse\u001b[39;49;00m, timeout=\u001b[94mNone\u001b[39;49;00m, check=\u001b[94mFalse\u001b[39;49;00m, **kwargs):\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m \u001b[39;49;00m\u001b[33m\"\"\"Run command with arguments and return a CompletedProcess instance.\u001b[39;49;00m\n",
" \u001b[33m\u001b[39;49;00m\n",
" \u001b[33m The returned instance will have attributes args, returncode, stdout and\u001b[39;49;00m\n",
" \u001b[33m stderr. By default, stdout and stderr are not captured, and those attributes\u001b[39;49;00m\n",
" \u001b[33m will be None. Pass stdout=PIPE and/or stderr=PIPE in order to capture them,\u001b[39;49;00m\n",
" \u001b[33m or pass capture_output=True to capture both.\u001b[39;49;00m\n",
" \u001b[33m\u001b[39;49;00m\n",
" \u001b[33m If check is True and the exit code was non-zero, it raises a\u001b[39;49;00m\n",
" \u001b[33m CalledProcessError. The CalledProcessError object will have the return code\u001b[39;49;00m\n",
" \u001b[33m in the returncode attribute, and output & stderr attributes if those streams\u001b[39;49;00m\n",
" \u001b[33m were captured.\u001b[39;49;00m\n",
" \u001b[33m\u001b[39;49;00m\n",
" \u001b[33m If timeout is given, and the process takes too long, a TimeoutExpired\u001b[39;49;00m\n",
" \u001b[33m exception will be raised.\u001b[39;49;00m\n",
" \u001b[33m\u001b[39;49;00m\n",
" \u001b[33m There is an optional argument \"input\", allowing you to\u001b[39;49;00m\n",
" \u001b[33m pass bytes or a string to the subprocess's stdin. If you use this argument\u001b[39;49;00m\n",
" \u001b[33m you may not also use the Popen constructor's \"stdin\" argument, as\u001b[39;49;00m\n",
" \u001b[33m it will be used internally.\u001b[39;49;00m\n",
" \u001b[33m\u001b[39;49;00m\n",
" \u001b[33m By default, all communication is in bytes, and therefore any \"input\" should\u001b[39;49;00m\n",
" \u001b[33m be bytes, and the stdout and stderr will be bytes. If in text mode, any\u001b[39;49;00m\n",
" \u001b[33m \"input\" should be a string, and stdout and stderr will be strings decoded\u001b[39;49;00m\n",
" \u001b[33m according to locale encoding, or by \"encoding\" if set. Text mode is\u001b[39;49;00m\n",
" \u001b[33m triggered by setting any of text, encoding, errors or universal_newlines.\u001b[39;49;00m\n",
" \u001b[33m\u001b[39;49;00m\n",
" \u001b[33m The other arguments are the same as for the Popen constructor.\u001b[39;49;00m\n",
" \u001b[33m \"\"\"\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mif\u001b[39;49;00m \u001b[96minput\u001b[39;49;00m \u001b[95mis\u001b[39;49;00m \u001b[95mnot\u001b[39;49;00m \u001b[94mNone\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mif\u001b[39;49;00m kwargs.get(\u001b[33m'\u001b[39;49;00m\u001b[33mstdin\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[95mis\u001b[39;49;00m \u001b[95mnot\u001b[39;49;00m \u001b[94mNone\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mraise\u001b[39;49;00m \u001b[96mValueError\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mstdin and input arguments may not both be used.\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\u001b[90m\u001b[39;49;00m\n",
" kwargs[\u001b[33m'\u001b[39;49;00m\u001b[33mstdin\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m] = PIPE\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m\u001b[39;49;00m\n",
" \u001b[94mif\u001b[39;49;00m capture_output:\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mif\u001b[39;49;00m kwargs.get(\u001b[33m'\u001b[39;49;00m\u001b[33mstdout\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[95mis\u001b[39;49;00m \u001b[95mnot\u001b[39;49;00m \u001b[94mNone\u001b[39;49;00m \u001b[95mor\u001b[39;49;00m kwargs.get(\u001b[33m'\u001b[39;49;00m\u001b[33mstderr\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[95mis\u001b[39;49;00m \u001b[95mnot\u001b[39;49;00m \u001b[94mNone\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mraise\u001b[39;49;00m \u001b[96mValueError\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mstdout and stderr arguments may not be used \u001b[39;49;00m\u001b[33m'\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[33m'\u001b[39;49;00m\u001b[33mwith capture_output.\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\u001b[90m\u001b[39;49;00m\n",
" kwargs[\u001b[33m'\u001b[39;49;00m\u001b[33mstdout\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m] = PIPE\u001b[90m\u001b[39;49;00m\n",
" kwargs[\u001b[33m'\u001b[39;49;00m\u001b[33mstderr\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m] = PIPE\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m\u001b[39;49;00m\n",
" \u001b[94mwith\u001b[39;49;00m Popen(*popenargs, **kwargs) \u001b[94mas\u001b[39;49;00m process:\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mtry\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n",
" stdout, stderr = process.communicate(\u001b[96minput\u001b[39;49;00m, timeout=timeout)\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mexcept\u001b[39;49;00m TimeoutExpired \u001b[94mas\u001b[39;49;00m exc:\u001b[90m\u001b[39;49;00m\n",
" process.kill()\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mif\u001b[39;49;00m _mswindows:\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m# Windows accumulates the output in a single blocking\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m# read() call run on child threads, with the timeout\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m# being done in a join() on those threads. communicate()\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m# _after_ kill() is required to collect that and add it\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m# to the exception.\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" exc.stdout, exc.stderr = process.communicate()\u001b[90m\u001b[39;49;00m\n",
" \u001b[94melse\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m# POSIX _communicate already populated the output so\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m# far into the TimeoutExpired exception.\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" process.wait()\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mraise\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mexcept\u001b[39;49;00m: \u001b[90m# Including KeyboardInterrupt, communicate handled that.\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" process.kill()\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m# We don't call process.wait() as .__exit__ does that for us.\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mraise\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" retcode = process.poll()\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mif\u001b[39;49;00m check \u001b[95mand\u001b[39;49;00m retcode:\u001b[90m\u001b[39;49;00m\n",
"> \u001b[94mraise\u001b[39;49;00m CalledProcessError(retcode, process.args,\u001b[90m\u001b[39;49;00m\n",
" output=stdout, stderr=stderr)\u001b[90m\u001b[39;49;00m\n",
"\u001b[1m\u001b[31mE subprocess.CalledProcessError: Command '['accelerate', 'launch', '--config_file', '/tmp/tmp_s5b6aj5/default_config.yml', 'examples/by_feature/checkpointing.py', '--checkpointing_steps', '1', '--output_dir', '/tmp/tmpg9wnngc2']' returned non-zero exit status 1.\u001b[0m\n",
"\n",
"\u001b[1m\u001b[31m/usr/lib/python3.10/subprocess.py\u001b[0m:526: CalledProcessError\n",
"\n",
"\u001b[33mThe above exception was the direct cause of the following exception:\u001b[0m\n",
"\n",
"self = <test_examples.FeatureExamplesTests testMethod=test_checkpointing_by_steps>\n",
"\n",
" \u001b[94mdef\u001b[39;49;00m \u001b[92mtest_checkpointing_by_steps\u001b[39;49;00m(\u001b[96mself\u001b[39;49;00m):\u001b[90m\u001b[39;49;00m\n",
" testargs = \u001b[33mf\u001b[39;49;00m\u001b[33m\"\"\"\u001b[39;49;00m\u001b[33m\u001b[39;49;00m\n",
" \u001b[33m examples/by_feature/checkpointing.py\u001b[39;49;00m\u001b[33m\u001b[39;49;00m\n",
" \u001b[33m --checkpointing_steps 1\u001b[39;49;00m\u001b[33m\u001b[39;49;00m\n",
" \u001b[33m --output_dir \u001b[39;49;00m\u001b[33m{\u001b[39;49;00m\u001b[96mself\u001b[39;49;00m.tmpdir\u001b[33m}\u001b[39;49;00m\u001b[33m\u001b[39;49;00m\n",
" \u001b[33m \u001b[39;49;00m\u001b[33m\"\"\"\u001b[39;49;00m.split()\u001b[90m\u001b[39;49;00m\n",
"> _ = run_command(\u001b[96mself\u001b[39;49;00m._launch_args + testargs)\u001b[90m\u001b[39;49;00m\n",
"\n",
"\u001b[1m\u001b[31mtests/test_examples.py\u001b[0m:158: \n",
"_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n",
"\n",
"command = ['accelerate', 'launch', '--config_file', '/tmp/tmp_s5b6aj5/default_config.yml', 'examples/by_feature/checkpointing.py', '--checkpointing_steps', ...]\n",
"return_stdout = False\n",
"\n",
" \u001b[94mdef\u001b[39;49;00m \u001b[92mrun_command\u001b[39;49;00m(command: List[\u001b[96mstr\u001b[39;49;00m], return_stdout=\u001b[94mFalse\u001b[39;49;00m):\u001b[90m\u001b[39;49;00m\n",
" \u001b[90m \u001b[39;49;00m\u001b[33m\"\"\"\u001b[39;49;00m\n",
" \u001b[33m Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture\u001b[39;49;00m\n",
" \u001b[33m if an error occured while running `command`\u001b[39;49;00m\n",
" \u001b[33m \"\"\"\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mtry\u001b[39;49;00m:\u001b[90m\u001b[39;49;00m\n",
" output = subprocess.check_output(command, stderr=subprocess.STDOUT)\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mif\u001b[39;49;00m return_stdout:\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mif\u001b[39;49;00m \u001b[96mhasattr\u001b[39;49;00m(output, \u001b[33m\"\u001b[39;49;00m\u001b[33mdecode\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m):\u001b[90m\u001b[39;49;00m\n",
" output = output.decode(\u001b[33m\"\u001b[39;49;00m\u001b[33mutf-8\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mreturn\u001b[39;49;00m output\u001b[90m\u001b[39;49;00m\n",
" \u001b[94mexcept\u001b[39;49;00m subprocess.CalledProcessError \u001b[94mas\u001b[39;49;00m e:\u001b[90m\u001b[39;49;00m\n",
"> \u001b[94mraise\u001b[39;49;00m SubprocessCallException(\u001b[90m\u001b[39;49;00m\n",
" \u001b[33mf\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\u001b[33mCommand `\u001b[39;49;00m\u001b[33m{\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m\u001b[33m \u001b[39;49;00m\u001b[33m'\u001b[39;49;00m.join(command)\u001b[33m}\u001b[39;49;00m\u001b[33m` failed with the following error:\u001b[39;49;00m\u001b[33m\\n\u001b[39;49;00m\u001b[33m\\n\u001b[39;49;00m\u001b[33m{\u001b[39;49;00me.output.decode()\u001b[33m}\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
" ) \u001b[94mfrom\u001b[39;49;00m \u001b[04m\u001b[96me\u001b[39;49;00m\u001b[90m\u001b[39;49;00m\n",
"\u001b[1m\u001b[31mE accelerate.test_utils.testing.SubprocessCallException: Command `accelerate launch --config_file /tmp/tmp_s5b6aj5/default_config.yml examples/by_feature/checkpointing.py --checkpointing_steps 1 --output_dir /tmp/tmpg9wnngc2` failed with the following error:\u001b[0m\n",
"\u001b[1m\u001b[31mE \u001b[0m\n",
"\u001b[1m\u001b[31mE 2023-05-09 09:09:11.875999: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\u001b[0m\n",
"\u001b[1m\u001b[31mE [09:09:12] WARNING The following values were not passed to launch.py:900\u001b[0m\n",
"\u001b[1m\u001b[31mE `accelerate launch` and had defaults used \u001b[0m\n",
"\u001b[1m\u001b[31mE instead: \u001b[0m\n",
"\u001b[1m\u001b[31mE `--num_cpu_threads_per_process` was \u001b[0m\n",
"\u001b[1m\u001b[31mE set to `4` to improve out-of-box performance \u001b[0m\n",
"\u001b[1m\u001b[31mE when training on CPUs \u001b[0m\n",
"\u001b[1m\u001b[31mE To avoid this warning pass in values for each \u001b[0m\n",
"\u001b[1m\u001b[31mE of the problematic parameters or run \u001b[0m\n",
"\u001b[1m\u001b[31mE `accelerate config`. \u001b[0m\n",
"\u001b[1m\u001b[31mE 2023-05-09 09:09:16.937243: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\u001b[0m\n",
"\u001b[1m\u001b[31mE ╭───────────────────── Traceback (most recent call last) ──────────────────────╮\u001b[0m\n",
"\u001b[1m\u001b[31mE │ /content/accelerate/examples/by_feature/checkpointing.py:316 in <module> │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 313 │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 314 │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 315 if __name__ == \"__main__\": │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ ❱ 316 │ main() │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 317 │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ /content/accelerate/examples/by_feature/checkpointing.py:312 in main │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 309 │ ) │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 310 │ args = parser.parse_args() │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 311 │ config = {\"lr\": 2e-5, \"num_epochs\": 3, \"seed\": 42, \"batch_size\": 1 │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ ❱ 312 │ training_function(config, args) │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 313 │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 314 │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 315 if __name__ == \"__main__\": │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ /content/accelerate/examples/by_feature/checkpointing.py:130 in │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ training_function │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 127 │ │ config[\"num_epochs\"] = 2 │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 128 │ # Initialize accelerator │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 129 │ accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ ❱ 130 │ assert accelerator.device.type == \"cuda\", f'Device: {accelerator.d │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 131 │ # Sample hyper-parameters for learning rate, batch size, seed and │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 132 │ lr = config[\"lr\"] │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 133 │ num_epochs = int(config[\"num_epochs\"]) │\u001b[0m\n",
"\u001b[1m\u001b[31mE ╰──────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
"\u001b[1m\u001b[31mE AssertionError: Device: cpu, type: cpu\u001b[0m\n",
"\u001b[1m\u001b[31mE ╭───────────────────── Traceback (most recent call last) ──────────────────────╮\u001b[0m\n",
"\u001b[1m\u001b[31mE │ /usr/local/bin/accelerate:8 in <module> │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 5 from accelerate.commands.accelerate_cli import main │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 6 if __name__ == '__main__': │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 7 │ sys.argv[0] = re.sub(r'(-script\\.pyw|\\.exe)?$', '', sys.argv[0]) │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ ❱ 8 │ sys.exit(main()) │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 9 │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ /content/accelerate/src/accelerate/commands/accelerate_cli.py:45 in main │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 42 │ │ exit(1) │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 43 │ │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 44 │ # Run │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ ❱ 45 │ args.func(args) │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 46 │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 47 │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 48 if __name__ == \"__main__\": │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ /content/accelerate/src/accelerate/commands/launch.py:928 in launch_command │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 925 │ elif defaults is not None and defaults.compute_environment == Comp │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 926 │ │ sagemaker_launcher(defaults, args) │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 927 │ else: │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ ❱ 928 │ │ simple_launcher(args) │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 929 │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 930 │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 931 def main(): │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ /content/accelerate/src/accelerate/commands/launch.py:588 in simple_launcher │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 585 │ process.wait() │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 586 │ if process.returncode != 0: │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 587 │ │ if not args.quiet: │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ ❱ 588 │ │ │ raise subprocess.CalledProcessError(returncode=process.ret │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 589 │ │ else: │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 590 │ │ │ sys.exit(1) │\u001b[0m\n",
"\u001b[1m\u001b[31mE │ 591 │\u001b[0m\n",
"\u001b[1m\u001b[31mE ╰──────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
"\u001b[1m\u001b[31mE CalledProcessError: Command '['/usr/bin/python3', \u001b[0m\n",
"\u001b[1m\u001b[31mE 'examples/by_feature/checkpointing.py', '--checkpointing_steps', '1', \u001b[0m\n",
"\u001b[1m\u001b[31mE '--output_dir', '/tmp/tmpg9wnngc2']' returned non-zero exit status 1.\u001b[0m\n",
"\n",
"\u001b[1m\u001b[31msrc/accelerate/test_utils/testing.py\u001b[0m:407: SubprocessCallException\n",
"\u001b[36m\u001b[1m=========================== short test summary info ============================\u001b[0m\n",
"\u001b[31mFAILED\u001b[0m tests/test_examples.py::\u001b[1mFeatureExamplesTests::test_checkpointing_by_steps\u001b[0m - accelerate.test_utils.testing.SubprocessCallException: Command `accelerate ...\n",
"\u001b[31m============================== \u001b[31m\u001b[1m1 failed\u001b[0m\u001b[31m in 22.06s\u001b[0m\u001b[31m ==============================\u001b[0m\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "kQUP_EBRw-wc"
},
"execution_count": null,
"outputs": []
}
],
"metadata": {
"colab": {
"name": "scratchpad",
"provenance": [],
"machine_shape": "hm",
"gpuType": "T4",
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"accelerator": "GPU",
"gpuClass": "standard"
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment