-
-
Save justheuristic/149ccfbf903a847cbaa09dbe59965bd9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"env: CUDA_VISIBLE_DEVICES=1,3\n", | |
"env: LD_LIBRARY_PATH=\n", | |
"env: PATH='/home/jheuristic/.local/bin:/home/jheuristic/anaconda3/envs/py38_petals_yozh:/home/jheuristic/anaconda3/bin:/home/jheuristic/anaconda3/bin:/home/jheuristic/anaconda3/bin:/home/jheuristic/.vscode-server/bin/b06ae3b2d2dbfe28bca3134cc6be65935cdfea6a/bin/remote-cli:/home/jheuristic/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/jheuristic/go/bin'\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/jheuristic/anaconda3/envs/py38_petals_yozh/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", | |
" from .autonotebook import tqdm as notebook_tqdm\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"===================================BUG REPORT===================================\n", | |
"Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", | |
"For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link\n", | |
"================================================================================\n", | |
"CUDA SETUP: CUDA runtime path found: /home/jheuristic/anaconda3/envs/py38_petals_yozh/lib/libcudart.so\n", | |
"CUDA SETUP: Highest compute capability among GPUs detected: 6.1\n", | |
"CUDA SETUP: Detected CUDA version 113\n", | |
"CUDA SETUP: Loading binary /home/jheuristic/anaconda3/envs/py38_petals_yozh/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda113_nocublaslt.so...\n" | |
] | |
} | |
], | |
"source": [ | |
"%env CUDA_VISIBLE_DEVICES=1,3\n", | |
"%env LD_LIBRARY_PATH=\n", | |
"%load_ext autoreload\n", | |
"%autoreload 2\n", | |
"import os\n", | |
"os.environ[\"CONDA_PREFIX\"]=os.path.dirname(os.path.dirname(os.path.dirname(os.__file__)))\n", | |
    "%env PATH=/home/jheuristic/.local/bin:/home/jheuristic/anaconda3/envs/py38_petals_yozh:/home/jheuristic/anaconda3/bin:/home/jheuristic/anaconda3/bin:/home/jheuristic/anaconda3/bin:/home/jheuristic/.vscode-server/bin/b06ae3b2d2dbfe28bca3134cc6be65935cdfea6a/bin/remote-cli:/home/jheuristic/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/jheuristic/go/bin\n", | |
"\n", | |
"from petals.bloom.from_pretrained import load_pretrained_block\n", | |
"from petals.utils.tensor_parallel import TensorParallel, Config\n", | |
"import torch\n", | |
"\n", | |
"import transformers\n", | |
"model_config = transformers.AutoConfig.from_pretrained(\"bigscience/bloom\")\n", | |
"devices=['cuda:0', 'cuda:1']" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Baseline (single GPU)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from petals.bloom.block import WrappedBloomBlock\n", | |
"block = WrappedBloomBlock(model_config).to(torch.bfloat16).to(devices[0])\n", | |
"for param in block.parameters():\n", | |
" param.requires_grad = False\n", | |
"x = torch.randn(1, 64, 14336, requires_grad=True, device=devices[0], dtype=torch.bfloat16)\n", | |
"\n", | |
"# init cuda\n", | |
"y, = block(x);\n", | |
"del y\n", | |
"torch.cuda.synchronize()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 686 ms, sys: 146 ms, total: 832 ms\n", | |
"Wall time: 831 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"for i in range(10):\n", | |
" y, = block(x);\n", | |
" del y\n", | |
"torch.cuda.synchronize()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.14 s, sys: 319 ms, total: 1.45 s\n", | |
"Wall time: 1.45 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"for i in range(10):\n", | |
" y, = block(x);\n", | |
" y.norm().backward()\n", | |
" del y\n", | |
"torch.cuda.synchronize()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"del block, x\n", | |
"torch.cuda.synchronize()\n", | |
"torch.cuda.empty_cache()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## TP, two gpus, auto config" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Dec 14 12:49:23.152 [INFO] Using automatic config: sharding individual linear/conv/emb layers\n" | |
] | |
} | |
], | |
"source": [ | |
"from petals.bloom.block import WrappedBloomBlock\n", | |
"block_tp = WrappedBloomBlock(model_config).to(torch.bfloat16)\n", | |
"for param in block_tp.parameters():\n", | |
" param.requires_grad = False\n", | |
"\n", | |
"block_tp = TensorParallel(block_tp, devices)\n", | |
"x = torch.randn(1, 64, 14336, requires_grad=True, device=devices[0], dtype=torch.bfloat16)\n", | |
"\n", | |
"# init nccl\n", | |
    "y, = block_tp(x);\n", | |
"del y\n", | |
"torch.cuda.synchronize()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 661 ms, sys: 149 ms, total: 810 ms\n", | |
"Wall time: 439 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"for i in range(10):\n", | |
" y, = block_tp(x);\n", | |
" del y\n", | |
"torch.cuda.synchronize()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.14 s, sys: 292 ms, total: 1.43 s\n", | |
"Wall time: 779 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"for i in range(10):\n", | |
" y, = block_tp(x);\n", | |
" y.norm().backward()\n", | |
" del y\n", | |
"torch.cuda.synchronize()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"del block_tp, x\n", | |
"torch.cuda.synchronize()\n", | |
"torch.cuda.empty_cache()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## TP, two gpus, better config (but not optimal!)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from petals.bloom.block import WrappedBloomBlock\n", | |
"block_tp = WrappedBloomBlock(model_config).to(torch.bfloat16)\n", | |
"for param in block_tp.parameters():\n", | |
" param.requires_grad = False\n", | |
"tp_config = Config(\n", | |
" state_rules={\n", | |
    "    r\".*self_attention\\.query_key_value\\.(weight|bias)\": \"split 0\",\n", | |
    "    r\".*self_attention\\.dense\\.(weight|bias)\": \"split 0\",\n", | |
    "    r\".*mlp\\.dense_h_to_4h\\.(weight|bias)\": \"split 0\",\n", | |
    "    r\".*mlp\\.dense_4h_to_h\\.weight\": \"split 1\",\n", | |
    "    r\".*mlp\\.dense_4h_to_h\\.bias\": \"scale\",\n", | |
" },\n", | |
" input_rules={},\n", | |
" output_rules={\n", | |
    "    r\".*self_attention\\.query_key_value\": {0: \"gather -1\"},\n", | |
    "    r\".*self_attention\\.dense\": {0: \"gather -1\"},\n", | |
    "    r\".*mlp\\.dense_4h_to_h$\": {0: \"sum\"},\n", | |
" },\n", | |
" attr_rules={},\n", | |
")\n", | |
"\n", | |
"block_tp = TensorParallel(block_tp, devices, config=tp_config)\n", | |
"x = torch.randn(1, 64, 14336, requires_grad=True, device=devices[0], dtype=torch.bfloat16)\n", | |
"\n", | |
"# init nccl\n", | |
    "y, = block_tp(x);\n", | |
"del y\n", | |
"torch.cuda.synchronize()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 634 ms, sys: 165 ms, total: 799 ms\n", | |
"Wall time: 432 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"for i in range(10):\n", | |
" y, = block_tp(x);\n", | |
" del y\n", | |
"torch.cuda.synchronize()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.18 s, sys: 223 ms, total: 1.4 s\n", | |
"Wall time: 756 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"for i in range(10):\n", | |
" y, = block_tp(x);\n", | |
" y.norm().backward()\n", | |
" del y\n", | |
"torch.cuda.synchronize()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"del block_tp, x\n", | |
"torch.cuda.synchronize()\n", | |
"torch.cuda.empty_cache()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"('NVIDIA GeForce GTX 1080 Ti', 'NVIDIA GeForce GTX 1080 Ti')" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"torch.cuda.get_device_name(devices[0]), torch.cuda.get_device_name(devices[1])" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "py38_petals_yozh", | |
"language": "python", | |
"name": "py38_petals_yozh" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.15" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment