{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"env: CUDA_VISIBLE_DEVICES=1,3\n",
"env: LD_LIBRARY_PATH=\n",
"env: PATH='/home/jheuristic/.local/bin:/home/jheuristic/anaconda3/envs/py38_petals_yozh:/home/jheuristic/anaconda3/bin:/home/jheuristic/anaconda3/bin:/home/jheuristic/anaconda3/bin:/home/jheuristic/.vscode-server/bin/b06ae3b2d2dbfe28bca3134cc6be65935cdfea6a/bin/remote-cli:/home/jheuristic/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/jheuristic/go/bin'\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/jheuristic/anaconda3/envs/py38_petals_yozh/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"===================================BUG REPORT===================================\n",
"Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
"For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link\n",
"================================================================================\n",
"CUDA SETUP: CUDA runtime path found: /home/jheuristic/anaconda3/envs/py38_petals_yozh/lib/libcudart.so\n",
"CUDA SETUP: Highest compute capability among GPUs detected: 6.1\n",
"CUDA SETUP: Detected CUDA version 113\n",
"CUDA SETUP: Loading binary /home/jheuristic/anaconda3/envs/py38_petals_yozh/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda113_nocublaslt.so...\n"
]
}
],
"source": [
"%env CUDA_VISIBLE_DEVICES=1,3\n",
"%env LD_LIBRARY_PATH=\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os\n",
"os.environ[\"CONDA_PREFIX\"]=os.path.dirname(os.path.dirname(os.path.dirname(os.__file__)))\n",
"%env PATH='/home/jheuristic/.local/bin:/home/jheuristic/anaconda3/envs/py38_petals_yozh:/home/jheuristic/anaconda3/bin:/home/jheuristic/anaconda3/bin:/home/jheuristic/anaconda3/bin:/home/jheuristic/.vscode-server/bin/b06ae3b2d2dbfe28bca3134cc6be65935cdfea6a/bin/remote-cli:/home/jheuristic/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/jheuristic/go/bin'\n",
"\n",
"from petals.bloom.from_pretrained import load_pretrained_block\n",
"from petals.utils.tensor_parallel import TensorParallel, Config\n",
"import torch\n",
"\n",
"import transformers\n",
"model_config = transformers.AutoConfig.from_pretrained(\"bigscience/bloom\")\n",
"devices=['cuda:0', 'cuda:1']"
]
},
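{
"cell_type": "markdown",
"metadata": {},
"source": [
"Setup: pin two physical GPUs via `CUDA_VISIBLE_DEVICES`, load the `bigscience/bloom` model config (hidden size 14336), and benchmark a single transformer block, first on one GPU and then tensor-parallel across both."
]
},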
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Baseline (single GPU)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from petals.bloom.block import WrappedBloomBlock\n",
"block = WrappedBloomBlock(model_config).to(torch.bfloat16).to(devices[0])\n",
"for param in block.parameters():\n",
" param.requires_grad = False\n",
"x = torch.randn(1, 64, 14336, requires_grad=True, device=devices[0], dtype=torch.bfloat16)\n",
"\n",
"# init cuda\n",
"y, = block(x);\n",
"del y\n",
"torch.cuda.synchronize()"
]
},
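{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each timing cell below runs 10 iterations; CUDA kernels launch asynchronously, so the trailing `torch.cuda.synchronize()` makes the reported wall time include all queued GPU work."
]
},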
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 686 ms, sys: 146 ms, total: 832 ms\n",
"Wall time: 831 ms\n"
]
}
],
"source": [
"%%time\n",
"for i in range(10):\n",
" y, = block(x);\n",
" del y\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.14 s, sys: 319 ms, total: 1.45 s\n",
"Wall time: 1.45 s\n"
]
}
],
"source": [
"%%time\n",
"for i in range(10):\n",
" y, = block(x);\n",
" y.norm().backward()\n",
" del y\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"del block, x\n",
"torch.cuda.synchronize()\n",
"torch.cuda.empty_cache()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TP, two gpus, auto config"
]
},
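{
"cell_type": "markdown",
"metadata": {},
"source": [
"No explicit config is passed here, so `TensorParallel` falls back to an automatic sharding plan; as the log below reports, it shards each individual linear/conv/embedding layer across the two devices."
]
},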
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Dec 14 12:49:23.152 [INFO] Using automatic config: sharding individual linear/conv/emb layers\n"
]
}
],
"source": [
"from petals.bloom.block import WrappedBloomBlock\n",
"block_tp = WrappedBloomBlock(model_config).to(torch.bfloat16)\n",
"for param in block_tp.parameters():\n",
" param.requires_grad = False\n",
"\n",
"block_tp = TensorParallel(block_tp, devices)\n",
"x = torch.randn(1, 64, 14336, requires_grad=True, device=devices[0], dtype=torch.bfloat16)\n",
"\n",
"# init nccl\n",
"y = block_tp(x);\n",
"del y\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 661 ms, sys: 149 ms, total: 810 ms\n",
"Wall time: 439 ms\n"
]
}
],
"source": [
"%%time\n",
"for i in range(10):\n",
" y, = block_tp(x);\n",
" del y\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.14 s, sys: 292 ms, total: 1.43 s\n",
"Wall time: 779 ms\n"
]
}
],
"source": [
"%%time\n",
"for i in range(10):\n",
" y, = block_tp(x);\n",
" y.norm().backward()\n",
" del y\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"del block_tp, x\n",
"torch.cuda.synchronize()\n",
"torch.cuda.empty_cache()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TP, two gpus, better config (but not optimal!)"
]
},
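{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hand-written, Megatron-style sharding plan. The comments in the next cell are a best-effort reading of the rule actions: `\"split d\"` shards a matching parameter along dimension `d`, `\"scale\"` divides it by the number of shards, `\"gather -1\"` concatenates a module's per-shard outputs along the last dimension, and `\"sum\"` all-reduces them. One likely reason this is still not optimal: the query/key/value outputs are gathered to full width on every shard, so the attention computation itself is replicated rather than kept head-sharded."
]
},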
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from petals.bloom.block import WrappedBloomBlock\n",
"block_tp = WrappedBloomBlock(model_config).to(torch.bfloat16)\n",
"for param in block_tp.parameters():\n",
" param.requires_grad = False\n",
"tp_config = Config(\n",
" state_rules={\n",
" \".*self_attention\\.query_key_value\\.(weight|bias)\": \"split 0\",\n",
" \".*self_attention\\.dense\\.(weight|bias)\": \"split 0\",\n",
" \".*mlp\\.dense_h_to_4h\\.(weight|bias)\": \"split 0\",\n",
" \".*mlp\\.dense_4h_to_h\\.weight\": \"split 1\",\n",
" \".*mlp\\.dense_4h_to_h\\.bias\": \"scale\",\n",
" },\n",
" input_rules={},\n",
" output_rules={\n",
" \".*self_attention\\.query_key_value\": {0: \"gather -1\"},\n",
" \".*self_attention\\.dense\": {0: \"gather -1\"},\n",
" \".*mlp\\.dense_4h_to_h$\": {0: \"sum\"},\n",
" },\n",
" attr_rules={},\n",
")\n",
"\n",
"block_tp = TensorParallel(block_tp, devices, config=tp_config)\n",
"x = torch.randn(1, 64, 14336, requires_grad=True, device=devices[0], dtype=torch.bfloat16)\n",
"\n",
"# init nccl\n",
"y = block_tp(x);\n",
"del y\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 634 ms, sys: 165 ms, total: 799 ms\n",
"Wall time: 432 ms\n"
]
}
],
"source": [
"%%time\n",
"for i in range(10):\n",
" y, = block_tp(x);\n",
" del y\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.18 s, sys: 223 ms, total: 1.4 s\n",
"Wall time: 756 ms\n"
]
}
],
"source": [
"%%time\n",
"for i in range(10):\n",
" y, = block_tp(x);\n",
" y.norm().backward()\n",
" del y\n",
"torch.cuda.synchronize()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"del block_tp, x\n",
"torch.cuda.synchronize()\n",
"torch.cuda.empty_cache()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('NVIDIA GeForce GTX 1080 Ti', 'NVIDIA GeForce GTX 1080 Ti')"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.cuda.get_device_name(devices[0]), torch.cuda.get_device_name(devices[1])"
]
}
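,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary\n",
"\n",
"Wall times for 10 iterations on two NVIDIA GeForce GTX 1080 Ti GPUs:\n",
"\n",
"| setup | forward | forward + backward |\n",
"| --- | --- | --- |\n",
"| single GPU | 831 ms | 1.45 s |\n",
"| TP x2, auto config | 439 ms | 779 ms |\n",
"| TP x2, manual config | 432 ms | 756 ms |\n",
"\n",
"Tensor parallelism across two GPUs cuts wall time roughly in half for both passes; the manual config is only marginally faster than the automatic one on this hardware."
]
}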
],
"metadata": {
"kernelspec": {
"display_name": "py38_petals_yozh",
"language": "python",
"name": "py38_petals_yozh"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}