Created
September 19, 2023 10:53
-
-
Save iwiwi/2528e4121bd339ffa71a18b6b47ac868 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "cc34216f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/javascript": [ | |
"\n", | |
" setTimeout(function() {\n", | |
" var nbb_cell_id = 1;\n", | |
" var nbb_unformatted_code = \"import matplotlib.pyplot as plt\\nimport numpy as np\\nimport pandas as pd\\nfrom typing import Callable\\n\\n%matplotlib inline\\n%load_ext nb_black\\n\\n\\ndef human_format(num: float) -> str:\\n magnitude = 0\\n while abs(num) >= 1000:\\n magnitude += 1\\n num /= 1000.0\\n return \\\"%.2f %s\\\" % (num, [\\\"\\\", \\\"K\\\", \\\"M\\\", \\\"B\\\", \\\"T\\\", \\\"Q\\\"][magnitude])\";\n", | |
" var nbb_formatted_code = \"import matplotlib.pyplot as plt\\nimport numpy as np\\nimport pandas as pd\\nfrom typing import Callable\\n\\n%matplotlib inline\\n%load_ext nb_black\\n\\n\\ndef human_format(num: float) -> str:\\n magnitude = 0\\n while abs(num) >= 1000:\\n magnitude += 1\\n num /= 1000.0\\n return \\\"%.2f %s\\\" % (num, [\\\"\\\", \\\"K\\\", \\\"M\\\", \\\"B\\\", \\\"T\\\", \\\"Q\\\"][magnitude])\";\n", | |
" var nbb_cells = Jupyter.notebook.get_cells();\n", | |
" for (var i = 0; i < nbb_cells.length; ++i) {\n", | |
" if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", | |
" if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", | |
" nbb_cells[i].set_text(nbb_formatted_code);\n", | |
" }\n", | |
" break;\n", | |
" }\n", | |
" }\n", | |
" }, 500);\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"import matplotlib.pyplot as plt\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"from typing import Callable\n", | |
"\n", | |
"%matplotlib inline\n", | |
"%load_ext nb_black\n", | |
"\n", | |
"\n", | |
"def human_format(num: float) -> str:\n", | |
" magnitude = 0\n", | |
" while abs(num) >= 1000:\n", | |
" magnitude += 1\n", | |
" num /= 1000.0\n", | |
" return \"%.2f %s\" % (num, [\"\", \"K\", \"M\", \"B\", \"T\", \"Q\"][magnitude])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6e9c9084", | |
"metadata": {}, | |
"source": [ | |
"# Preparation\n", | |
"\n", | |
"First, let's create a function that calculates the optimal number of training tokens given the number of parameters. See \"Scaling Laws: Approach 2\" in the following notebook:\n", | |
"https://github.com/karpathy/nanoGPT/blob/master/scaling_laws.ipynb\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "3d5b0799", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'2.29 B'" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"application/javascript": [ | |
"\n", | |
" setTimeout(function() {\n", | |
" var nbb_cell_id = 2;\n", | |
" var nbb_unformatted_code = \"def get_optimal_num_tokens(num_params: float) -> float:\\n m = 1.0409573169995892\\n c = 0.9353887152390791\\n return 10 ** (m * np.log10(num_params) + c)\\n\\n\\n# Example: GPT-2 small\\nhuman_format(get_optimal_num_tokens(124e6))\";\n", | |
" var nbb_formatted_code = \"def get_optimal_num_tokens(num_params: float) -> float:\\n m = 1.0409573169995892\\n c = 0.9353887152390791\\n return 10 ** (m * np.log10(num_params) + c)\\n\\n\\n# Example: GPT-2 small\\nhuman_format(get_optimal_num_tokens(124e6))\";\n", | |
" var nbb_cells = Jupyter.notebook.get_cells();\n", | |
" for (var i = 0; i < nbb_cells.length; ++i) {\n", | |
" if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", | |
" if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", | |
" nbb_cells[i].set_text(nbb_formatted_code);\n", | |
" }\n", | |
" break;\n", | |
" }\n", | |
" }\n", | |
" }, 500);\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"def get_optimal_num_tokens(num_params: float) -> float:\n", | |
" m = 1.0409573169995892\n", | |
" c = 0.9353887152390791\n", | |
" return 10 ** (m * np.log10(num_params) + c)\n", | |
"\n", | |
"\n", | |
"# Example: GPT-2 small\n", | |
"human_format(get_optimal_num_tokens(124e6))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "19c5114e", | |
"metadata": {}, | |
"source": [ | |
"Next, we create a function that calculates the amount of flops required to complete the training when the number of parameters and the number of tokens are given. Although this is a very simple approximation formula, it is commonly used and looks to fall within 10% of a more accurate calculation (See Table A4 in Chinchilla paper)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "ff16647a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/javascript": [ | |
"\n", | |
" setTimeout(function() {\n", | |
" var nbb_cell_id = 3;\n", | |
" var nbb_unformatted_code = \"def get_approx_flops(num_params: float, num_tokens: float) -> float:\\n return 6.0 * num_params * num_tokens\";\n", | |
" var nbb_formatted_code = \"def get_approx_flops(num_params: float, num_tokens: float) -> float:\\n return 6.0 * num_params * num_tokens\";\n", | |
" var nbb_cells = Jupyter.notebook.get_cells();\n", | |
" for (var i = 0; i < nbb_cells.length; ++i) {\n", | |
" if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", | |
" if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", | |
" nbb_cells[i].set_text(nbb_formatted_code);\n", | |
" }\n", | |
" break;\n", | |
" }\n", | |
" }\n", | |
" }, 500);\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"def get_approx_flops(num_params: float, num_tokens: float) -> float:\n", | |
" return 6.0 * num_params * num_tokens" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "33b4c809", | |
"metadata": {}, | |
"source": [ | |
"Let's do a sanity check. I will input the settings of a training I recently conducted with StableMB. Achieving a runtime efficiency of 50% might be a bit difficult, but some papers report such figures." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "fd317a45", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1.2464387464387463 days\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/javascript": [ | |
"\n", | |
" setTimeout(function() {\n", | |
" var nbb_cell_id = 4;\n", | |
" var nbb_unformatted_code = \"num_params = 280e6 # 280M\\nnum_tokens = 80e9 # 80B\\n\\nnum_gpus = 8\\nflops_per_gpu = 312e12 # 312TFLOPS (bf16)\\nefficiency = 0.5 # 50% efficiency\\n\\nestimated_flops = get_approx_flops(num_params, num_tokens)\\nestimated_num_days = (\\n estimated_flops / (flops_per_gpu * efficiency) / num_gpus / 60 / 60 / 24\\n)\\nprint(f\\\"{estimated_num_days} days\\\")\";\n", | |
" var nbb_formatted_code = \"num_params = 280e6 # 280M\\nnum_tokens = 80e9 # 80B\\n\\nnum_gpus = 8\\nflops_per_gpu = 312e12 # 312TFLOPS (bf16)\\nefficiency = 0.5 # 50% efficiency\\n\\nestimated_flops = get_approx_flops(num_params, num_tokens)\\nestimated_num_days = (\\n estimated_flops / (flops_per_gpu * efficiency) / num_gpus / 60 / 60 / 24\\n)\\nprint(f\\\"{estimated_num_days} days\\\")\";\n", | |
" var nbb_cells = Jupyter.notebook.get_cells();\n", | |
" for (var i = 0; i < nbb_cells.length; ++i) {\n", | |
" if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", | |
" if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", | |
" nbb_cells[i].set_text(nbb_formatted_code);\n", | |
" }\n", | |
" break;\n", | |
" }\n", | |
" }\n", | |
" }, 500);\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"num_params = 280e6 # 280M\n", | |
"num_tokens = 80e9 # 80B\n", | |
"\n", | |
"num_gpus = 8\n", | |
"flops_per_gpu = 312e12 # 312TFLOPS (bf16)\n", | |
"efficiency = 0.5 # 50% efficiency\n", | |
"\n", | |
"estimated_flops = get_approx_flops(num_params, num_tokens)\n", | |
"estimated_num_days = (\n", | |
" estimated_flops / (flops_per_gpu * efficiency) / num_gpus / 60 / 60 / 24\n", | |
")\n", | |
"print(f\"{estimated_num_days} days\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "39efd08a", | |
"metadata": {}, | |
"source": [ | |
"In the actual training I actually conducted, it took almost 2 days. I haven't optimized the source code and settings that much yet, so the runtime efficiency is probably lower than 50%. For now, I think the sanity check was carried out correctly.\n", | |
"\n", | |
"Next, we will create a function that back-calculates the optimal model size given a computational budget." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "8c279916", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/javascript": [ | |
"\n", | |
" setTimeout(function() {\n", | |
" var nbb_cell_id = 5;\n", | |
" var nbb_unformatted_code = \"def binary_search(\\n cond: Callable[[int], bool],\\n num_iters: int = 100,\\n hi: float = 1e20,\\n) -> float:\\n lo = 0.0\\n for _ in range(num_iters):\\n mid = lo + (hi - lo + 1) // 2\\n if cond(mid):\\n lo = mid\\n else:\\n hi = mid - 1\\n return lo\\n\\n\\ndef get_optimal_num_params(total_flops: float) -> float:\\n def cond(num_params):\\n num_tokens = get_optimal_num_tokens(num_params)\\n return get_approx_flops(num_params, num_tokens) < total_flops\\n\\n return binary_search(cond)\";\n", | |
" var nbb_formatted_code = \"def binary_search(\\n cond: Callable[[int], bool],\\n num_iters: int = 100,\\n hi: float = 1e20,\\n) -> float:\\n lo = 0.0\\n for _ in range(num_iters):\\n mid = lo + (hi - lo + 1) // 2\\n if cond(mid):\\n lo = mid\\n else:\\n hi = mid - 1\\n return lo\\n\\n\\ndef get_optimal_num_params(total_flops: float) -> float:\\n def cond(num_params):\\n num_tokens = get_optimal_num_tokens(num_params)\\n return get_approx_flops(num_params, num_tokens) < total_flops\\n\\n return binary_search(cond)\";\n", | |
" var nbb_cells = Jupyter.notebook.get_cells();\n", | |
" for (var i = 0; i < nbb_cells.length; ++i) {\n", | |
" if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", | |
" if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", | |
" nbb_cells[i].set_text(nbb_formatted_code);\n", | |
" }\n", | |
" break;\n", | |
" }\n", | |
" }\n", | |
" }, 500);\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"def binary_search(\n", | |
" cond: Callable[[int], bool],\n", | |
" num_iters: int = 100,\n", | |
" hi: float = 1e20,\n", | |
") -> float:\n", | |
" lo = 0.0\n", | |
" for _ in range(num_iters):\n", | |
" mid = lo + (hi - lo + 1) // 2\n", | |
" if cond(mid):\n", | |
" lo = mid\n", | |
" else:\n", | |
" hi = mid - 1\n", | |
" return lo\n", | |
"\n", | |
"\n", | |
"def get_optimal_num_params(total_flops: float) -> float:\n", | |
" def cond(num_params):\n", | |
" num_tokens = get_optimal_num_tokens(num_params)\n", | |
" return get_approx_flops(num_params, num_tokens) < total_flops\n", | |
"\n", | |
" return binary_search(cond)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "96345051", | |
"metadata": {}, | |
"source": [ | |
"Let's do another sanity check. If you can use 8 A100 GPUs for 2 days, like I actually did, what would be the optimal model size? It would be good if the value isn't significantly different from the actual experiment I conducted." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "d1a8e97c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Params: 1.33 B\n", | |
"Tokens: 27.06 B\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/javascript": [ | |
"\n", | |
" setTimeout(function() {\n", | |
" var nbb_cell_id = 6;\n", | |
" var nbb_unformatted_code = \"num_params = get_optimal_num_params(\\n flops_per_gpu * efficiency * num_gpus * 60 * 60 * 24 * 2\\n)\\nnum_tokens = get_optimal_num_tokens(num_params)\\n\\nprint(f\\\"Params: {human_format(num_params)}\\\")\\nprint(f\\\"Tokens: {human_format(num_tokens)}\\\")\";\n", | |
" var nbb_formatted_code = \"num_params = get_optimal_num_params(\\n flops_per_gpu * efficiency * num_gpus * 60 * 60 * 24 * 2\\n)\\nnum_tokens = get_optimal_num_tokens(num_params)\\n\\nprint(f\\\"Params: {human_format(num_params)}\\\")\\nprint(f\\\"Tokens: {human_format(num_tokens)}\\\")\";\n", | |
" var nbb_cells = Jupyter.notebook.get_cells();\n", | |
" for (var i = 0; i < nbb_cells.length; ++i) {\n", | |
" if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", | |
" if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", | |
" nbb_cells[i].set_text(nbb_formatted_code);\n", | |
" }\n", | |
" break;\n", | |
" }\n", | |
" }\n", | |
" }, 500);\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"num_params = get_optimal_num_params(\n", | |
" flops_per_gpu * efficiency * num_gpus * 60 * 60 * 24 * 2\n", | |
")\n", | |
"num_tokens = get_optimal_num_tokens(num_params)\n", | |
"\n", | |
"print(f\"Params: {human_format(num_params)}\")\n", | |
"print(f\"Tokens: {human_format(num_tokens)}\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "aa3bfafe", | |
"metadata": {}, | |
"source": [ | |
"# Optimal model sizes for given \"H100 days\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "9001fbd5", | |
"metadata": {}, | |
"source": [ | |
"Assuming we can use aH100 cluster for a certain number of days, how large should the model size be for training?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "4b536dd5", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/tmp/ipykernel_39501/1491758334.py:16: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n", | |
" return pd.DataFrame(rows).set_index(\"GPUs\").applymap(human_format)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>1 Days</th>\n", | |
" <th>2 Days</th>\n", | |
" <th>4 Days</th>\n", | |
" <th>8 Days</th>\n", | |
" <th>16 Days</th>\n", | |
" <th>32 Days</th>\n", | |
" <th>64 Days</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>GPUs</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>128</th>\n", | |
" <td>3.68 B</td>\n", | |
" <td>5.17 B</td>\n", | |
" <td>7.26 B</td>\n", | |
" <td>10.19 B</td>\n", | |
" <td>14.31 B</td>\n", | |
" <td>20.10 B</td>\n", | |
" <td>28.23 B</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>256</th>\n", | |
" <td>5.17 B</td>\n", | |
" <td>7.26 B</td>\n", | |
" <td>10.19 B</td>\n", | |
" <td>14.31 B</td>\n", | |
" <td>20.10 B</td>\n", | |
" <td>28.23 B</td>\n", | |
" <td>39.65 B</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>512</th>\n", | |
" <td>7.26 B</td>\n", | |
" <td>10.19 B</td>\n", | |
" <td>14.31 B</td>\n", | |
" <td>20.10 B</td>\n", | |
" <td>28.23 B</td>\n", | |
" <td>39.65 B</td>\n", | |
" <td>55.68 B</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1024</th>\n", | |
" <td>10.19 B</td>\n", | |
" <td>14.31 B</td>\n", | |
" <td>20.10 B</td>\n", | |
" <td>28.23 B</td>\n", | |
" <td>39.65 B</td>\n", | |
" <td>55.68 B</td>\n", | |
" <td>78.20 B</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2048</th>\n", | |
" <td>14.31 B</td>\n", | |
" <td>20.10 B</td>\n", | |
" <td>28.23 B</td>\n", | |
" <td>39.65 B</td>\n", | |
" <td>55.68 B</td>\n", | |
" <td>78.20 B</td>\n", | |
" <td>109.83 B</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4096</th>\n", | |
" <td>20.10 B</td>\n", | |
" <td>28.23 B</td>\n", | |
" <td>39.65 B</td>\n", | |
" <td>55.68 B</td>\n", | |
" <td>78.20 B</td>\n", | |
" <td>109.83 B</td>\n", | |
" <td>154.24 B</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 1 Days 2 Days 4 Days 8 Days 16 Days 32 Days 64 Days\n", | |
"GPUs \n", | |
"128 3.68 B 5.17 B 7.26 B 10.19 B 14.31 B 20.10 B 28.23 B\n", | |
"256 5.17 B 7.26 B 10.19 B 14.31 B 20.10 B 28.23 B 39.65 B\n", | |
"512 7.26 B 10.19 B 14.31 B 20.10 B 28.23 B 39.65 B 55.68 B\n", | |
"1024 10.19 B 14.31 B 20.10 B 28.23 B 39.65 B 55.68 B 78.20 B\n", | |
"2048 14.31 B 20.10 B 28.23 B 39.65 B 55.68 B 78.20 B 109.83 B\n", | |
"4096 20.10 B 28.23 B 39.65 B 55.68 B 78.20 B 109.83 B 154.24 B" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"application/javascript": [ | |
"\n", | |
" setTimeout(function() {\n", | |
" var nbb_cell_id = 7;\n", | |
" var nbb_unformatted_code = \"# https://www.nvidia.com/en-us/data-center/h100/\\nflops_per_gpu_for = 1979e12, # 1979 TF/s (BFLOAT16 TensorCore, H100 SXM)\\nefficiency = 0.5 # 50% efficiency\\n\\n\\ndef create_table() -> None:\\n rows = []\\n for num_gpus in [128, 256, 512, 1024, 2048, 4096]:\\n row = {\\\"GPUs\\\": num_gpus}\\n for n_days in [1, 2, 4, 8, 16, 32, 64]:\\n flops_per_system = flops_per_gpu * num_gpus * efficiency\\n flops_total = flops_per_system * n_days * 24 * 60 * 60\\n row[f\\\"{n_days} Days\\\"] = get_optimal_num_params(flops_total)\\n rows.append(row)\\n\\n return pd.DataFrame(rows).set_index(\\\"GPUs\\\").applymap(human_format)\\n\\n\\ncreate_table()\";\n", | |
" var nbb_formatted_code = \"# https://www.nvidia.com/en-us/data-center/h100/\\nflops_per_gpu_for = (1979e12,) # 1979 TF/s (BFLOAT16 TensorCore, H100 SXM)\\nefficiency = 0.5 # 50% efficiency\\n\\n\\ndef create_table() -> None:\\n rows = []\\n for num_gpus in [128, 256, 512, 1024, 2048, 4096]:\\n row = {\\\"GPUs\\\": num_gpus}\\n for n_days in [1, 2, 4, 8, 16, 32, 64]:\\n flops_per_system = flops_per_gpu * num_gpus * efficiency\\n flops_total = flops_per_system * n_days * 24 * 60 * 60\\n row[f\\\"{n_days} Days\\\"] = get_optimal_num_params(flops_total)\\n rows.append(row)\\n\\n return pd.DataFrame(rows).set_index(\\\"GPUs\\\").applymap(human_format)\\n\\n\\ncreate_table()\";\n", | |
" var nbb_cells = Jupyter.notebook.get_cells();\n", | |
" for (var i = 0; i < nbb_cells.length; ++i) {\n", | |
" if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", | |
" if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", | |
" nbb_cells[i].set_text(nbb_formatted_code);\n", | |
" }\n", | |
" break;\n", | |
" }\n", | |
" }\n", | |
" }, 500);\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# https://www.nvidia.com/en-us/data-center/h100/\n", | |
"flops_per_gpu_for = 1979e12, # 1979 TF/s (BFLOAT16 TensorCore, H100 SXM)\n", | |
"efficiency = 0.5 # 50% efficiency\n", | |
"\n", | |
"\n", | |
"def create_table() -> None:\n", | |
" rows = []\n", | |
" for num_gpus in [128, 256, 512, 1024, 2048, 4096]:\n", | |
" row = {\"GPUs\": num_gpus}\n", | |
" for n_days in [1, 2, 4, 8, 16, 32, 64]:\n", | |
" flops_per_system = flops_per_gpu * num_gpus * efficiency\n", | |
" flops_total = flops_per_system * n_days * 24 * 60 * 60\n", | |
" row[f\"{n_days} Days\"] = get_optimal_num_params(flops_total)\n", | |
" rows.append(row)\n", | |
"\n", | |
" return pd.DataFrame(rows).set_index(\"GPUs\").applymap(human_format)\n", | |
"\n", | |
"\n", | |
"create_table()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "ff5e23d3-530c-4112-8f0b-35fed831e79d", | |
"metadata": {}, | |
"source": [ | |
"For your information, the chinchilla optimal does not consider inference at all. In reality, it is more common to train a model that is several times smaller than the chinchilla optimal for a longer period of time." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "b32ff74f-6791-406e-88c6-9717474fcfda", | |
"metadata": {}, | |
"source": [ | |
"# Llama2 vs Chinchilla Optimal" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "7048314c-d7c4-45b7-a377-0c813ae536bd", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Size</th>\n", | |
" <th>Optimal tokens</th>\n", | |
" <th>Actual tokens</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>7.00 B</td>\n", | |
" <td>152.66 B</td>\n", | |
" <td>2 T</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>13.00 B</td>\n", | |
" <td>290.79 B</td>\n", | |
" <td>2 T</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>70.00 B</td>\n", | |
" <td>1.68 T</td>\n", | |
" <td>2 T</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Size Optimal tokens Actual tokens\n", | |
"0 7.00 B 152.66 B 2 T\n", | |
"1 13.00 B 290.79 B 2 T\n", | |
"2 70.00 B 1.68 T 2 T" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"application/javascript": [ | |
"\n", | |
" setTimeout(function() {\n", | |
" var nbb_cell_id = 8;\n", | |
" var nbb_unformatted_code = \"def create_llama2_table():\\n rows = []\\n for num_params in (7E9, 13E9, 70E9):\\n row = {\\n \\\"Size\\\": human_format(num_params),\\n \\\"Optimal tokens\\\": human_format(get_optimal_num_tokens(num_params)),\\n \\\"Actual tokens\\\": \\\"2 T\\\"\\n }\\n rows.append(row)\\n return pd.DataFrame(rows)\\n\\ncreate_llama2_table()\";\n", | |
" var nbb_formatted_code = \"def create_llama2_table():\\n rows = []\\n for num_params in (7e9, 13e9, 70e9):\\n row = {\\n \\\"Size\\\": human_format(num_params),\\n \\\"Optimal tokens\\\": human_format(get_optimal_num_tokens(num_params)),\\n \\\"Actual tokens\\\": \\\"2 T\\\",\\n }\\n rows.append(row)\\n return pd.DataFrame(rows)\\n\\n\\ncreate_llama2_table()\";\n", | |
" var nbb_cells = Jupyter.notebook.get_cells();\n", | |
" for (var i = 0; i < nbb_cells.length; ++i) {\n", | |
" if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", | |
" if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", | |
" nbb_cells[i].set_text(nbb_formatted_code);\n", | |
" }\n", | |
" break;\n", | |
" }\n", | |
" }\n", | |
" }, 500);\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"def create_llama2_table():\n", | |
" rows = []\n", | |
" for num_params in (7E9, 13E9, 70E9):\n", | |
" row = {\n", | |
" \"Size\": human_format(num_params),\n", | |
" \"Optimal tokens\": human_format(get_optimal_num_tokens(num_params)),\n", | |
" \"Actual tokens\": \"2 T\"\n", | |
" }\n", | |
" rows.append(row)\n", | |
" return pd.DataFrame(rows)\n", | |
"\n", | |
"create_llama2_table()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment