Speetest LLAMA LookupDecoding.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "authorship_tag": "ABX9TyOush5N5IhCtLKyFy+o6tez",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/Garstig/be3825e7a8dc4ffdb4b76a2509553081/speetest-llama.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "HwTl8I3wbl6v",
        "outputId": "dcb5bed1-22d5-47fd-f611-a32400d192c3"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "--2024-03-15 14:22:19--  https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true\n",
            "Resolving huggingface.co (huggingface.co)... 13.33.33.110, 13.33.33.20, 13.33.33.55, ...\n",
            "Connecting to huggingface.co (huggingface.co)|13.33.33.110|:443... connected.\n",
            "HTTP request sent, awaiting response... 302 Found\n",
            "Location: https://cdn-lfs-us-1.huggingface.co/repos/72/62/726219e98582d16c24a66629a4dec1b0761b91c918e15dea2625b4293c134a92/3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27mistral-7b-instruct-v0.2.Q4_K_M.gguf%3B+filename%3D%22mistral-7b-instruct-v0.2.Q4_K_M.gguf%22%3B&Expires=1710771740&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxMDc3MTc0MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzcyLzYyLzcyNjIxOWU5ODU4MmQxNmMyNGE2NjYyOWE0ZGVjMWIwNzYxYjkxYzkxOGUxNWRlYTI2MjViNDI5M2MxMzRhOTIvM2UwMDM5ZmQwMjczZmNiZWJiNDkyMjg5NDNiMTc4MzFhYWRkNTVjYmNiZjU2ZjBhZjAwNDk5YmUyMDQwY2NmOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=DwxjckiCSLZxRbJMcJqoFrILNQeoCHE0PNPasxh0cYe36C239tf1pIPUIZJFcfIi9a7SIDkaRGqKZs%7E6kkf-pNp7zhRKvrmHdJ1bryhxLK16RgTzf5AdpybJTq8lDr8GTPLU4adrQ3mfM9kLFTlS88ADHCjzUgi1p7oOiPgDTY3mNHzsO9PYEfKYeHod67OljkGYh-wmHELAkN08CWcrsbMOhDQsabuamIeyytjOGCco9hc9JymeX1AubfRcqrP1PdSIYASTggquqntvYijucoPAAJPYm1AC2nIv-SLplw2R2%7ELgXqilzM%7Ep7266XZHCbP0sCSdErqZJcC10%7EvBCdQ__&Key-Pair-Id=KCD77M1F0VK2B [following]\n",
            "--2024-03-15 14:22:20--  https://cdn-lfs-us-1.huggingface.co/repos/72/62/726219e98582d16c24a66629a4dec1b0761b91c918e15dea2625b4293c134a92/3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27mistral-7b-instruct-v0.2.Q4_K_M.gguf%3B+filename%3D%22mistral-7b-instruct-v0.2.Q4_K_M.gguf%22%3B&Expires=1710771740&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxMDc3MTc0MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzcyLzYyLzcyNjIxOWU5ODU4MmQxNmMyNGE2NjYyOWE0ZGVjMWIwNzYxYjkxYzkxOGUxNWRlYTI2MjViNDI5M2MxMzRhOTIvM2UwMDM5ZmQwMjczZmNiZWJiNDkyMjg5NDNiMTc4MzFhYWRkNTVjYmNiZjU2ZjBhZjAwNDk5YmUyMDQwY2NmOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=DwxjckiCSLZxRbJMcJqoFrILNQeoCHE0PNPasxh0cYe36C239tf1pIPUIZJFcfIi9a7SIDkaRGqKZs%7E6kkf-pNp7zhRKvrmHdJ1bryhxLK16RgTzf5AdpybJTq8lDr8GTPLU4adrQ3mfM9kLFTlS88ADHCjzUgi1p7oOiPgDTY3mNHzsO9PYEfKYeHod67OljkGYh-wmHELAkN08CWcrsbMOhDQsabuamIeyytjOGCco9hc9JymeX1AubfRcqrP1PdSIYASTggquqntvYijucoPAAJPYm1AC2nIv-SLplw2R2%7ELgXqilzM%7Ep7266XZHCbP0sCSdErqZJcC10%7EvBCdQ__&Key-Pair-Id=KCD77M1F0VK2B\n",
            "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 108.157.254.127, 108.157.254.63, 108.157.254.89, ...\n",
            "Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|108.157.254.127|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 4368439584 (4.1G) [binary/octet-stream]\n",
            "Saving to: ‘mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true’\n",
            "\n",
            "mistral-7b-instruct 100%[===================>] 4.07G 230MB/s in 22s \n",
            "\n",
            "2024-03-15 14:22:42 (191 MB/s) - ‘mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true’ saved [4368439584/4368439584]\n",
            "\n"
          ]
        }
      ],
      "source": [
        "!wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!mv mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true mistral-7b-instruct-v0.2.Q4_K_M.gguf"
      ],
      "metadata": {
        "id": "_3Mw1FBUc8AP"
      },
      "execution_count": 2,
      "outputs": []
    },
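    {
      "cell_type": "markdown",
      "source": [
        "Side note: the `?download=true` query string ends up in the saved file name, which is why the rename above is needed. An alternative (not run here) is to let `wget` write straight to the intended name with `-O`:"
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "# Alternative to the wget + mv pair above (left commented out, as it was\n",
        "# not run in this notebook): -O names the output file directly.\n",
        "#!wget -O mistral-7b-instruct-v0.2.Q4_K_M.gguf \"https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true\""
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },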
    {
      "cell_type": "code",
      "source": [
        "# CPU/OpenBLAS build, kept commented out; the CUDA build in the next cell was used instead.\n",
        "#!CMAKE_ARGS=\"-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS\" pip install llama-cpp-python"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "CVP9bquBcg5n",
        "outputId": "17b6af53-eb7d-4a2f-fb00-83485a538783"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting llama-cpp-python\n",
            "  Downloading llama_cpp_python-0.2.56.tar.gz (36.9 MB)\n",
            "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.9/36.9 MB\u001b[0m \u001b[31m15.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
            "  Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
            "  Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
            "  Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
            "Requirement already satisfied: typing-extensions>=4.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (4.10.0)\n",
            "Requirement already satisfied: numpy>=1.20.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (1.25.2)\n",
            "Collecting diskcache>=5.6.1 (from llama-cpp-python)\n",
            "  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)\n",
            "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.5/45.5 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: jinja2>=2.11.3 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (3.1.3)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2>=2.11.3->llama-cpp-python) (2.1.5)\n",
            "Building wheels for collected packages: llama-cpp-python\n",
            "  Building wheel for llama-cpp-python (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.56-cp310-cp310-manylinux_2_35_x86_64.whl size=2841182 sha256=1e91e97a338bc2daa1cc72d23c4f0d41ed972b1e0e4796d73e3a7e28b178f077\n",
            "  Stored in directory: /root/.cache/pip/wheels/e5/09/9d/c413053f6258cb2546cc792418c595e276f9efd5db31a80377\n",
            "Successfully built llama-cpp-python\n",
            "Installing collected packages: diskcache, llama-cpp-python\n",
            "Successfully installed diskcache-5.6.3 llama-cpp-python-0.2.56\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!CMAKE_ARGS=\"-DLLAMA_CUBLAS=on\" FORCE_CMAKE=1 pip install llama-cpp-python"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Tz6FJpOufWXL",
        "outputId": "8ce6419b-1e6c-4a09-959b-be199a513111"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting llama-cpp-python\n",
            "  Downloading llama_cpp_python-0.2.56.tar.gz (36.9 MB)\n",
            "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.9/36.9 MB\u001b[0m \u001b[31m17.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
            "  Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
            "  Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
            "  Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
            "Requirement already satisfied: typing-extensions>=4.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (4.10.0)\n",
            "Requirement already satisfied: numpy>=1.20.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (1.25.2)\n",
            "Collecting diskcache>=5.6.1 (from llama-cpp-python)\n",
            "  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)\n",
            "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.5/45.5 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: jinja2>=2.11.3 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (3.1.3)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2>=2.11.3->llama-cpp-python) (2.1.5)\n",
            "Building wheels for collected packages: llama-cpp-python\n",
            "  Building wheel for llama-cpp-python (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.56-cp310-cp310-manylinux_2_35_x86_64.whl size=26291647 sha256=4d08ee7d43ebc79a5c7c0d302e11645c6a07c79c5e1c2391514742ed1842af65\n",
            "  Stored in directory: /root/.cache/pip/wheels/e5/09/9d/c413053f6258cb2546cc792418c595e276f9efd5db31a80377\n",
            "Successfully built llama-cpp-python\n",
            "Installing collected packages: diskcache, llama-cpp-python\n",
            "Successfully installed diskcache-5.6.3 llama-cpp-python-0.2.56\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_cpp import Llama\n",
        "import json\n",
        "from llama_cpp.llama_speculative import LlamaPromptLookupDecoding"
      ],
      "metadata": {
        "id": "piOITZL8bshq"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "model_path = \"mistral-7b-instruct-v0.2.Q4_K_M.gguf\"\n"
      ],
      "metadata": {
        "id": "CmA0k1DFcJJ5"
      },
      "execution_count": 5,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!ls"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "HzAhT3sOdGS4",
        "outputId": "a5636d18-a4fd-4b5e-ccb3-dcd0f3943cbc"
      },
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "mistral-7b-instruct-v0.2.Q4_K_M.gguf  sample_data\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!sha256sum mistral-7b-instruct-v0.2.Q4_K_M.gguf"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "V-1QMKA3d4_u",
        "outputId": "10f59e33-f863-415f-ad99-b49a71bf4e85"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9  mistral-7b-instruct-v0.2.Q4_K_M.gguf\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "llm_model = Llama(\n",
        "    model_path=model_path,\n",
        "    n_gpu_layers=32,\n",
        "    n_ctx=2048,\n",
        "    chat_format=\"chatml\",\n",
        "    n_threads=8,\n",
        "    draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10)  # num_pred_tokens is the number of draft tokens to predict; 10 is the default and generally good for GPU, while 2 performs better on CPU-only machines.\n",
        ")"
      ],
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "YXJuz5P7ct_C", | |
"outputId": "d8a45718-cfb2-49ed-c42e-18109b9ac661" | |
}, | |
"execution_count": 18, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))\n", | |
"llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", | |
"llama_model_loader: - kv 0: general.architecture str = llama\n", | |
"llama_model_loader: - kv 1: general.name str = mistralai_mistral-7b-instruct-v0.2\n", | |
"llama_model_loader: - kv 2: llama.context_length u32 = 32768\n", | |
"llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n", | |
"llama_model_loader: - kv 4: llama.block_count u32 = 32\n", | |
"llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336\n", | |
"llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n", | |
"llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n", | |
"llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8\n", | |
"llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n", | |
"llama_model_loader: - kv 10: llama.rope.freq_base f32 = 1000000.000000\n", | |
"llama_model_loader: - kv 11: general.file_type u32 = 15\n", | |
"llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n", | |
"llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32000] = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n", | |
"llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...\n", | |
"llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n", | |
"llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n", | |
"llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 2\n", | |
"llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0\n", | |
"llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 0\n", | |
"llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true\n", | |
"llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false\n", | |
"llama_model_loader: - kv 22: tokenizer.chat_template str = {{ bos_token }}{% for message in mess...\n", | |
"llama_model_loader: - kv 23: general.quantization_version u32 = 2\n", | |
"llama_model_loader: - type f32: 65 tensors\n", | |
"llama_model_loader: - type q4_K: 193 tensors\n", | |
"llama_model_loader: - type q6_K: 33 tensors\n", | |
"llm_load_vocab: special tokens definition check successful ( 259/32000 ).\n", | |
"llm_load_print_meta: format = GGUF V3 (latest)\n", | |
"llm_load_print_meta: arch = llama\n", | |
"llm_load_print_meta: vocab type = SPM\n", | |
"llm_load_print_meta: n_vocab = 32000\n", | |
"llm_load_print_meta: n_merges = 0\n", | |
"llm_load_print_meta: n_ctx_train = 32768\n", | |
"llm_load_print_meta: n_embd = 4096\n", | |
"llm_load_print_meta: n_head = 32\n", | |
"llm_load_print_meta: n_head_kv = 8\n", | |
"llm_load_print_meta: n_layer = 32\n", | |
"llm_load_print_meta: n_rot = 128\n", | |
"llm_load_print_meta: n_embd_head_k = 128\n", | |
"llm_load_print_meta: n_embd_head_v = 128\n", | |
"llm_load_print_meta: n_gqa = 4\n", | |
"llm_load_print_meta: n_embd_k_gqa = 1024\n", | |
"llm_load_print_meta: n_embd_v_gqa = 1024\n", | |
"llm_load_print_meta: f_norm_eps = 0.0e+00\n", | |
"llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n", | |
"llm_load_print_meta: f_clamp_kqv = 0.0e+00\n", | |
"llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n", | |
"llm_load_print_meta: n_ff = 14336\n", | |
"llm_load_print_meta: n_expert = 0\n", | |
"llm_load_print_meta: n_expert_used = 0\n", | |
"llm_load_print_meta: pooling type = 0\n", | |
"llm_load_print_meta: rope type = 0\n", | |
"llm_load_print_meta: rope scaling = linear\n", | |
"llm_load_print_meta: freq_base_train = 1000000.0\n", | |
"llm_load_print_meta: freq_scale_train = 1\n", | |
"llm_load_print_meta: n_yarn_orig_ctx = 32768\n", | |
"llm_load_print_meta: rope_finetuned = unknown\n", | |
"llm_load_print_meta: ssm_d_conv = 0\n", | |
"llm_load_print_meta: ssm_d_inner = 0\n", | |
"llm_load_print_meta: ssm_d_state = 0\n", | |
"llm_load_print_meta: ssm_dt_rank = 0\n", | |
"llm_load_print_meta: model type = 7B\n", | |
"llm_load_print_meta: model ftype = Q4_K - Medium\n", | |
"llm_load_print_meta: model params = 7.24 B\n", | |
"llm_load_print_meta: model size = 4.07 GiB (4.83 BPW) \n", | |
"llm_load_print_meta: general.name = mistralai_mistral-7b-instruct-v0.2\n", | |
"llm_load_print_meta: BOS token = 1 '<s>'\n", | |
"llm_load_print_meta: EOS token = 2 '</s>'\n", | |
"llm_load_print_meta: UNK token = 0 '<unk>'\n", | |
"llm_load_print_meta: PAD token = 0 '<unk>'\n", | |
"llm_load_print_meta: LF token = 13 '<0x0A>'\n", | |
"llm_load_tensors: ggml ctx size = 0.22 MiB\n", | |
"llm_load_tensors: offloading 32 repeating layers to GPU\n", | |
"llm_load_tensors: offloaded 32/33 layers to GPU\n", | |
"llm_load_tensors: CPU buffer size = 4165.37 MiB\n", | |
"llm_load_tensors: CUDA0 buffer size = 3992.50 MiB\n", | |
".................................................................................................\n", | |
"llama_new_context_with_model: n_ctx = 2048\n", | |
"llama_new_context_with_model: freq_base = 1000000.0\n", | |
"llama_new_context_with_model: freq_scale = 1\n", | |
"llama_kv_cache_init: CUDA0 KV buffer size = 256.00 MiB\n", | |
"llama_new_context_with_model: KV self size = 256.00 MiB, K (f16): 128.00 MiB, V (f16): 128.00 MiB\n", | |
"llama_new_context_with_model: CUDA_Host input buffer size = 13.02 MiB\n", | |
"llama_new_context_with_model: CUDA0 compute buffer size = 164.00 MiB\n", | |
"llama_new_context_with_model: CUDA_Host compute buffer size = 78.50 MiB\n", | |
"llama_new_context_with_model: graph splits (measure): 3\n", | |
"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | \n", | |
"Model metadata: {'tokenizer.chat_template': \"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}\", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '32768', 'general.name': 'mistralai_mistral-7b-instruct-v0.2', 'tokenizer.ggml.add_bos_token': 'true', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '14336', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'tokenizer.ggml.bos_token_id': '1', 'llama.attention.head_count': '32', 'llama.block_count': '32', 'llama.attention.head_count_kv': '8', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '15'}\n" | |
] | |
} | |
] | |
}, | |
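    {
      "cell_type": "markdown",
      "source": [
        "For context: `LlamaPromptLookupDecoding` is a speculative-decoding draft model that needs no second LLM. Roughly, it searches the existing context for an earlier occurrence of the most recent n-gram and, on a match, proposes the tokens that followed it as a draft, which the main model then verifies in a single batch. This tends to pay off when the output repeats spans of the prompt, e.g. structured JSON with fixed keys. The next cell is a minimal, self-contained sketch of that lookup idea; it is an illustration under stated assumptions, not the library's actual code, and the function name and integer token IDs are made up."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "# Minimal sketch of the n-gram lookup idea behind prompt lookup decoding.\n",
        "# Illustration only; llama-cpp-python implements this internally.\n",
        "def prompt_lookup_draft(input_ids, max_ngram_size=2, num_pred_tokens=10):\n",
        "    # Try the longest trailing n-gram of the context first, then shorter ones.\n",
        "    for ngram_size in range(max_ngram_size, 0, -1):\n",
        "        ngram = input_ids[-ngram_size:]\n",
        "        # Scan for an earlier occurrence of that n-gram in the context.\n",
        "        for i in range(len(input_ids) - ngram_size):\n",
        "            if input_ids[i:i + ngram_size] == ngram:\n",
        "                start = i + ngram_size\n",
        "                # Propose the tokens that followed the earlier occurrence\n",
        "                # as a draft; the main model verifies them in one batch.\n",
        "                return input_ids[start:start + num_pred_tokens]\n",
        "    return []  # no match: no draft, decode normally\n",
        "\n",
        "# The trailing bigram [3, 4] occurred earlier, so its old continuation\n",
        "# is proposed as the draft; this prints [5, 6, 7].\n",
        "print(prompt_lookup_draft([1, 2, 3, 4, 5, 6, 7, 3, 4], num_pred_tokens=3))"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },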
    {
      "cell_type": "code",
      "source": [
        "llm_model.verbose = False"
      ],
      "metadata": {
        "id": "ffZ88Q3veVa3"
      },
      "execution_count": 19,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Time with LlamaPromptLookupDecoding"
      ],
      "metadata": {
        "id": "W-i9-md8om14"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import time\n",
        "import statistics\n",
        "\n",
        "messages = [\n",
        "    {\n",
        "        \"role\": \"system\",\n",
        "        \"content\": \"You are a helpful assistant that outputs in JSON.\",\n",
        "    }\n",
        "]\n",
        "\n",
        "persons = [\n",
        "    \"Mozart\",\n",
        "    \"Einstein\",\n",
        "    \"Beethoven\",\n",
        "    \"Newton\",\n",
        "    \"Shakespeare\",\n",
        "    \"Galileo\",\n",
        "    \"Da Vinci\",\n",
        "    \"Tesla\",\n",
        "    \"Curie\",\n",
        "    \"Bach\",\n",
        "    \"Picasso\",\n",
        "    \"Gutenberg\",\n",
        "    \"Darwin\",\n",
        "    \"Hemingway\",\n",
        "    \"Franklin\",\n",
        "    \"Hawking\",\n",
        "    \"Turing\",\n",
        "    \"Chopin\",\n",
        "    \"Gandhi\",\n",
        "    \"Lennon\"\n",
        "]\n",
        "\n",
        "response_format = {\n",
        "    \"type\": \"json_object\",\n",
        "    \"schema\": {\n",
        "        \"type\": \"object\",\n",
        "        \"properties\": {\n",
        "            \"born_in_Germany\": {\"type\": \"boolean\"},\n",
        "        },\n",
        "        \"required\": [\"born_in_Germany\"],\n",
        "    },\n",
        "}\n",
        "\n",
        "times = []\n",
        "outputs = []\n",
        "for person in persons:\n",
        "    start_time = time.time()\n",
        "    messages.append({\"role\": \"user\", \"content\": f\"Was the following person born in Germany? person: {person}\"})\n",
        "    output = llm_model.create_chat_completion(\n",
        "        messages=messages,\n",
        "        response_format=response_format,\n",
        "        temperature=0.7,\n",
        "    )\n",
        "    end_time = time.time()\n",
        "    elapsed_time = end_time - start_time\n",
        "    outputs.append(output)\n",
        "    times.append(elapsed_time)\n",
        "    messages.pop()  # Remove the user message for the next iteration\n",
        "\n",
        "# Print statistics\n",
        "print(\"Time Statistics:\")\n",
        "print(\"Minimum time:\", min(times))\n",
        "print(\"Maximum time:\", max(times))\n",
        "print(\"Average time:\", statistics.mean(times))\n",
        "print(\"Median time:\", statistics.median(times))\n",
        "print(\"Standard Deviation:\", statistics.stdev(times))\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "nLoHt3r4cueq",
        "outputId": "71ed94bb-4063-47d5-f10c-619612ad5f7d"
      },
      "execution_count": 20,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Time Statistics:\n",
            "Minimum time: 2.17183518409729\n",
            "Maximum time: 3.079291343688965\n",
            "Average time: 2.4745811223983765\n",
            "Median time: 2.330846905708313\n",
            "Standard Deviation: 0.2737162183714513\n"
          ]
        }
      ]
    },
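    {
      "cell_type": "markdown",
      "source": [
        "Each response also records its token usage, so the wall-clock times above can be turned into a rough throughput figure. The next cell is a small added sketch (not part of the original benchmark run); it reuses the `times` and `outputs` lists from the cell above, and the numbers include prompt processing, so treat them as approximate."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "# Rough per-request decode throughput: completion tokens divided by the\n",
        "# measured wall time (which also includes prompt processing).\n",
        "tokens_per_second = [\n",
        "    out[\"usage\"][\"completion_tokens\"] / t for out, t in zip(outputs, times)\n",
        "]\n",
        "print(\"Mean completion tokens/s:\", statistics.mean(tokens_per_second))"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },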
    {
      "cell_type": "markdown",
      "source": [
        "## Time without LlamaPromptLookupDecoding"
      ],
      "metadata": {
        "id": "MBVbhQlBowyV"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "llm_model.draft_model = None"
      ],
      "metadata": {
        "id": "IsblXWnTeZsY"
      },
      "execution_count": 21,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import time\n",
        "import statistics\n",
        "\n",
        "messages = [\n",
        "    {\n",
        "        \"role\": \"system\",\n",
        "        \"content\": \"You are a helpful assistant that outputs in JSON.\",\n",
        "    }\n",
        "]\n",
        "\n",
        "persons = [\n",
        "    \"Mozart\",\n",
        "    \"Einstein\",\n",
        "    \"Beethoven\",\n",
        "    \"Newton\",\n",
        "    \"Shakespeare\",\n",
        "    \"Galileo\",\n",
        "    \"Da Vinci\",\n",
        "    \"Tesla\",\n",
        "    \"Curie\",\n",
        "    \"Bach\",\n",
        "    \"Picasso\",\n",
        "    \"Gutenberg\",\n",
        "    \"Darwin\",\n",
        "    \"Hemingway\",\n",
        "    \"Franklin\",\n",
        "    \"Hawking\",\n",
        "    \"Turing\",\n",
        "    \"Chopin\",\n",
        "    \"Gandhi\",\n",
        "    \"Lennon\"\n",
        "]\n",
        "\n",
        "response_format = {\n",
        "    \"type\": \"json_object\",\n",
        "    \"schema\": {\n",
        "        \"type\": \"object\",\n",
        "        \"properties\": {\n",
        "            \"born_in_Germany\": {\"type\": \"boolean\"},\n",
        "        },\n",
        "        \"required\": [\"born_in_Germany\"],\n",
        "    },\n",
        "}\n",
        "\n",
        "times = []\n",
        "outputs = []\n",
        "for person in persons:\n",
        "    start_time = time.time()\n",
        "    messages.append({\"role\": \"user\", \"content\": f\"Was the following person born in Germany? person: {person}\"})\n",
        "    output = llm_model.create_chat_completion(\n",
        "        messages=messages,\n",
        "        response_format=response_format,\n",
        "        temperature=0.7,\n",
        "    )\n",
        "    end_time = time.time()\n",
        "    elapsed_time = end_time - start_time\n",
        "    outputs.append(output)\n",
        "    times.append(elapsed_time)\n",
        "    messages.pop()  # Remove the user message for the next iteration\n",
        "\n",
        "# Print statistics\n",
        "print(\"Time Statistics:\")\n",
        "print(\"Minimum time:\", min(times))\n",
        "print(\"Maximum time:\", max(times))\n",
        "print(\"Average time:\", statistics.mean(times))\n",
        "print(\"Median time:\", statistics.median(times))\n",
        "print(\"Standard Deviation:\", statistics.stdev(times))\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "wKvR5-aOl17U",
        "outputId": "b149937b-2b43-4f57-fd4d-3d51fef9aa69"
      },
      "execution_count": 22,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Time Statistics:\n",
            "Minimum time: 1.9035820960998535\n",
            "Maximum time: 2.636012077331543\n",
            "Average time: 2.138879108428955\n",
            "Median time: 2.0756272077560425\n",
            "Standard Deviation: 0.19751100449869458\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(outputs)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ZrjGJpQ3mYZP",
        "outputId": "59071388-9276-470e-fce3-05dd51711700"
      },
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[{'id': 'chatcmpl-129dcc16-fc5b-4a34-9f6b-8e4e4330f3e4', 'object': 'chat.completion', 'created': 1710514210, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-31159573-d1c5-43cd-b2d1-bf96a1ed97cf', 'object': 'chat.completion', 'created': 1710514212, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": true } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 13, 'total_tokens': 78}}, {'id': 'chatcmpl-dc8117a1-a6d6-4881-9ea5-6a605abbbed1', 'object': 'chat.completion', 'created': 1710514215, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": true } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}, {'id': 'chatcmpl-c41a65c4-5830-4855-bbe2-b16b5fd975a2', 'object': 'chat.completion', 'created': 1710514217, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 12, 'total_tokens': 77}}, {'id': 'chatcmpl-865d3251-36a0-4297-b0ae-1d93845fefb3', 'object': 'chat.completion', 'created': 1710514219, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 12, 'total_tokens': 77}}, {'id': 'chatcmpl-3ae5ecde-d0aa-4396-8df6-f2c3a544e62b', 'object': 'chat.completion', 'created': 1710514221, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 12, 'total_tokens': 79}}, {'id': 'chatcmpl-d1d4f661-31fa-43ef-8f3a-de822f58c6c9', 'object': 'chat.completion', 'created': 1710514223, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}, {'id': 'chatcmpl-0072defb-2b67-495c-9c5c-8c87ba00f219', 'object': 'chat.completion', 'created': 1710514225, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-2f9ef1cf-f27f-4c7f-b9fe-a66032889961', 'object': 'chat.completion', 'created': 1710514228, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-6f22c04e-8cbd-429a-80bc-1d8239e631e6', 'object': 'chat.completion', 'created': 1710514230, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": true } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 13, 'total_tokens': 78}}, {'id': 'chatcmpl-897602ff-aa5b-4e6d-aded-a33cb3fad259', 'object': 'chat.completion', 'created': 1710514232, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-823ef398-bea2-4c90-af3c-0aa0b1c80969', 'object': 'chat.completion', 'created': 1710514234, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": true } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}, {'id': 'chatcmpl-06071170-4683-4638-bb02-7942482ab308', 'object': 'chat.completion', 'created': 1710514236, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 13, 'total_tokens': 78}}, {'id': 'chatcmpl-4949dc54-45ae-47b6-b4d3-764d2677e3bd', 'object': 'chat.completion', 'created': 1710514238, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 12, 'total_tokens': 79}}, {'id': 'chatcmpl-5370272a-b44f-4f33-b59d-a475131945c9', 'object': 'chat.completion', 'created': 1710514241, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 13, 'total_tokens': 78}}, {'id': 'chatcmpl-f3c18cd5-7df6-403d-afbf-d88aabb6ab25', 'object': 'chat.completion', 'created': 1710514243, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 12, 'total_tokens': 78}}, {'id': 'chatcmpl-d3a18fb5-e1d5-42b9-88f9-1942e819c552', 'object': 'chat.completion', 'created': 1710514245, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-dd064549-c240-4c42-a80d-fb99e75133d7', 'object': 'chat.completion', 'created': 1710514247, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\":false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}, {'id': 'chatcmpl-10c5933b-5796-4317-9207-560b706da4bf', 'object': 'chat.completion', 'created': 1710514249, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 12, 'total_tokens': 78}}, {'id': 'chatcmpl-52cbc08d-1c5b-4e2d-8873-55d182a2452e', 'object': 'chat.completion', 'created': 1710514251, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}]\n"
          ]
        }
      ]
    },
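    {
      "cell_type": "markdown",
      "source": [
        "Because `response_format` constrains each reply to the JSON schema defined above, the answers can be parsed directly with the `json` module imported at the top. A small added sketch (not part of the original run):"
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "# Parse the structured answer out of each chat-completion response and\n",
        "# pair it with the person it was asked about.\n",
        "for person, out in zip(persons, outputs):\n",
        "    content = out[\"choices\"][0][\"message\"][\"content\"]\n",
        "    answer = json.loads(content)\n",
        "    print(person, \"->\", answer[\"born_in_Germany\"])"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },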
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "Gxx7MbrsmZ-W"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
} |