{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyOush5N5IhCtLKyFy+o6tez",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/Garstig/be3825e7a8dc4ffdb4b76a2509553081/speetest-llama.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "HwTl8I3wbl6v",
"outputId": "dcb5bed1-22d5-47fd-f611-a32400d192c3"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"--2024-03-15 14:22:19-- https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true\n",
"Resolving huggingface.co (huggingface.co)... 13.33.33.110, 13.33.33.20, 13.33.33.55, ...\n",
"Connecting to huggingface.co (huggingface.co)|13.33.33.110|:443... connected.\n",
"HTTP request sent, awaiting response... 302 Found\n",
"Location: https://cdn-lfs-us-1.huggingface.co/repos/72/62/726219e98582d16c24a66629a4dec1b0761b91c918e15dea2625b4293c134a92/3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27mistral-7b-instruct-v0.2.Q4_K_M.gguf%3B+filename%3D%22mistral-7b-instruct-v0.2.Q4_K_M.gguf%22%3B&Expires=1710771740&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxMDc3MTc0MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzcyLzYyLzcyNjIxOWU5ODU4MmQxNmMyNGE2NjYyOWE0ZGVjMWIwNzYxYjkxYzkxOGUxNWRlYTI2MjViNDI5M2MxMzRhOTIvM2UwMDM5ZmQwMjczZmNiZWJiNDkyMjg5NDNiMTc4MzFhYWRkNTVjYmNiZjU2ZjBhZjAwNDk5YmUyMDQwY2NmOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=DwxjckiCSLZxRbJMcJqoFrILNQeoCHE0PNPasxh0cYe36C239tf1pIPUIZJFcfIi9a7SIDkaRGqKZs%7E6kkf-pNp7zhRKvrmHdJ1bryhxLK16RgTzf5AdpybJTq8lDr8GTPLU4adrQ3mfM9kLFTlS88ADHCjzUgi1p7oOiPgDTY3mNHzsO9PYEfKYeHod67OljkGYh-wmHELAkN08CWcrsbMOhDQsabuamIeyytjOGCco9hc9JymeX1AubfRcqrP1PdSIYASTggquqntvYijucoPAAJPYm1AC2nIv-SLplw2R2%7ELgXqilzM%7Ep7266XZHCbP0sCSdErqZJcC10%7EvBCdQ__&Key-Pair-Id=KCD77M1F0VK2B [following]\n",
"--2024-03-15 14:22:20-- https://cdn-lfs-us-1.huggingface.co/repos/72/62/726219e98582d16c24a66629a4dec1b0761b91c918e15dea2625b4293c134a92/3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27mistral-7b-instruct-v0.2.Q4_K_M.gguf%3B+filename%3D%22mistral-7b-instruct-v0.2.Q4_K_M.gguf%22%3B&Expires=1710771740&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxMDc3MTc0MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzcyLzYyLzcyNjIxOWU5ODU4MmQxNmMyNGE2NjYyOWE0ZGVjMWIwNzYxYjkxYzkxOGUxNWRlYTI2MjViNDI5M2MxMzRhOTIvM2UwMDM5ZmQwMjczZmNiZWJiNDkyMjg5NDNiMTc4MzFhYWRkNTVjYmNiZjU2ZjBhZjAwNDk5YmUyMDQwY2NmOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=DwxjckiCSLZxRbJMcJqoFrILNQeoCHE0PNPasxh0cYe36C239tf1pIPUIZJFcfIi9a7SIDkaRGqKZs%7E6kkf-pNp7zhRKvrmHdJ1bryhxLK16RgTzf5AdpybJTq8lDr8GTPLU4adrQ3mfM9kLFTlS88ADHCjzUgi1p7oOiPgDTY3mNHzsO9PYEfKYeHod67OljkGYh-wmHELAkN08CWcrsbMOhDQsabuamIeyytjOGCco9hc9JymeX1AubfRcqrP1PdSIYASTggquqntvYijucoPAAJPYm1AC2nIv-SLplw2R2%7ELgXqilzM%7Ep7266XZHCbP0sCSdErqZJcC10%7EvBCdQ__&Key-Pair-Id=KCD77M1F0VK2B\n",
"Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 108.157.254.127, 108.157.254.63, 108.157.254.89, ...\n",
"Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|108.157.254.127|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 4368439584 (4.1G) [binary/octet-stream]\n",
"Saving to: ‘mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true’\n",
"\n",
"mistral-7b-instruct 100%[===================>] 4.07G 230MB/s in 22s \n",
"\n",
"2024-03-15 14:22:42 (191 MB/s) - ‘mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true’ saved [4368439584/4368439584]\n",
"\n"
]
}
],
"source": [
"!wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true"
]
},
{
"cell_type": "code",
"source": [
"!mv mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true mistral-7b-instruct-v0.2.Q4_K_M.gguf"
],
"metadata": {
"id": "_3Mw1FBUc8AP"
},
"execution_count": 2,
"outputs": []
},
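{
"cell_type": "markdown",
"source": [
"Side note: the `?download=true` query string ends up in the saved filename, which is why the `mv` above is needed. A commented-out alternative (assuming the `huggingface_hub` package that ships with Colab; illustrative, not part of the original run) is sketched below: it saves the file under its proper name and caches it."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional alternative to wget + mv (kept commented out, not executed here):\n",
"# hf_hub_download fetches the file under its original name and caches it.\n",
"# from huggingface_hub import hf_hub_download\n",
"# model_file = hf_hub_download(\n",
"#     repo_id=\"TheBloke/Mistral-7B-Instruct-v0.2-GGUF\",\n",
"#     filename=\"mistral-7b-instruct-v0.2.Q4_K_M.gguf\",\n",
"# )"
],
"metadata": {},
"execution_count": null,
"outputs": []
},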
{
"cell_type": "code",
"source": [
"#!CMAKE_ARGS=\"-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS\" pip install llama-cpp-python"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "CVP9bquBcg5n",
"outputId": "17b6af53-eb7d-4a2f-fb00-83485a538783"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting llama-cpp-python\n",
" Downloading llama_cpp_python-0.2.56.tar.gz (36.9 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.9/36.9 MB\u001b[0m \u001b[31m15.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: typing-extensions>=4.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (4.10.0)\n",
"Requirement already satisfied: numpy>=1.20.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (1.25.2)\n",
"Collecting diskcache>=5.6.1 (from llama-cpp-python)\n",
" Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.5/45.5 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: jinja2>=2.11.3 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (3.1.3)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2>=2.11.3->llama-cpp-python) (2.1.5)\n",
"Building wheels for collected packages: llama-cpp-python\n",
" Building wheel for llama-cpp-python (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.56-cp310-cp310-manylinux_2_35_x86_64.whl size=2841182 sha256=1e91e97a338bc2daa1cc72d23c4f0d41ed972b1e0e4796d73e3a7e28b178f077\n",
" Stored in directory: /root/.cache/pip/wheels/e5/09/9d/c413053f6258cb2546cc792418c595e276f9efd5db31a80377\n",
"Successfully built llama-cpp-python\n",
"Installing collected packages: diskcache, llama-cpp-python\n",
"Successfully installed diskcache-5.6.3 llama-cpp-python-0.2.56\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!CMAKE_ARGS=\"-DLLAMA_CUBLAS=on\" FORCE_CMAKE=1 pip install llama-cpp-python"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Tz6FJpOufWXL",
"outputId": "8ce6419b-1e6c-4a09-959b-be199a513111"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting llama-cpp-python\n",
" Downloading llama_cpp_python-0.2.56.tar.gz (36.9 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.9/36.9 MB\u001b[0m \u001b[31m17.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: typing-extensions>=4.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (4.10.0)\n",
"Requirement already satisfied: numpy>=1.20.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (1.25.2)\n",
"Collecting diskcache>=5.6.1 (from llama-cpp-python)\n",
" Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.5/45.5 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: jinja2>=2.11.3 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (3.1.3)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2>=2.11.3->llama-cpp-python) (2.1.5)\n",
"Building wheels for collected packages: llama-cpp-python\n",
" Building wheel for llama-cpp-python (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.56-cp310-cp310-manylinux_2_35_x86_64.whl size=26291647 sha256=4d08ee7d43ebc79a5c7c0d302e11645c6a07c79c5e1c2391514742ed1842af65\n",
" Stored in directory: /root/.cache/pip/wheels/e5/09/9d/c413053f6258cb2546cc792418c595e276f9efd5db31a80377\n",
"Successfully built llama-cpp-python\n",
"Installing collected packages: diskcache, llama-cpp-python\n",
"Successfully installed diskcache-5.6.3 llama-cpp-python-0.2.56\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from llama_cpp import Llama\n",
"import json\n",
"from llama_cpp.llama_speculative import LlamaPromptLookupDecoding"
],
"metadata": {
"id": "piOITZL8bshq"
},
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model_path = \"mistral-7b-instruct-v0.2.Q4_K_M.gguf\"\n"
],
"metadata": {
"id": "CmA0k1DFcJJ5"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!ls"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "HzAhT3sOdGS4",
"outputId": "a5636d18-a4fd-4b5e-ccb3-dcd0f3943cbc"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"mistral-7b-instruct-v0.2.Q4_K_M.gguf sample_data\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!sha256sum mistral-7b-instruct-v0.2.Q4_K_M.gguf"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "V-1QMKA3d4_u",
"outputId": "10f59e33-f863-415f-ad99-b49a71bf4e85"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9 mistral-7b-instruct-v0.2.Q4_K_M.gguf\n"
]
}
]
},
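{
"cell_type": "markdown",
"source": [
"The digest above matches the blob hash in the CDN redirect from the download log, so the file arrived intact. As an optional extra, the next cell is a small sketch of the same check done in Python; the expected value is simply copied from the `sha256sum` output above."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"import hashlib\n",
"\n",
"# Recompute the SHA-256 digest in Python and compare it against the value\n",
"# reported by sha256sum above.\n",
"expected = \"3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9\"\n",
"sha = hashlib.sha256()\n",
"with open(model_path, \"rb\") as f:\n",
"    # Read in 1 MiB chunks so the 4 GiB file never has to fit in memory.\n",
"    for chunk in iter(lambda: f.read(1 << 20), b\"\"):\n",
"        sha.update(chunk)\n",
"assert sha.hexdigest() == expected, \"checksum mismatch, re-download the model\""
],
"metadata": {},
"execution_count": null,
"outputs": []
},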
{
"cell_type": "code",
"source": [
"llm_model = Llama(\n",
" model_path=model_path,\n",
" n_gpu_layers=32,\n",
" n_ctx=2048,\n",
" chat_format=\"chatml\",\n",
" n_threads = 8,\n",
" draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines.\n",
" )"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YXJuz5P7ct_C",
"outputId": "d8a45718-cfb2-49ed-c42e-18109b9ac661"
},
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))\n",
"llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
"llama_model_loader: - kv 0: general.architecture str = llama\n",
"llama_model_loader: - kv 1: general.name str = mistralai_mistral-7b-instruct-v0.2\n",
"llama_model_loader: - kv 2: llama.context_length u32 = 32768\n",
"llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n",
"llama_model_loader: - kv 4: llama.block_count u32 = 32\n",
"llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336\n",
"llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n",
"llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n",
"llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8\n",
"llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n",
"llama_model_loader: - kv 10: llama.rope.freq_base f32 = 1000000.000000\n",
"llama_model_loader: - kv 11: general.file_type u32 = 15\n",
"llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n",
"llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32000] = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n",
"llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...\n",
"llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
"llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n",
"llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 2\n",
"llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0\n",
"llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 0\n",
"llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true\n",
"llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false\n",
"llama_model_loader: - kv 22: tokenizer.chat_template str = {{ bos_token }}{% for message in mess...\n",
"llama_model_loader: - kv 23: general.quantization_version u32 = 2\n",
"llama_model_loader: - type f32: 65 tensors\n",
"llama_model_loader: - type q4_K: 193 tensors\n",
"llama_model_loader: - type q6_K: 33 tensors\n",
"llm_load_vocab: special tokens definition check successful ( 259/32000 ).\n",
"llm_load_print_meta: format = GGUF V3 (latest)\n",
"llm_load_print_meta: arch = llama\n",
"llm_load_print_meta: vocab type = SPM\n",
"llm_load_print_meta: n_vocab = 32000\n",
"llm_load_print_meta: n_merges = 0\n",
"llm_load_print_meta: n_ctx_train = 32768\n",
"llm_load_print_meta: n_embd = 4096\n",
"llm_load_print_meta: n_head = 32\n",
"llm_load_print_meta: n_head_kv = 8\n",
"llm_load_print_meta: n_layer = 32\n",
"llm_load_print_meta: n_rot = 128\n",
"llm_load_print_meta: n_embd_head_k = 128\n",
"llm_load_print_meta: n_embd_head_v = 128\n",
"llm_load_print_meta: n_gqa = 4\n",
"llm_load_print_meta: n_embd_k_gqa = 1024\n",
"llm_load_print_meta: n_embd_v_gqa = 1024\n",
"llm_load_print_meta: f_norm_eps = 0.0e+00\n",
"llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
"llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
"llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
"llm_load_print_meta: n_ff = 14336\n",
"llm_load_print_meta: n_expert = 0\n",
"llm_load_print_meta: n_expert_used = 0\n",
"llm_load_print_meta: pooling type = 0\n",
"llm_load_print_meta: rope type = 0\n",
"llm_load_print_meta: rope scaling = linear\n",
"llm_load_print_meta: freq_base_train = 1000000.0\n",
"llm_load_print_meta: freq_scale_train = 1\n",
"llm_load_print_meta: n_yarn_orig_ctx = 32768\n",
"llm_load_print_meta: rope_finetuned = unknown\n",
"llm_load_print_meta: ssm_d_conv = 0\n",
"llm_load_print_meta: ssm_d_inner = 0\n",
"llm_load_print_meta: ssm_d_state = 0\n",
"llm_load_print_meta: ssm_dt_rank = 0\n",
"llm_load_print_meta: model type = 7B\n",
"llm_load_print_meta: model ftype = Q4_K - Medium\n",
"llm_load_print_meta: model params = 7.24 B\n",
"llm_load_print_meta: model size = 4.07 GiB (4.83 BPW) \n",
"llm_load_print_meta: general.name = mistralai_mistral-7b-instruct-v0.2\n",
"llm_load_print_meta: BOS token = 1 '<s>'\n",
"llm_load_print_meta: EOS token = 2 '</s>'\n",
"llm_load_print_meta: UNK token = 0 '<unk>'\n",
"llm_load_print_meta: PAD token = 0 '<unk>'\n",
"llm_load_print_meta: LF token = 13 '<0x0A>'\n",
"llm_load_tensors: ggml ctx size = 0.22 MiB\n",
"llm_load_tensors: offloading 32 repeating layers to GPU\n",
"llm_load_tensors: offloaded 32/33 layers to GPU\n",
"llm_load_tensors: CPU buffer size = 4165.37 MiB\n",
"llm_load_tensors: CUDA0 buffer size = 3992.50 MiB\n",
".................................................................................................\n",
"llama_new_context_with_model: n_ctx = 2048\n",
"llama_new_context_with_model: freq_base = 1000000.0\n",
"llama_new_context_with_model: freq_scale = 1\n",
"llama_kv_cache_init: CUDA0 KV buffer size = 256.00 MiB\n",
"llama_new_context_with_model: KV self size = 256.00 MiB, K (f16): 128.00 MiB, V (f16): 128.00 MiB\n",
"llama_new_context_with_model: CUDA_Host input buffer size = 13.02 MiB\n",
"llama_new_context_with_model: CUDA0 compute buffer size = 164.00 MiB\n",
"llama_new_context_with_model: CUDA_Host compute buffer size = 78.50 MiB\n",
"llama_new_context_with_model: graph splits (measure): 3\n",
"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | \n",
"Model metadata: {'tokenizer.chat_template': \"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}\", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '32768', 'general.name': 'mistralai_mistral-7b-instruct-v0.2', 'tokenizer.ggml.add_bos_token': 'true', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '14336', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'tokenizer.ggml.bos_token_id': '1', 'llama.attention.head_count': '32', 'llama.block_count': '32', 'llama.attention.head_count_kv': '8', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '15'}\n"
]
}
]
},
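{
"cell_type": "markdown",
"source": [
"`LlamaPromptLookupDecoding` is a draft model for speculative decoding that needs no second network: it looks for an earlier occurrence of the most recent n-gram in the context and proposes the tokens that followed it, which the main model then verifies in one batched forward pass. The next cell is a minimal pure-Python sketch of that lookup step, not the library's implementation; `lookup_draft` and the toy token list are illustrative only."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch of the n-gram lookup behind prompt lookup decoding: find the most\n",
"# recent earlier occurrence of the last `ngram_size` tokens and return the\n",
"# tokens that followed it as a cheap draft.\n",
"def lookup_draft(tokens, ngram_size=2, num_pred_tokens=10):\n",
"    if len(tokens) <= ngram_size:\n",
"        return []\n",
"    tail = tokens[-ngram_size:]\n",
"    # Scan backwards (excluding the tail itself) so the most recent match wins.\n",
"    for i in range(len(tokens) - ngram_size - 1, -1, -1):\n",
"        if tokens[i:i + ngram_size] == tail:\n",
"            start = i + ngram_size\n",
"            return tokens[start:start + num_pred_tokens]\n",
"    return []\n",
"\n",
"# The repeated bigram (5, 7) lets the draft guess the continuation (9, 11, 13).\n",
"print(lookup_draft([5, 7, 9, 11, 13, 5, 7], ngram_size=2, num_pred_tokens=3))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},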
{
"cell_type": "code",
"source": [
"llm_model.verbose = False"
],
"metadata": {
"id": "ffZ88Q3veVa3"
},
"execution_count": 19,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Time with LlamaPromptLookupDecoding"
],
"metadata": {
"id": "W-i9-md8om14"
}
},
{
"cell_type": "code",
"source": [
"import time\n",
"import statistics\n",
"\n",
"messages = [\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": \"You are a helpful assistant that outputs in JSON.\",\n",
" }\n",
"]\n",
"\n",
"persons = [\n",
" \"Mozart\",\n",
" \"Einstein\",\n",
" \"Beethoven\",\n",
" \"Newton\",\n",
" \"Shakespeare\",\n",
" \"Galileo\",\n",
" \"Da Vinci\",\n",
" \"Tesla\",\n",
" \"Curie\",\n",
" \"Bach\",\n",
" \"Picasso\",\n",
" \"Gutenberg\",\n",
" \"Darwin\",\n",
" \"Hemingway\",\n",
" \"Franklin\",\n",
" \"Hawking\",\n",
" \"Turing\",\n",
" \"Chopin\",\n",
" \"Gandhi\",\n",
" \"Lennon\"\n",
"]\n",
"\n",
"response_format = {\n",
" \"type\": \"json_object\",\n",
" \"schema\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"born_in_Germany\": {\"type\": \"boolean\"},\n",
" },\n",
" \"required\": [\"born_in_Germany\"],\n",
" },\n",
"}\n",
"\n",
"times = []\n",
"outputs = []\n",
"for person in persons:\n",
" start_time = time.time()\n",
" messages.append({\"role\": \"user\", \"content\": f\"Was the following person born in Germany? person: {person}\"})\n",
" output = llm_model.create_chat_completion(\n",
" messages=messages,\n",
" response_format=response_format,\n",
" temperature=0.7,\n",
" )\n",
" end_time = time.time()\n",
" elapsed_time = end_time - start_time\n",
" outputs.append(output)\n",
" times.append(elapsed_time)\n",
" messages.pop() # Remove the user message for the next iteration\n",
"\n",
"# Print statistics\n",
"print(\"Time Statistics:\")\n",
"print(\"Minimum time:\", min(times))\n",
"print(\"Maximum time:\", max(times))\n",
"print(\"Average time:\", statistics.mean(times))\n",
"print(\"Median time:\", statistics.median(times))\n",
"print(\"Standard Deviation:\", statistics.stdev(times))\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "nLoHt3r4cueq",
"outputId": "71ed94bb-4063-47d5-f10c-619612ad5f7d"
},
"execution_count": 20,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Time Statistics:\n",
"Minimum time: 2.17183518409729\n",
"Maximum time: 3.079291343688965\n",
"Average time: 2.4745811223983765\n",
"Median time: 2.330846905708313\n",
"Standard Deviation: 0.2737162183714513\n"
]
}
]
},
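{
"cell_type": "markdown",
"source": [
"Since `response_format` constrains generation to the schema, each completion's message content should parse cleanly as JSON. The cell below is a small sketch of consuming the OpenAI-style dicts collected in `outputs` (their exact structure is shown by `print(outputs)` further down)."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Parse the schema-constrained JSON answer out of each completion.\n",
"for person, output in zip(persons, outputs):\n",
"    content = output[\"choices\"][0][\"message\"][\"content\"]\n",
"    answer = json.loads(content)\n",
"    print(person, \"->\", answer[\"born_in_Germany\"])"
],
"metadata": {},
"execution_count": null,
"outputs": []
},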
{
"cell_type": "markdown",
"source": [
"## Time without LlamaPromptLookupDecoding"
],
"metadata": {
"id": "MBVbhQlBowyV"
}
},
{
"cell_type": "code",
"source": [
"llm_model.draft_model = None"
],
"metadata": {
"id": "IsblXWnTeZsY"
},
"execution_count": 21,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import time\n",
"import statistics\n",
"\n",
"messages = [\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": \"You are a helpful assistant that outputs in JSON.\",\n",
" }\n",
"]\n",
"\n",
"persons = [\n",
" \"Mozart\",\n",
" \"Einstein\",\n",
" \"Beethoven\",\n",
" \"Newton\",\n",
" \"Shakespeare\",\n",
" \"Galileo\",\n",
" \"Da Vinci\",\n",
" \"Tesla\",\n",
" \"Curie\",\n",
" \"Bach\",\n",
" \"Picasso\",\n",
" \"Gutenberg\",\n",
" \"Darwin\",\n",
" \"Hemingway\",\n",
" \"Franklin\",\n",
" \"Hawking\",\n",
" \"Turing\",\n",
" \"Chopin\",\n",
" \"Gandhi\",\n",
" \"Lennon\"\n",
"]\n",
"\n",
"response_format = {\n",
" \"type\": \"json_object\",\n",
" \"schema\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"born_in_Germany\": {\"type\": \"boolean\"},\n",
" },\n",
" \"required\": [\"born_in_Germany\"],\n",
" },\n",
"}\n",
"\n",
"times = []\n",
"outputs = []\n",
"for person in persons:\n",
" start_time = time.time()\n",
" messages.append({\"role\": \"user\", \"content\": f\"Was the following person born in Germany? person: {person}\"})\n",
" output = llm_model.create_chat_completion(\n",
" messages=messages,\n",
" response_format=response_format,\n",
" temperature=0.7,\n",
" )\n",
" end_time = time.time()\n",
" elapsed_time = end_time - start_time\n",
" outputs.append(output)\n",
" times.append(elapsed_time)\n",
" messages.pop() # Remove the user message for the next iteration\n",
"\n",
"# Print statistics\n",
"print(\"Time Statistics:\")\n",
"print(\"Minimum time:\", min(times))\n",
"print(\"Maximum time:\", max(times))\n",
"print(\"Average time:\", statistics.mean(times))\n",
"print(\"Median time:\", statistics.median(times))\n",
"print(\"Standard Deviation:\", statistics.stdev(times))\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wKvR5-aOl17U",
"outputId": "b149937b-2b43-4f57-fd4d-3d51fef9aa69"
},
"execution_count": 22,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Time Statistics:\n",
"Minimum time: 1.9035820960998535\n",
"Maximum time: 2.636012077331543\n",
"Average time: 2.138879108428955\n",
"Median time: 2.0756272077560425\n",
"Standard Deviation: 0.19751100449869458\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"print(outputs)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZrjGJpQ3mYZP",
"outputId": "59071388-9276-470e-fce3-05dd51711700"
},
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[{'id': 'chatcmpl-129dcc16-fc5b-4a34-9f6b-8e4e4330f3e4', 'object': 'chat.completion', 'created': 1710514210, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-31159573-d1c5-43cd-b2d1-bf96a1ed97cf', 'object': 'chat.completion', 'created': 1710514212, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": true } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 13, 'total_tokens': 78}}, {'id': 'chatcmpl-dc8117a1-a6d6-4881-9ea5-6a605abbbed1', 'object': 'chat.completion', 'created': 1710514215, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": true } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}, {'id': 'chatcmpl-c41a65c4-5830-4855-bbe2-b16b5fd975a2', 'object': 'chat.completion', 'created': 1710514217, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 12, 'total_tokens': 77}}, {'id': 'chatcmpl-865d3251-36a0-4297-b0ae-1d93845fefb3', 'object': 'chat.completion', 'created': 1710514219, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 12, 'total_tokens': 77}}, {'id': 'chatcmpl-3ae5ecde-d0aa-4396-8df6-f2c3a544e62b', 'object': 'chat.completion', 'created': 1710514221, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 12, 'total_tokens': 79}}, {'id': 'chatcmpl-d1d4f661-31fa-43ef-8f3a-de822f58c6c9', 'object': 'chat.completion', 'created': 1710514223, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}, {'id': 'chatcmpl-0072defb-2b67-495c-9c5c-8c87ba00f219', 'object': 'chat.completion', 'created': 1710514225, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-2f9ef1cf-f27f-4c7f-b9fe-a66032889961', 'object': 'chat.completion', 'created': 1710514228, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-6f22c04e-8cbd-429a-80bc-1d8239e631e6', 'object': 'chat.completion', 'created': 1710514230, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": true } 
'}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 13, 'total_tokens': 78}}, {'id': 'chatcmpl-897602ff-aa5b-4e6d-aded-a33cb3fad259', 'object': 'chat.completion', 'created': 1710514232, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-823ef398-bea2-4c90-af3c-0aa0b1c80969', 'object': 'chat.completion', 'created': 1710514234, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": true } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}, {'id': 'chatcmpl-06071170-4683-4638-bb02-7942482ab308', 'object': 'chat.completion', 'created': 1710514236, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 13, 'total_tokens': 78}}, {'id': 'chatcmpl-4949dc54-45ae-47b6-b4d3-764d2677e3bd', 'object': 'chat.completion', 'created': 1710514238, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 12, 'total_tokens': 79}}, {'id': 'chatcmpl-5370272a-b44f-4f33-b59d-a475131945c9', 'object': 'chat.completion', 'created': 1710514241, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 13, 'total_tokens': 78}}, {'id': 'chatcmpl-f3c18cd5-7df6-403d-afbf-d88aabb6ab25', 'object': 'chat.completion', 'created': 1710514243, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 12, 'total_tokens': 78}}, {'id': 'chatcmpl-d3a18fb5-e1d5-42b9-88f9-1942e819c552', 'object': 'chat.completion', 'created': 1710514245, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-dd064549-c240-4c42-a80d-fb99e75133d7', 'object': 'chat.completion', 'created': 1710514247, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\":false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}, {'id': 'chatcmpl-10c5933b-5796-4317-9207-560b706da4bf', 'object': 'chat.completion', 'created': 1710514249, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 12, 'total_tokens': 78}}, {'id': 'chatcmpl-52cbc08d-1c5b-4e2d-8873-55d182a2452e', 'object': 'chat.completion', 'created': 1710514251, 'model': 
'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}]\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "Gxx7MbrsmZ-W"
},
"execution_count": null,
"outputs": []
}
]
}