{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyOush5N5IhCtLKyFy+o6tez",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/Garstig/be3825e7a8dc4ffdb4b76a2509553081/speetest-llama.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "HwTl8I3wbl6v",
"outputId": "dcb5bed1-22d5-47fd-f611-a32400d192c3"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"--2024-03-15 14:22:19-- https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true\n",
"Resolving huggingface.co (huggingface.co)... 13.33.33.110, 13.33.33.20, 13.33.33.55, ...\n",
"Connecting to huggingface.co (huggingface.co)|13.33.33.110|:443... connected.\n",
"HTTP request sent, awaiting response... 302 Found\n",
"Location: https://cdn-lfs-us-1.huggingface.co/repos/72/62/726219e98582d16c24a66629a4dec1b0761b91c918e15dea2625b4293c134a92/3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27mistral-7b-instruct-v0.2.Q4_K_M.gguf%3B+filename%3D%22mistral-7b-instruct-v0.2.Q4_K_M.gguf%22%3B&Expires=1710771740&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxMDc3MTc0MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzcyLzYyLzcyNjIxOWU5ODU4MmQxNmMyNGE2NjYyOWE0ZGVjMWIwNzYxYjkxYzkxOGUxNWRlYTI2MjViNDI5M2MxMzRhOTIvM2UwMDM5ZmQwMjczZmNiZWJiNDkyMjg5NDNiMTc4MzFhYWRkNTVjYmNiZjU2ZjBhZjAwNDk5YmUyMDQwY2NmOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=DwxjckiCSLZxRbJMcJqoFrILNQeoCHE0PNPasxh0cYe36C239tf1pIPUIZJFcfIi9a7SIDkaRGqKZs%7E6kkf-pNp7zhRKvrmHdJ1bryhxLK16RgTzf5AdpybJTq8lDr8GTPLU4adrQ3mfM9kLFTlS88ADHCjzUgi1p7oOiPgDTY3mNHzsO9PYEfKYeHod67OljkGYh-wmHELAkN08CWcrsbMOhDQsabuamIeyytjOGCco9hc9JymeX1AubfRcqrP1PdSIYASTggquqntvYijucoPAAJPYm1AC2nIv-SLplw2R2%7ELgXqilzM%7Ep7266XZHCbP0sCSdErqZJcC10%7EvBCdQ__&Key-Pair-Id=KCD77M1F0VK2B [following]\n",
"--2024-03-15 14:22:20-- https://cdn-lfs-us-1.huggingface.co/repos/72/62/726219e98582d16c24a66629a4dec1b0761b91c918e15dea2625b4293c134a92/3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27mistral-7b-instruct-v0.2.Q4_K_M.gguf%3B+filename%3D%22mistral-7b-instruct-v0.2.Q4_K_M.gguf%22%3B&Expires=1710771740&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxMDc3MTc0MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzcyLzYyLzcyNjIxOWU5ODU4MmQxNmMyNGE2NjYyOWE0ZGVjMWIwNzYxYjkxYzkxOGUxNWRlYTI2MjViNDI5M2MxMzRhOTIvM2UwMDM5ZmQwMjczZmNiZWJiNDkyMjg5NDNiMTc4MzFhYWRkNTVjYmNiZjU2ZjBhZjAwNDk5YmUyMDQwY2NmOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=DwxjckiCSLZxRbJMcJqoFrILNQeoCHE0PNPasxh0cYe36C239tf1pIPUIZJFcfIi9a7SIDkaRGqKZs%7E6kkf-pNp7zhRKvrmHdJ1bryhxLK16RgTzf5AdpybJTq8lDr8GTPLU4adrQ3mfM9kLFTlS88ADHCjzUgi1p7oOiPgDTY3mNHzsO9PYEfKYeHod67OljkGYh-wmHELAkN08CWcrsbMOhDQsabuamIeyytjOGCco9hc9JymeX1AubfRcqrP1PdSIYASTggquqntvYijucoPAAJPYm1AC2nIv-SLplw2R2%7ELgXqilzM%7Ep7266XZHCbP0sCSdErqZJcC10%7EvBCdQ__&Key-Pair-Id=KCD77M1F0VK2B\n",
"Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 108.157.254.127, 108.157.254.63, 108.157.254.89, ...\n",
"Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|108.157.254.127|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 4368439584 (4.1G) [binary/octet-stream]\n",
"Saving to: ‘mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true’\n",
"\n",
"mistral-7b-instruct 100%[===================>] 4.07G 230MB/s in 22s \n",
"\n",
"2024-03-15 14:22:42 (191 MB/s) - ‘mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true’ saved [4368439584/4368439584]\n",
"\n"
]
}
],
"source": [
"!wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true"
]
},
{
"cell_type": "code",
"source": [
"!mv mistral-7b-instruct-v0.2.Q4_K_M.gguf?download=true mistral-7b-instruct-v0.2.Q4_K_M.gguf"
],
"metadata": {
"id": "_3Mw1FBUc8AP"
},
"execution_count": 2,
"outputs": []
},
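{
"cell_type": "markdown",
"source": [
"Side note: the `?download=true` query string ends up in the saved filename, which is why the `mv` above is needed. A commented-out alternative (assuming the `huggingface_hub` package that ships with Colab; illustrative, not part of the original run) is sketched below: it saves the file under its proper name and caches it."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional alternative to wget + mv (kept commented out, not executed here):\n",
"# hf_hub_download fetches the file under its original name and caches it.\n",
"# from huggingface_hub import hf_hub_download\n",
"# model_file = hf_hub_download(\n",
"#     repo_id=\"TheBloke/Mistral-7B-Instruct-v0.2-GGUF\",\n",
"#     filename=\"mistral-7b-instruct-v0.2.Q4_K_M.gguf\",\n",
"# )"
],
"metadata": {},
"execution_count": null,
"outputs": []
},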
{
"cell_type": "code",
"source": [
"#!CMAKE_ARGS=\"-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS\" pip install llama-cpp-python"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "CVP9bquBcg5n",
"outputId": "17b6af53-eb7d-4a2f-fb00-83485a538783"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting llama-cpp-python\n",
" Downloading llama_cpp_python-0.2.56.tar.gz (36.9 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.9/36.9 MB\u001b[0m \u001b[31m15.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: typing-extensions>=4.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (4.10.0)\n",
"Requirement already satisfied: numpy>=1.20.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (1.25.2)\n",
"Collecting diskcache>=5.6.1 (from llama-cpp-python)\n",
" Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.5/45.5 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: jinja2>=2.11.3 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (3.1.3)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2>=2.11.3->llama-cpp-python) (2.1.5)\n",
"Building wheels for collected packages: llama-cpp-python\n",
" Building wheel for llama-cpp-python (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.56-cp310-cp310-manylinux_2_35_x86_64.whl size=2841182 sha256=1e91e97a338bc2daa1cc72d23c4f0d41ed972b1e0e4796d73e3a7e28b178f077\n",
" Stored in directory: /root/.cache/pip/wheels/e5/09/9d/c413053f6258cb2546cc792418c595e276f9efd5db31a80377\n",
"Successfully built llama-cpp-python\n",
"Installing collected packages: diskcache, llama-cpp-python\n",
"Successfully installed diskcache-5.6.3 llama-cpp-python-0.2.56\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!CMAKE_ARGS=\"-DLLAMA_CUBLAS=on\" FORCE_CMAKE=1 pip install llama-cpp-python"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Tz6FJpOufWXL",
"outputId": "8ce6419b-1e6c-4a09-959b-be199a513111"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting llama-cpp-python\n",
" Downloading llama_cpp_python-0.2.56.tar.gz (36.9 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.9/36.9 MB\u001b[0m \u001b[31m17.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: typing-extensions>=4.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (4.10.0)\n",
"Requirement already satisfied: numpy>=1.20.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (1.25.2)\n",
"Collecting diskcache>=5.6.1 (from llama-cpp-python)\n",
" Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.5/45.5 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: jinja2>=2.11.3 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (3.1.3)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2>=2.11.3->llama-cpp-python) (2.1.5)\n",
"Building wheels for collected packages: llama-cpp-python\n",
" Building wheel for llama-cpp-python (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.56-cp310-cp310-manylinux_2_35_x86_64.whl size=26291647 sha256=4d08ee7d43ebc79a5c7c0d302e11645c6a07c79c5e1c2391514742ed1842af65\n",
" Stored in directory: /root/.cache/pip/wheels/e5/09/9d/c413053f6258cb2546cc792418c595e276f9efd5db31a80377\n",
"Successfully built llama-cpp-python\n",
"Installing collected packages: diskcache, llama-cpp-python\n",
"Successfully installed diskcache-5.6.3 llama-cpp-python-0.2.56\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from llama_cpp import Llama\n",
"import json\n",
"from llama_cpp.llama_speculative import LlamaPromptLookupDecoding"
],
"metadata": {
"id": "piOITZL8bshq"
},
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model_path = \"mistral-7b-instruct-v0.2.Q4_K_M.gguf\"\n"
],
"metadata": {
"id": "CmA0k1DFcJJ5"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!ls"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "HzAhT3sOdGS4",
"outputId": "a5636d18-a4fd-4b5e-ccb3-dcd0f3943cbc"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"mistral-7b-instruct-v0.2.Q4_K_M.gguf sample_data\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!sha256sum mistral-7b-instruct-v0.2.Q4_K_M.gguf"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "V-1QMKA3d4_u",
"outputId": "10f59e33-f863-415f-ad99-b49a71bf4e85"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9 mistral-7b-instruct-v0.2.Q4_K_M.gguf\n"
]
}
]
},
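{
"cell_type": "markdown",
"source": [
"The digest above matches the blob hash in the CDN redirect from the download log, so the file arrived intact. As an optional extra, the next cell is a small sketch of the same check done in Python; the expected value is simply copied from the `sha256sum` output above."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"import hashlib\n",
"\n",
"# Recompute the SHA-256 digest in Python and compare it against the value\n",
"# reported by sha256sum above.\n",
"expected = \"3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9\"\n",
"sha = hashlib.sha256()\n",
"with open(model_path, \"rb\") as f:\n",
"    # Read in 1 MiB chunks so the 4 GiB file never has to fit in memory.\n",
"    for chunk in iter(lambda: f.read(1 << 20), b\"\"):\n",
"        sha.update(chunk)\n",
"assert sha.hexdigest() == expected, \"checksum mismatch, re-download the model\""
],
"metadata": {},
"execution_count": null,
"outputs": []
},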
{
"cell_type": "code",
"source": [
"llm_model = Llama(\n",
" model_path=model_path,\n",
" n_gpu_layers=32,\n",
" n_ctx=2048,\n",
" chat_format=\"chatml\",\n",
" n_threads = 8,\n",
" draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines.\n",
" )"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YXJuz5P7ct_C",
"outputId": "d8a45718-cfb2-49ed-c42e-18109b9ac661"
},
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))\n",
"llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
"llama_model_loader: - kv 0: general.architecture str = llama\n",
"llama_model_loader: - kv 1: general.name str = mistralai_mistral-7b-instruct-v0.2\n",
"llama_model_loader: - kv 2: llama.context_length u32 = 32768\n",
"llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n",
"llama_model_loader: - kv 4: llama.block_count u32 = 32\n",
"llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336\n",
"llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n",
"llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n",
"llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8\n",
"llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n",
"llama_model_loader: - kv 10: llama.rope.freq_base f32 = 1000000.000000\n",
"llama_model_loader: - kv 11: general.file_type u32 = 15\n",
"llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n",
"llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32000] = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n",
"llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...\n",
"llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
"llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n",
"llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 2\n",
"llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0\n",
"llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 0\n",
"llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true\n",
"llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false\n",
"llama_model_loader: - kv 22: tokenizer.chat_template str = {{ bos_token }}{% for message in mess...\n",
"llama_model_loader: - kv 23: general.quantization_version u32 = 2\n",
"llama_model_loader: - type f32: 65 tensors\n",
"llama_model_loader: - type q4_K: 193 tensors\n",
"llama_model_loader: - type q6_K: 33 tensors\n",
"llm_load_vocab: special tokens definition check successful ( 259/32000 ).\n",
"llm_load_print_meta: format = GGUF V3 (latest)\n",
"llm_load_print_meta: arch = llama\n",
"llm_load_print_meta: vocab type = SPM\n",
"llm_load_print_meta: n_vocab = 32000\n",
"llm_load_print_meta: n_merges = 0\n",
"llm_load_print_meta: n_ctx_train = 32768\n",
"llm_load_print_meta: n_embd = 4096\n",
"llm_load_print_meta: n_head = 32\n",
"llm_load_print_meta: n_head_kv = 8\n",
"llm_load_print_meta: n_layer = 32\n",
"llm_load_print_meta: n_rot = 128\n",
"llm_load_print_meta: n_embd_head_k = 128\n",
"llm_load_print_meta: n_embd_head_v = 128\n",
"llm_load_print_meta: n_gqa = 4\n",
"llm_load_print_meta: n_embd_k_gqa = 1024\n",
"llm_load_print_meta: n_embd_v_gqa = 1024\n",
"llm_load_print_meta: f_norm_eps = 0.0e+00\n",
"llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
"llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
"llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
"llm_load_print_meta: n_ff = 14336\n",
"llm_load_print_meta: n_expert = 0\n",
"llm_load_print_meta: n_expert_used = 0\n",
"llm_load_print_meta: pooling type = 0\n",
"llm_load_print_meta: rope type = 0\n",
"llm_load_print_meta: rope scaling = linear\n",
"llm_load_print_meta: freq_base_train = 1000000.0\n",
"llm_load_print_meta: freq_scale_train = 1\n",
"llm_load_print_meta: n_yarn_orig_ctx = 32768\n",
"llm_load_print_meta: rope_finetuned = unknown\n",
"llm_load_print_meta: ssm_d_conv = 0\n",
"llm_load_print_meta: ssm_d_inner = 0\n",
"llm_load_print_meta: ssm_d_state = 0\n",
"llm_load_print_meta: ssm_dt_rank = 0\n",
"llm_load_print_meta: model type = 7B\n",
"llm_load_print_meta: model ftype = Q4_K - Medium\n",
"llm_load_print_meta: model params = 7.24 B\n",
"llm_load_print_meta: model size = 4.07 GiB (4.83 BPW) \n",
"llm_load_print_meta: general.name = mistralai_mistral-7b-instruct-v0.2\n",
"llm_load_print_meta: BOS token = 1 '<s>'\n",
"llm_load_print_meta: EOS token = 2 '</s>'\n",
"llm_load_print_meta: UNK token = 0 '<unk>'\n",
"llm_load_print_meta: PAD token = 0 '<unk>'\n",
"llm_load_print_meta: LF token = 13 '<0x0A>'\n",
"llm_load_tensors: ggml ctx size = 0.22 MiB\n",
"llm_load_tensors: offloading 32 repeating layers to GPU\n",
"llm_load_tensors: offloaded 32/33 layers to GPU\n",
"llm_load_tensors: CPU buffer size = 4165.37 MiB\n",
"llm_load_tensors: CUDA0 buffer size = 3992.50 MiB\n",
".................................................................................................\n",
"llama_new_context_with_model: n_ctx = 2048\n",
"llama_new_context_with_model: freq_base = 1000000.0\n",
"llama_new_context_with_model: freq_scale = 1\n",
"llama_kv_cache_init: CUDA0 KV buffer size = 256.00 MiB\n",
"llama_new_context_with_model: KV self size = 256.00 MiB, K (f16): 128.00 MiB, V (f16): 128.00 MiB\n",
"llama_new_context_with_model: CUDA_Host input buffer size = 13.02 MiB\n",
"llama_new_context_with_model: CUDA0 compute buffer size = 164.00 MiB\n",
"llama_new_context_with_model: CUDA_Host compute buffer size = 78.50 MiB\n",
"llama_new_context_with_model: graph splits (measure): 3\n",
"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | \n",
"Model metadata: {'tokenizer.chat_template': \"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}\", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '32768', 'general.name': 'mistralai_mistral-7b-instruct-v0.2', 'tokenizer.ggml.add_bos_token': 'true', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '14336', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'tokenizer.ggml.bos_token_id': '1', 'llama.attention.head_count': '32', 'llama.block_count': '32', 'llama.attention.head_count_kv': '8', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '15'}\n"
]
}
]
},
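{
"cell_type": "markdown",
"source": [
"`LlamaPromptLookupDecoding` is a draft model for speculative decoding that needs no second network: it looks for an earlier occurrence of the most recent n-gram in the context and proposes the tokens that followed it, which the main model then verifies in one batched forward pass. The next cell is a minimal pure-Python sketch of that lookup step, not the library's implementation; `lookup_draft` and the toy token list are illustrative only."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch of the n-gram lookup behind prompt lookup decoding: find the most\n",
"# recent earlier occurrence of the last `ngram_size` tokens and return the\n",
"# tokens that followed it as a cheap draft.\n",
"def lookup_draft(tokens, ngram_size=2, num_pred_tokens=10):\n",
"    if len(tokens) <= ngram_size:\n",
"        return []\n",
"    tail = tokens[-ngram_size:]\n",
"    # Scan backwards (excluding the tail itself) so the most recent match wins.\n",
"    for i in range(len(tokens) - ngram_size - 1, -1, -1):\n",
"        if tokens[i:i + ngram_size] == tail:\n",
"            start = i + ngram_size\n",
"            return tokens[start:start + num_pred_tokens]\n",
"    return []\n",
"\n",
"# The repeated bigram (5, 7) lets the draft guess the continuation (9, 11, 13).\n",
"print(lookup_draft([5, 7, 9, 11, 13, 5, 7], ngram_size=2, num_pred_tokens=3))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},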
{
"cell_type": "code",
"source": [
"llm_model.verbose = False"
],
"metadata": {
"id": "ffZ88Q3veVa3"
},
"execution_count": 19,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Time with LlamaPromptLookupDecoding"
],
"metadata": {
"id": "W-i9-md8om14"
}
},
{
"cell_type": "code",
"source": [
"import time\n",
"import statistics\n",
"\n",
"messages = [\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": \"You are a helpful assistant that outputs in JSON.\",\n",
" }\n",
"]\n",
"\n",
"persons = [\n",
" \"Mozart\",\n",
" \"Einstein\",\n",
" \"Beethoven\",\n",
" \"Newton\",\n",
" \"Shakespeare\",\n",
" \"Galileo\",\n",
" \"Da Vinci\",\n",
" \"Tesla\",\n",
" \"Curie\",\n",
" \"Bach\",\n",
" \"Picasso\",\n",
" \"Gutenberg\",\n",
" \"Darwin\",\n",
" \"Hemingway\",\n",
" \"Franklin\",\n",
" \"Hawking\",\n",
" \"Turing\",\n",
" \"Chopin\",\n",
" \"Gandhi\",\n",
" \"Lennon\"\n",
"]\n",
"\n",
"response_format = {\n",
" \"type\": \"json_object\",\n",
" \"schema\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"born_in_Germany\": {\"type\": \"boolean\"},\n",
" },\n",
" \"required\": [\"born_in_Germany\"],\n",
" },\n",
"}\n",
"\n",
"times = []\n",
"outputs = []\n",
"for person in persons:\n",
" start_time = time.time()\n",
" messages.append({\"role\": \"user\", \"content\": f\"Was the following person born in Germany? person: {person}\"})\n",
" output = llm_model.create_chat_completion(\n",
" messages=messages,\n",
" response_format=response_format,\n",
" temperature=0.7,\n",
" )\n",
" end_time = time.time()\n",
" elapsed_time = end_time - start_time\n",
" outputs.append(output)\n",
" times.append(elapsed_time)\n",
" messages.pop() # Remove the user message for the next iteration\n",
"\n",
"# Print statistics\n",
"print(\"Time Statistics:\")\n",
"print(\"Minimum time:\", min(times))\n",
"print(\"Maximum time:\", max(times))\n",
"print(\"Average time:\", statistics.mean(times))\n",
"print(\"Median time:\", statistics.median(times))\n",
"print(\"Standard Deviation:\", statistics.stdev(times))\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "nLoHt3r4cueq",
"outputId": "71ed94bb-4063-47d5-f10c-619612ad5f7d"
},
"execution_count": 20,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Time Statistics:\n",
"Minimum time: 2.17183518409729\n",
"Maximum time: 3.079291343688965\n",
"Average time: 2.4745811223983765\n",
"Median time: 2.330846905708313\n",
"Standard Deviation: 0.2737162183714513\n"
]
}
]
},
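{
"cell_type": "markdown",
"source": [
"Since `response_format` constrains generation to the schema, each completion's message content should parse cleanly as JSON. The cell below is a small sketch of consuming the OpenAI-style dicts collected in `outputs` (their exact structure is shown by `print(outputs)` further down)."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Parse the schema-constrained JSON answer out of each completion.\n",
"for person, output in zip(persons, outputs):\n",
"    content = output[\"choices\"][0][\"message\"][\"content\"]\n",
"    answer = json.loads(content)\n",
"    print(person, \"->\", answer[\"born_in_Germany\"])"
],
"metadata": {},
"execution_count": null,
"outputs": []
},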
{
"cell_type": "markdown",
"source": [
"## Time without LlamaPromptLookupDecoding"
],
"metadata": {
"id": "MBVbhQlBowyV"
}
},
{
"cell_type": "code",
"source": [
"llm_model.draft_model = None"
],
"metadata": {
"id": "IsblXWnTeZsY"
},
"execution_count": 21,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import time\n",
"import statistics\n",
"\n",
"messages = [\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": \"You are a helpful assistant that outputs in JSON.\",\n",
" }\n",
"]\n",
"\n",
"persons = [\n",
" \"Mozart\",\n",
" \"Einstein\",\n",
" \"Beethoven\",\n",
" \"Newton\",\n",
" \"Shakespeare\",\n",
" \"Galileo\",\n",
" \"Da Vinci\",\n",
" \"Tesla\",\n",
" \"Curie\",\n",
" \"Bach\",\n",
" \"Picasso\",\n",
" \"Gutenberg\",\n",
" \"Darwin\",\n",
" \"Hemingway\",\n",
" \"Franklin\",\n",
" \"Hawking\",\n",
" \"Turing\",\n",
" \"Chopin\",\n",
" \"Gandhi\",\n",
" \"Lennon\"\n",
"]\n",
"\n",
"response_format = {\n",
" \"type\": \"json_object\",\n",
" \"schema\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"born_in_Germany\": {\"type\": \"boolean\"},\n",
" },\n",
" \"required\": [\"born_in_Germany\"],\n",
" },\n",
"}\n",
"\n",
"times = []\n",
"outputs = []\n",
"for person in persons:\n",
" start_time = time.time()\n",
" messages.append({\"role\": \"user\", \"content\": f\"Was the following person born in Germany? person: {person}\"})\n",
" output = llm_model.create_chat_completion(\n",
" messages=messages,\n",
" response_format=response_format,\n",
" temperature=0.7,\n",
" )\n",
" end_time = time.time()\n",
" elapsed_time = end_time - start_time\n",
" outputs.append(output)\n",
" times.append(elapsed_time)\n",
" messages.pop() # Remove the user message for the next iteration\n",
"\n",
"# Print statistics\n",
"print(\"Time Statistics:\")\n",
"print(\"Minimum time:\", min(times))\n",
"print(\"Maximum time:\", max(times))\n",
"print(\"Average time:\", statistics.mean(times))\n",
"print(\"Median time:\", statistics.median(times))\n",
"print(\"Standard Deviation:\", statistics.stdev(times))\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wKvR5-aOl17U",
"outputId": "b149937b-2b43-4f57-fd4d-3d51fef9aa69"
},
"execution_count": 22,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Time Statistics:\n",
"Minimum time: 1.9035820960998535\n",
"Maximum time: 2.636012077331543\n",
"Average time: 2.138879108428955\n",
"Median time: 2.0756272077560425\n",
"Standard Deviation: 0.19751100449869458\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"print(outputs)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZrjGJpQ3mYZP",
"outputId": "59071388-9276-470e-fce3-05dd51711700"
},
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[{'id': 'chatcmpl-129dcc16-fc5b-4a34-9f6b-8e4e4330f3e4', 'object': 'chat.completion', 'created': 1710514210, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-31159573-d1c5-43cd-b2d1-bf96a1ed97cf', 'object': 'chat.completion', 'created': 1710514212, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": true } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 13, 'total_tokens': 78}}, {'id': 'chatcmpl-dc8117a1-a6d6-4881-9ea5-6a605abbbed1', 'object': 'chat.completion', 'created': 1710514215, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": true } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}, {'id': 'chatcmpl-c41a65c4-5830-4855-bbe2-b16b5fd975a2', 'object': 'chat.completion', 'created': 1710514217, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 12, 'total_tokens': 77}}, {'id': 'chatcmpl-865d3251-36a0-4297-b0ae-1d93845fefb3', 'object': 'chat.completion', 'created': 1710514219, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 12, 'total_tokens': 77}}, {'id': 'chatcmpl-3ae5ecde-d0aa-4396-8df6-f2c3a544e62b', 'object': 'chat.completion', 'created': 1710514221, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 12, 'total_tokens': 79}}, {'id': 'chatcmpl-d1d4f661-31fa-43ef-8f3a-de822f58c6c9', 'object': 'chat.completion', 'created': 1710514223, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}, {'id': 'chatcmpl-0072defb-2b67-495c-9c5c-8c87ba00f219', 'object': 'chat.completion', 'created': 1710514225, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-2f9ef1cf-f27f-4c7f-b9fe-a66032889961', 'object': 'chat.completion', 'created': 1710514228, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-6f22c04e-8cbd-429a-80bc-1d8239e631e6', 'object': 'chat.completion', 'created': 1710514230, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": true } 
'}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 13, 'total_tokens': 78}}, {'id': 'chatcmpl-897602ff-aa5b-4e6d-aded-a33cb3fad259', 'object': 'chat.completion', 'created': 1710514232, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-823ef398-bea2-4c90-af3c-0aa0b1c80969', 'object': 'chat.completion', 'created': 1710514234, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": true } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}, {'id': 'chatcmpl-06071170-4683-4638-bb02-7942482ab308', 'object': 'chat.completion', 'created': 1710514236, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 13, 'total_tokens': 78}}, {'id': 'chatcmpl-4949dc54-45ae-47b6-b4d3-764d2677e3bd', 'object': 'chat.completion', 'created': 1710514238, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 12, 'total_tokens': 79}}, {'id': 'chatcmpl-5370272a-b44f-4f33-b59d-a475131945c9', 'object': 'chat.completion', 'created': 1710514241, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 65, 'completion_tokens': 13, 'total_tokens': 78}}, {'id': 'chatcmpl-f3c18cd5-7df6-403d-afbf-d88aabb6ab25', 'object': 'chat.completion', 'created': 1710514243, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 12, 'total_tokens': 78}}, {'id': 'chatcmpl-d3a18fb5-e1d5-42b9-88f9-1942e819c552', 'object': 'chat.completion', 'created': 1710514245, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 13, 'total_tokens': 79}}, {'id': 'chatcmpl-dd064549-c240-4c42-a80d-fb99e75133d7', 'object': 'chat.completion', 'created': 1710514247, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\":false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}, {'id': 'chatcmpl-10c5933b-5796-4317-9207-560b706da4bf', 'object': 'chat.completion', 'created': 1710514249, 'model': 'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\"born_in_Germany\": false} '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 66, 'completion_tokens': 12, 'total_tokens': 78}}, {'id': 'chatcmpl-52cbc08d-1c5b-4e2d-8873-55d182a2452e', 'object': 'chat.completion', 'created': 1710514251, 'model': 
'mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{ \"born_in_Germany\": false } '}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 67, 'completion_tokens': 13, 'total_tokens': 80}}]\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "Gxx7MbrsmZ-W"
},
"execution_count": null,
"outputs": []
}
]
}