Skip to content

Instantly share code, notes, and snippets.

@alex4o
Created January 29, 2024 22:42
Show Gist options
  • Save alex4o/efac83a009eb42d32d8ec10e68811ab2 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "dfac4e16-0c49-496b-b7cc-b92e40bb1379",
"metadata": {},
"outputs": [],
"source": [
"import llama_cpp"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "e6fae4eb-4db5-4ae1-bd7f-e86fc80e4714",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"llama_model_loader: loaded meta data with 21 key-value pairs and 325 tensors from ../llama.cpp/models/phi-2-orange.Q4_K_M.gguf (version GGUF V3 (latest))\n",
"llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
"llama_model_loader: - kv 0: general.architecture str = phi2\n",
"llama_model_loader: - kv 1: general.name str = Phi2\n",
"llama_model_loader: - kv 2: phi2.context_length u32 = 2048\n",
"llama_model_loader: - kv 3: phi2.embedding_length u32 = 2560\n",
"llama_model_loader: - kv 4: phi2.feed_forward_length u32 = 10240\n",
"llama_model_loader: - kv 5: phi2.block_count u32 = 32\n",
"llama_model_loader: - kv 6: phi2.attention.head_count u32 = 32\n",
"llama_model_loader: - kv 7: phi2.attention.head_count_kv u32 = 32\n",
"llama_model_loader: - kv 8: phi2.attention.layer_norm_epsilon f32 = 0.000010\n",
"llama_model_loader: - kv 9: phi2.rope.dimension_count u32 = 32\n",
"llama_model_loader: - kv 10: general.file_type u32 = 15\n",
"llama_model_loader: - kv 11: tokenizer.ggml.add_bos_token bool = false\n",
"llama_model_loader: - kv 12: tokenizer.ggml.model str = gpt2\n",
"llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,51200] = [\"!\", \"\\\"\", \"#\", \"$\", \"%\", \"&\", \"'\", ...\n",
"llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,51200] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\n",
"llama_model_loader: - kv 15: tokenizer.ggml.merges arr[str,50000] = [\"Ġ t\", \"Ġ a\", \"h e\", \"i n\", \"r e\",...\n",
"llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 50256\n",
"llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 50295\n",
"llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 50256\n",
"llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 50256\n",
"llama_model_loader: - kv 20: general.quantization_version u32 = 2\n",
"llama_model_loader: - type f32: 195 tensors\n",
"llama_model_loader: - type q4_K: 81 tensors\n",
"llama_model_loader: - type q5_K: 32 tensors\n",
"llama_model_loader: - type q6_K: 17 tensors\n",
"llm_load_vocab: mismatch in special tokens definition ( 910/51200 vs 944/51200 ).\n",
"llm_load_print_meta: format = GGUF V3 (latest)\n",
"llm_load_print_meta: arch = phi2\n",
"llm_load_print_meta: vocab type = BPE\n",
"llm_load_print_meta: n_vocab = 51200\n",
"llm_load_print_meta: n_merges = 50000\n",
"llm_load_print_meta: n_ctx_train = 2048\n",
"llm_load_print_meta: n_embd = 2560\n",
"llm_load_print_meta: n_head = 32\n",
"llm_load_print_meta: n_head_kv = 32\n",
"llm_load_print_meta: n_layer = 32\n",
"llm_load_print_meta: n_rot = 32\n",
"llm_load_print_meta: n_embd_head_k = 80\n",
"llm_load_print_meta: n_embd_head_v = 80\n",
"llm_load_print_meta: n_gqa = 1\n",
"llm_load_print_meta: n_embd_k_gqa = 2560\n",
"llm_load_print_meta: n_embd_v_gqa = 2560\n",
"llm_load_print_meta: f_norm_eps = 1.0e-05\n",
"llm_load_print_meta: f_norm_rms_eps = 0.0e+00\n",
"llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
"llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
"llm_load_print_meta: n_ff = 10240\n",
"llm_load_print_meta: n_expert = 0\n",
"llm_load_print_meta: n_expert_used = 0\n",
"llm_load_print_meta: rope scaling = linear\n",
"llm_load_print_meta: freq_base_train = 10000.0\n",
"llm_load_print_meta: freq_scale_train = 1\n",
"llm_load_print_meta: n_yarn_orig_ctx = 2048\n",
"llm_load_print_meta: rope_finetuned = unknown\n",
"llm_load_print_meta: model type = 3B\n",
"llm_load_print_meta: model ftype = Q4_K - Medium\n",
"llm_load_print_meta: model params = 2.78 B\n",
"llm_load_print_meta: model size = 1.66 GiB (5.14 BPW) \n",
"llm_load_print_meta: general.name = Phi2\n",
"llm_load_print_meta: BOS token = 50256 '<|endoftext|>'\n",
"llm_load_print_meta: EOS token = 50295 '<|im_end|>'\n",
"llm_load_print_meta: UNK token = 50256 '<|endoftext|>'\n",
"llm_load_print_meta: PAD token = 50256 '<|endoftext|>'\n",
"llm_load_print_meta: LF token = 128 'Ä'\n",
"llm_load_tensors: ggml ctx size = 0.12 MiB\n",
"llm_load_tensors: offloading 0 repeating layers to GPU\n",
"llm_load_tensors: offloaded 0/33 layers to GPU\n",
"llm_load_tensors: CPU buffer size = 1704.63 MiB\n",
"..................................................................................\n",
"llama_new_context_with_model: n_ctx = 512\n",
"llama_new_context_with_model: freq_base = 10000.0\n",
"llama_new_context_with_model: freq_scale = 1\n",
"llama_kv_cache_init: Vulkan_Host KV buffer size = 160.00 MiB\n",
"llama_new_context_with_model: KV self size = 160.00 MiB, K (f16): 80.00 MiB, V (f16): 80.00 MiB\n",
"llama_new_context_with_model: Vulkan_Host input buffer size = 6.01 MiB\n",
"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | \n",
"llama_new_context_with_model: Vulkan_Host compute buffer size = 115.50 MiB\n",
"llama_new_context_with_model: graph splits (measure): 1\n",
"Model metadata: {'tokenizer.ggml.padding_token_id': '50256', 'tokenizer.ggml.unknown_token_id': '50256', 'tokenizer.ggml.eos_token_id': '50295', 'tokenizer.ggml.bos_token_id': '50256', 'general.architecture': 'phi2', 'general.name': 'Phi2', 'phi2.context_length': '2048', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'gpt2', 'tokenizer.ggml.add_bos_token': 'false', 'phi2.embedding_length': '2560', 'phi2.attention.head_count': '32', 'phi2.attention.head_count_kv': '32', 'phi2.feed_forward_length': '10240', 'phi2.attention.layer_norm_epsilon': '0.000010', 'phi2.block_count': '32', 'phi2.rope.dimension_count': '32', 'general.file_type': '15'}\n"
]
}
],
"source": [
"model = llama_cpp.Llama(\n",
" model_path=\"../llama.cpp/models/phi-2-orange.Q4_K_M.gguf\",\n",
" chat_format=\"chatml\",\n",
" seed=123123123\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "86084e47-d363-497d-a148-577f4a905cee",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'id': 'chatcmpl-3c5a7e39-f4d0-43e7-b0d3-6fe02801ac4f', 'object': 'chat.completion', 'created': 1706566757, 'model': '../llama.cpp/models/phi-2-orange.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': \"The meaning of life, a philosophical and existential question that has puzzled humanity for centuries, can be approached from various perspectives. Here are some common interpretations:\\n\\n1. Religious Perspective: Many religious traditions offer their own answers to this profound question. For instance, in Christianity, the Bible suggests that humans have been created in God's image with the purpose of loving and serving one another. The ultimate goal is to find eternal happiness through faith in Jesus Christ. In Islam, life is seen as a test from Allah (God), and the meaning lies in living a righteous and ethical life while striving for spiritual growth.\\n2. Humanistic Perspective: This perspective emphasizes human potential and personal fulfillment. It suggests that the meaning of life comes from achieving self-actualization, which involves discovering one's unique talents, passions, and values. The ultimate goal is to live authentically, contribute positively to society, and find happiness through personal growth and meaningful relationships.\\n3. Existential Perspective: This perspective focuses on individual freedom, choice, and responsibility in creating meaning from the inherent absurdity of life. It suggests that humans must confront their mortality and create their own purpose by embracing uncertainty, taking risks, and finding value in the present moment. The ultimate goal is to live authentically, pursue personal growth, and find fulfillment through self-discovery and meaningful connections with others.\\n4. Scientific Perspective: Some scientists argue that the meaning of life can be found in understanding our place in the universe, exploring the mysteries of nature, and contributing to human knowledge and progress. The ultimate goal is to live a purposeful life by pursuing scientific inquiry, innovation, and discovery while respecting the natural world and promoting sustainability.\\n5. Philosophical Perspective: Various philosophical traditions offer their own interpretations of the meaning of life. For example, existentialism emphasizes individual freedom and choice in creating personal meaning, while nihilism suggests that life has no inherent purpose or value. Other philosophies, such as Stoicism and Buddhism, emphasize living a balanced and virtuous life to find happiness and fulfillment.\\n\\nUltimately, the meaning of life is subjective and can vary greatly from person to person based on their beliefs, values, experiences, and goals. It's an ongoing journey that requires self-reflection, exploration, and personal growth to discover what brings us true happiness and purpose in this complex world.\"}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 22, 'completion_tokens': 473, 'total_tokens': 495}}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"llama_print_timings: load time = 260.27 ms\n",
"llama_print_timings: sample time = 84.26 ms / 474 runs ( 0.18 ms per token, 5625.18 tokens per second)\n",
"llama_print_timings: prompt eval time = 260.24 ms / 22 tokens ( 11.83 ms per token, 84.54 tokens per second)\n",
"llama_print_timings: eval time = 22961.10 ms / 473 runs ( 48.54 ms per token, 20.60 tokens per second)\n",
"llama_print_timings: total time = 24566.66 ms / 495 tokens\n"
]
}
],
"source": [
"print(model.create_chat_completion(\n",
" messages=[{\n",
" \"role\": \"user\",\n",
" \"content\": \"what is the meaning of life?\"\n",
" }]\n",
"))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "0478bded-84c4-4ee8-923e-8c6e2c142471",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"llama_model_loader: loaded meta data with 21 key-value pairs and 325 tensors from ../llama.cpp/models/phi-2-orange.Q4_K_M.gguf (version GGUF V3 (latest))\n",
"llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
"llama_model_loader: - kv 0: general.architecture str = phi2\n",
"llama_model_loader: - kv 1: general.name str = Phi2\n",
"llama_model_loader: - kv 2: phi2.context_length u32 = 2048\n",
"llama_model_loader: - kv 3: phi2.embedding_length u32 = 2560\n",
"llama_model_loader: - kv 4: phi2.feed_forward_length u32 = 10240\n",
"llama_model_loader: - kv 5: phi2.block_count u32 = 32\n",
"llama_model_loader: - kv 6: phi2.attention.head_count u32 = 32\n",
"llama_model_loader: - kv 7: phi2.attention.head_count_kv u32 = 32\n",
"llama_model_loader: - kv 8: phi2.attention.layer_norm_epsilon f32 = 0.000010\n",
"llama_model_loader: - kv 9: phi2.rope.dimension_count u32 = 32\n",
"llama_model_loader: - kv 10: general.file_type u32 = 15\n",
"llama_model_loader: - kv 11: tokenizer.ggml.add_bos_token bool = false\n",
"llama_model_loader: - kv 12: tokenizer.ggml.model str = gpt2\n",
"llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,51200] = [\"!\", \"\\\"\", \"#\", \"$\", \"%\", \"&\", \"'\", ...\n",
"llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,51200] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\n",
"llama_model_loader: - kv 15: tokenizer.ggml.merges arr[str,50000] = [\"Ġ t\", \"Ġ a\", \"h e\", \"i n\", \"r e\",...\n",
"llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 50256\n",
"llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 50295\n",
"llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 50256\n",
"llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 50256\n",
"llama_model_loader: - kv 20: general.quantization_version u32 = 2\n",
"llama_model_loader: - type f32: 195 tensors\n",
"llama_model_loader: - type q4_K: 81 tensors\n",
"llama_model_loader: - type q5_K: 32 tensors\n",
"llama_model_loader: - type q6_K: 17 tensors\n",
"llm_load_vocab: mismatch in special tokens definition ( 910/51200 vs 944/51200 ).\n",
"llm_load_print_meta: format = GGUF V3 (latest)\n",
"llm_load_print_meta: arch = phi2\n",
"llm_load_print_meta: vocab type = BPE\n",
"llm_load_print_meta: n_vocab = 51200\n",
"llm_load_print_meta: n_merges = 50000\n",
"llm_load_print_meta: n_ctx_train = 2048\n",
"llm_load_print_meta: n_embd = 2560\n",
"llm_load_print_meta: n_head = 32\n",
"llm_load_print_meta: n_head_kv = 32\n",
"llm_load_print_meta: n_layer = 32\n",
"llm_load_print_meta: n_rot = 32\n",
"llm_load_print_meta: n_embd_head_k = 80\n",
"llm_load_print_meta: n_embd_head_v = 80\n",
"llm_load_print_meta: n_gqa = 1\n",
"llm_load_print_meta: n_embd_k_gqa = 2560\n",
"llm_load_print_meta: n_embd_v_gqa = 2560\n",
"llm_load_print_meta: f_norm_eps = 1.0e-05\n",
"llm_load_print_meta: f_norm_rms_eps = 0.0e+00\n",
"llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
"llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
"llm_load_print_meta: n_ff = 10240\n",
"llm_load_print_meta: n_expert = 0\n",
"llm_load_print_meta: n_expert_used = 0\n",
"llm_load_print_meta: rope scaling = linear\n",
"llm_load_print_meta: freq_base_train = 10000.0\n",
"llm_load_print_meta: freq_scale_train = 1\n",
"llm_load_print_meta: n_yarn_orig_ctx = 2048\n",
"llm_load_print_meta: rope_finetuned = unknown\n",
"llm_load_print_meta: model type = 3B\n",
"llm_load_print_meta: model ftype = Q4_K - Medium\n",
"llm_load_print_meta: model params = 2.78 B\n",
"llm_load_print_meta: model size = 1.66 GiB (5.14 BPW) \n",
"llm_load_print_meta: general.name = Phi2\n",
"llm_load_print_meta: BOS token = 50256 '<|endoftext|>'\n",
"llm_load_print_meta: EOS token = 50295 '<|im_end|>'\n",
"llm_load_print_meta: UNK token = 50256 '<|endoftext|>'\n",
"llm_load_print_meta: PAD token = 50256 '<|endoftext|>'\n",
"llm_load_print_meta: LF token = 128 'Ä'\n",
"llm_load_tensors: ggml ctx size = 0.25 MiB\n",
"llm_load_tensors: offloading 32 repeating layers to GPU\n",
"llm_load_tensors: offloading non-repeating layers to GPU\n",
"llm_load_tensors: offloaded 33/33 layers to GPU\n",
"llm_load_tensors: CPU buffer size = 70.31 MiB\n",
"llm_load_tensors: Vulkan buffer size = 1634.32 MiB\n",
"..................................................................................\n",
"llama_new_context_with_model: n_ctx = 256\n",
"llama_new_context_with_model: freq_base = 10000.0\n",
"llama_new_context_with_model: freq_scale = 1\n",
"llama_kv_cache_init: Vulkan KV buffer size = 80.00 MiB\n",
"llama_new_context_with_model: KV self size = 80.00 MiB, K (f16): 40.00 MiB, V (f16): 40.00 MiB\n",
"llama_new_context_with_model: Vulkan_Host input buffer size = 2.76 MiB\n",
"llama_new_context_with_model: Vulkan compute buffer size = 60.50 MiB\n",
"llama_new_context_with_model: Vulkan_Host compute buffer size = 2.75 MiB\n",
"llama_new_context_with_model: graph splits (measure): 3\n",
"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | \n",
"Model metadata: {'tokenizer.ggml.padding_token_id': '50256', 'tokenizer.ggml.unknown_token_id': '50256', 'tokenizer.ggml.eos_token_id': '50295', 'tokenizer.ggml.bos_token_id': '50256', 'general.architecture': 'phi2', 'general.name': 'Phi2', 'phi2.context_length': '2048', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'gpt2', 'tokenizer.ggml.add_bos_token': 'false', 'phi2.embedding_length': '2560', 'phi2.attention.head_count': '32', 'phi2.attention.head_count_kv': '32', 'phi2.feed_forward_length': '10240', 'phi2.attention.layer_norm_epsilon': '0.000010', 'phi2.block_count': '32', 'phi2.rope.dimension_count': '32', 'general.file_type': '15'}\n"
]
}
],
"source": [
"model = llama_cpp.Llama(\n",
" model_path=\"../llama.cpp/models/phi-2-orange.Q4_K_M.gguf\",\n",
" chat_format=\"chatml\",\n",
" seed=123123123,\n",
" n_gpu_layers=33,\n",
" n_ctx=256\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "e109a874-54ab-45e6-ac64-12a4885c8af3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'id': 'chatcmpl-0150121d-9b5e-45bc-845e-7d6513a70d2a', 'object': 'chat.completion', 'created': 1706567265, 'model': '../llama.cpp/models/phi-2-orange.Q4_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': \"\\nThe following text has been generated by a language model AI, and it may contain errors or inconsistencies. Please review and edit as needed:\\n\\nTitle: The Impact of COVID-19 on Global Health Care Systems and Policy Responses\\n\\nAbstract: This paper examines the impact of the COVID-19 pandemic on global health care systems and policy responses, focusing on four key aspects: preparedness, response, recovery, and resilience. It analyzes the strengths and weaknesses of different countries' health care systems in dealing with the crisis, as well as the effectiveness and challenges of various policy measures taken to contain and mitigate the spread of the virus, such as lockdowns, testing, tracing, isolation, treatment, vaccination, and international cooperation. The paper also discusses the implications of the pandemic for future health care system reforms and policy directions, highlighting the need for more investment in public health, universal health coverage, digital transformation, and global solidarity.\\n\\nKeywords: COVID-19, health......... and see and make and N See, check and a and you, and make, and this, and ' and N and D and t\"}, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 22, 'completion_tokens': 234, 'total_tokens': 256}}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"llama_print_timings: load time = 486.34 ms\n",
"llama_print_timings: sample time = 60.65 ms / 234 runs ( 0.26 ms per token, 3858.01 tokens per second)\n",
"llama_print_timings: prompt eval time = 486.30 ms / 22 tokens ( 22.10 ms per token, 45.24 tokens per second)\n",
"llama_print_timings: eval time = 8258.92 ms / 233 runs ( 35.45 ms per token, 28.21 tokens per second)\n",
"llama_print_timings: total time = 9616.43 ms / 255 tokens\n"
]
}
],
"source": [
"print(model.create_chat_completion(\n",
" messages=[{\n",
" \"role\": \"user\",\n",
" \"content\": \"what is the meaning of life?\"\n",
" }]\n",
"))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "16ae7814-6576-479a-87f7-e13e1e06c2cb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"llama_model_loader: loaded meta data with 20 key-value pairs and 325 tensors from phi-2.Q6_K.gguf (version GGUF V3 (latest))\n",
"llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
"llama_model_loader: - kv 0: general.architecture str = phi2\n",
"llama_model_loader: - kv 1: general.name str = Phi2\n",
"llama_model_loader: - kv 2: phi2.context_length u32 = 2048\n",
"llama_model_loader: - kv 3: phi2.embedding_length u32 = 2560\n",
"llama_model_loader: - kv 4: phi2.feed_forward_length u32 = 10240\n",
"llama_model_loader: - kv 5: phi2.block_count u32 = 32\n",
"llama_model_loader: - kv 6: phi2.attention.head_count u32 = 32\n",
"llama_model_loader: - kv 7: phi2.attention.head_count_kv u32 = 32\n",
"llama_model_loader: - kv 8: phi2.attention.layer_norm_epsilon f32 = 0.000010\n",
"llama_model_loader: - kv 9: phi2.rope.dimension_count u32 = 32\n",
"llama_model_loader: - kv 10: general.file_type u32 = 18\n",
"llama_model_loader: - kv 11: tokenizer.ggml.add_bos_token bool = false\n",
"llama_model_loader: - kv 12: tokenizer.ggml.model str = gpt2\n",
"llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,51200] = [\"!\", \"\\\"\", \"#\", \"$\", \"%\", \"&\", \"'\", ...\n",
"llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,51200] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\n",
"llama_model_loader: - kv 15: tokenizer.ggml.merges arr[str,50000] = [\"Ġ t\", \"Ġ a\", \"h e\", \"i n\", \"r e\",...\n",
"llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 50256\n",
"llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 50256\n",
"llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 50256\n",
"llama_model_loader: - kv 19: general.quantization_version u32 = 2\n",
"llama_model_loader: - type f32: 195 tensors\n",
"llama_model_loader: - type q6_K: 130 tensors\n",
"llm_load_vocab: mismatch in special tokens definition ( 910/51200 vs 944/51200 ).\n",
"llm_load_print_meta: format = GGUF V3 (latest)\n",
"llm_load_print_meta: arch = phi2\n",
"llm_load_print_meta: vocab type = BPE\n",
"llm_load_print_meta: n_vocab = 51200\n",
"llm_load_print_meta: n_merges = 50000\n",
"llm_load_print_meta: n_ctx_train = 2048\n",
"llm_load_print_meta: n_embd = 2560\n",
"llm_load_print_meta: n_head = 32\n",
"llm_load_print_meta: n_head_kv = 32\n",
"llm_load_print_meta: n_layer = 32\n",
"llm_load_print_meta: n_rot = 32\n",
"llm_load_print_meta: n_embd_head_k = 80\n",
"llm_load_print_meta: n_embd_head_v = 80\n",
"llm_load_print_meta: n_gqa = 1\n",
"llm_load_print_meta: n_embd_k_gqa = 2560\n",
"llm_load_print_meta: n_embd_v_gqa = 2560\n",
"llm_load_print_meta: f_norm_eps = 1.0e-05\n",
"llm_load_print_meta: f_norm_rms_eps = 0.0e+00\n",
"llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
"llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
"llm_load_print_meta: n_ff = 10240\n",
"llm_load_print_meta: n_expert = 0\n",
"llm_load_print_meta: n_expert_used = 0\n",
"llm_load_print_meta: rope scaling = linear\n",
"llm_load_print_meta: freq_base_train = 10000.0\n",
"llm_load_print_meta: freq_scale_train = 1\n",
"llm_load_print_meta: n_yarn_orig_ctx = 2048\n",
"llm_load_print_meta: rope_finetuned = unknown\n",
"llm_load_print_meta: model type = 3B\n",
"llm_load_print_meta: model ftype = Q6_K\n",
"llm_load_print_meta: model params = 2.78 B\n",
"llm_load_print_meta: model size = 2.13 GiB (6.57 BPW) \n",
"llm_load_print_meta: general.name = Phi2\n",
"llm_load_print_meta: BOS token = 50256 '<|endoftext|>'\n",
"llm_load_print_meta: EOS token = 50256 '<|endoftext|>'\n",
"llm_load_print_meta: UNK token = 50256 '<|endoftext|>'\n",
"llm_load_print_meta: LF token = 128 'Ä'\n",
"llm_load_tensors: ggml ctx size = 0.25 MiB\n",
"llm_load_tensors: offloading 32 repeating layers to GPU\n",
"llm_load_tensors: offloading non-repeating layers to GPU\n",
"llm_load_tensors: offloaded 33/33 layers to GPU\n",
"llm_load_tensors: CPU buffer size = 102.54 MiB\n",
"llm_load_tensors: Vulkan buffer size = 2074.94 MiB\n",
".............................................................................................\n",
"llama_new_context_with_model: n_ctx = 512\n",
"llama_new_context_with_model: freq_base = 10000.0\n",
"llama_new_context_with_model: freq_scale = 1\n",
"llama_kv_cache_init: Vulkan KV buffer size = 160.00 MiB\n",
"llama_new_context_with_model: KV self size = 160.00 MiB, K (f16): 80.00 MiB, V (f16): 80.00 MiB\n",
"llama_new_context_with_model: Vulkan_Host input buffer size = 6.01 MiB\n",
"llama_new_context_with_model: Vulkan compute buffer size = 121.00 MiB\n",
"llama_new_context_with_model: Vulkan_Host compute buffer size = 5.50 MiB\n",
"llama_new_context_with_model: graph splits (measure): 3\n",
"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | \n",
"Model metadata: {'tokenizer.ggml.unknown_token_id': '50256', 'tokenizer.ggml.eos_token_id': '50256', 'tokenizer.ggml.bos_token_id': '50256', 'general.architecture': 'phi2', 'general.name': 'Phi2', 'phi2.context_length': '2048', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'gpt2', 'tokenizer.ggml.add_bos_token': 'false', 'phi2.embedding_length': '2560', 'phi2.attention.head_count': '32', 'phi2.attention.head_count_kv': '32', 'phi2.feed_forward_length': '10240', 'phi2.attention.layer_norm_epsilon': '0.000010', 'phi2.block_count': '32', 'phi2.rope.dimension_count': '32', 'general.file_type': '18'}\n",
"\n",
"llama_print_timings: load time = 435.47 ms\n",
"llama_print_timings: sample time = 32.29 ms / 128 runs ( 0.25 ms per token, 3963.83 tokens per second)\n",
"llama_print_timings: prompt eval time = 435.44 ms / 15 tokens ( 29.03 ms per token, 34.45 tokens per second)\n",
"llama_print_timings: eval time = 4696.98 ms / 127 runs ( 36.98 ms per token, 27.04 tokens per second)\n",
"llama_print_timings: total time = 5593.15 ms / 142 tokens\n"
]
},
{
"data": {
"text/plain": [
"{'id': 'cmpl-eb1ecb21-62b7-46c0-997f-21f050cb23c5',\n",
" 'object': 'text_completion',\n",
" 'created': 1706567486,\n",
" 'model': 'phi-2.Q6_K.gguf',\n",
" 'choices': [{'text': \"Theano(TensorType, 'T', (None,)), Theano(TensorType, 't', (None, None)))\\n\\n def test_theano_type_matching_with_constant(self):\\n # check that constant is handled correctly in case of a tensor-like object\\n x = T.dscalar()\\n y = x * 2 # y is const -> not a tensor\\n z = T.as_tensor_variable(1.0) + y\\n\\n f = theano.function([x], z)\\n \",\n",
" 'index': 0,\n",
" 'logprobs': None,\n",
" 'finish_reason': 'length'}],\n",
" 'usage': {'prompt_tokens': 15, 'completion_tokens': 128, 'total_tokens': 143}}"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = llama_cpp.Llama(\n",
" model_path=\"phi-2.Q6_K.gguf\",\n",
" seed=123123123,\n",
" n_gpu_layers=33\n",
")\n",
"model.create_completion(\"\"\"\n",
"Instruct: What is the purpose of life?\n",
"Output:\n",
"\"\"\", max_tokens=128)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "855fa810-2fa8-4378-be91-8d7265e0aacd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from dolphin-2.6-mistral-7b-dpo-laser.Q3_K_M.gguf (version GGUF V3 (latest))\n",
"llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
"llama_model_loader: - kv 0: general.architecture str = llama\n",
"llama_model_loader: - kv 1: general.name str = cognitivecomputations_dolphin-2.6-mis...\n",
"llama_model_loader: - kv 2: llama.context_length u32 = 32768\n",
"llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n",
"llama_model_loader: - kv 4: llama.block_count u32 = 32\n",
"llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336\n",
"llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n",
"llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n",
"llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8\n",
"llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n",
"llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000\n",
"llama_model_loader: - kv 11: general.file_type u32 = 12\n",
"llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n",
"llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32001] = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n",
"llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32001] = [0.000000, 0.000000, 0.000000, 0.0000...\n",
"llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32001] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
"llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n",
"llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 2\n",
"llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0\n",
"llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 2\n",
"llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true\n",
"llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false\n",
"llama_model_loader: - kv 22: general.quantization_version u32 = 2\n",
"llama_model_loader: - type f32: 65 tensors\n",
"llama_model_loader: - type q3_K: 129 tensors\n",
"llama_model_loader: - type q4_K: 92 tensors\n",
"llama_model_loader: - type q5_K: 4 tensors\n",
"llama_model_loader: - type q6_K: 1 tensors\n",
"llm_load_vocab: special tokens definition check successful ( 260/32001 ).\n",
"llm_load_print_meta: format = GGUF V3 (latest)\n",
"llm_load_print_meta: arch = llama\n",
"llm_load_print_meta: vocab type = SPM\n",
"llm_load_print_meta: n_vocab = 32001\n",
"llm_load_print_meta: n_merges = 0\n",
"llm_load_print_meta: n_ctx_train = 32768\n",
"llm_load_print_meta: n_embd = 4096\n",
"llm_load_print_meta: n_head = 32\n",
"llm_load_print_meta: n_head_kv = 8\n",
"llm_load_print_meta: n_layer = 32\n",
"llm_load_print_meta: n_rot = 128\n",
"llm_load_print_meta: n_embd_head_k = 128\n",
"llm_load_print_meta: n_embd_head_v = 128\n",
"llm_load_print_meta: n_gqa = 4\n",
"llm_load_print_meta: n_embd_k_gqa = 1024\n",
"llm_load_print_meta: n_embd_v_gqa = 1024\n",
"llm_load_print_meta: f_norm_eps = 0.0e+00\n",
"llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
"llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
"llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
"llm_load_print_meta: n_ff = 14336\n",
"llm_load_print_meta: n_expert = 0\n",
"llm_load_print_meta: n_expert_used = 0\n",
"llm_load_print_meta: rope scaling = linear\n",
"llm_load_print_meta: freq_base_train = 10000.0\n",
"llm_load_print_meta: freq_scale_train = 1\n",
"llm_load_print_meta: n_yarn_orig_ctx = 32768\n",
"llm_load_print_meta: rope_finetuned = unknown\n",
"llm_load_print_meta: model type = 7B\n",
"llm_load_print_meta: model ftype = Q3_K - Medium\n",
"llm_load_print_meta: model params = 7.24 B\n",
"llm_load_print_meta: model size = 3.28 GiB (3.89 BPW) \n",
"llm_load_print_meta: general.name = cognitivecomputations_dolphin-2.6-mistral-7b-dpo-laser\n",
"llm_load_print_meta: BOS token = 1 '<s>'\n",
"llm_load_print_meta: EOS token = 2 '</s>'\n",
"llm_load_print_meta: UNK token = 0 '<unk>'\n",
"llm_load_print_meta: PAD token = 2 '</s>'\n",
"llm_load_print_meta: LF token = 13 '<0x0A>'\n",
"llm_load_tensors: ggml ctx size = 0.22 MiB\n",
"llm_load_tensors: offloading 32 repeating layers to GPU\n",
"llm_load_tensors: offloading non-repeating layers to GPU\n",
"llm_load_tensors: offloaded 33/33 layers to GPU\n",
"llm_load_tensors: CPU buffer size = 53.71 MiB\n",
"llm_load_tensors: Vulkan buffer size = 3301.56 MiB\n",
"..................................................................................................\n",
"llama_new_context_with_model: n_ctx = 512\n",
"llama_new_context_with_model: freq_base = 10000.0\n",
"llama_new_context_with_model: freq_scale = 1\n",
"llama_kv_cache_init: Vulkan KV buffer size = 64.00 MiB\n",
"llama_new_context_with_model: KV self size = 64.00 MiB, K (f16): 32.00 MiB, V (f16): 32.00 MiB\n",
"llama_new_context_with_model: Vulkan_Host input buffer size = 9.01 MiB\n",
"llama_new_context_with_model: Vulkan compute buffer size = 80.30 MiB\n",
"llama_new_context_with_model: Vulkan_Host compute buffer size = 8.80 MiB\n",
"llama_new_context_with_model: graph splits (measure): 3\n",
"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | \n",
"Model metadata: {'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '2', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '10000.000000', 'llama.context_length': '32768', 'general.name': 'cognitivecomputations_dolphin-2.6-mistral-7b-dpo-laser', 'tokenizer.ggml.add_bos_token': 'true', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '14336', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'tokenizer.ggml.bos_token_id': '1', 'llama.attention.head_count': '32', 'llama.block_count': '32', 'llama.attention.head_count_kv': '8', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '12'}\n"
]
}
],
"source": [
"# Load dolphin-2.6-mistral-7b (Q3_K_M GGUF) with the ChatML prompt format.\n",
"# n_gpu_layers=33 offloads all 33 layers to the GPU (Vulkan build, per the\n",
"# loader output above); fixed seed makes sampling reproducible across runs.\n",
"model = llama_cpp.Llama(\n",
"    model_path=\"dolphin-2.6-mistral-7b-dpo-laser.Q3_K_M.gguf\",\n",
"    chat_format=\"chatml\",\n",
"    seed=123123123,\n",
"    n_gpu_layers=33\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "991304d4-4d9e-4943-b540-71461910fb00",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'id': 'chatcmpl-45703b05-3d21-4f9a-a1f5-8c658ffda26f', 'object': 'chat.completion', 'created': 1706567539, 'model': 'dolphin-2.6-mistral-7b-dpo-laser.Q3_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': \" The meaning of life is a philosophical question that has been pondered by humans for centuries. It's a deeply personal and subjective question, as each individual may have their own unique interpretation or purpose in life. Some people believe that the meaning of life lies in pursuing happiness, while others find it in helping others, achieving success, or seeking spiritual enlightenment.\\n\\nUltimately, the meaning of life is a deeply personal and subjective question that can't be definitively answered by anyone else. It's up to each individual to discover their own sense of purpose and fulfillment through their experiences, relationships, and actions.\"}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 34, 'completion_tokens': 134, 'total_tokens': 168}}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"llama_print_timings: load time = 1198.17 ms\n",
"llama_print_timings: sample time = 24.32 ms / 135 runs ( 0.18 ms per token, 5550.53 tokens per second)\n",
"llama_print_timings: prompt eval time = 1198.10 ms / 34 tokens ( 35.24 ms per token, 28.38 tokens per second)\n",
"llama_print_timings: eval time = 42413.28 ms / 134 runs ( 316.52 ms per token, 3.16 tokens per second)\n",
"llama_print_timings: total time = 43938.55 ms / 168 tokens\n"
]
}
],
"source": [
"# Single-turn question against the GPU-offloaded model.\n",
"# create_chat_completion returns an OpenAI-style dict\n",
"# (id / choices / usage, as seen in the printed output).\n",
"print(model.create_chat_completion(\n",
"    messages=[{\n",
"        \"role\": \"user\",\n",
"        \"content\": \"what is the meaning of life?\"\n",
"    }]\n",
"))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "40de568a-acd4-4d01-8154-772875725016",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from dolphin-2.6-mistral-7b-dpo-laser.Q3_K_M.gguf (version GGUF V3 (latest))\n",
"llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
"llama_model_loader: - kv 0: general.architecture str = llama\n",
"llama_model_loader: - kv 1: general.name str = cognitivecomputations_dolphin-2.6-mis...\n",
"llama_model_loader: - kv 2: llama.context_length u32 = 32768\n",
"llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n",
"llama_model_loader: - kv 4: llama.block_count u32 = 32\n",
"llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336\n",
"llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n",
"llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n",
"llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8\n",
"llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n",
"llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000\n",
"llama_model_loader: - kv 11: general.file_type u32 = 12\n",
"llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n",
"llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32001] = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n",
"llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32001] = [0.000000, 0.000000, 0.000000, 0.0000...\n",
"llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32001] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
"llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n",
"llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 2\n",
"llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0\n",
"llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 2\n",
"llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true\n",
"llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false\n",
"llama_model_loader: - kv 22: general.quantization_version u32 = 2\n",
"llama_model_loader: - type f32: 65 tensors\n",
"llama_model_loader: - type q3_K: 129 tensors\n",
"llama_model_loader: - type q4_K: 92 tensors\n",
"llama_model_loader: - type q5_K: 4 tensors\n",
"llama_model_loader: - type q6_K: 1 tensors\n",
"llm_load_vocab: special tokens definition check successful ( 260/32001 ).\n",
"llm_load_print_meta: format = GGUF V3 (latest)\n",
"llm_load_print_meta: arch = llama\n",
"llm_load_print_meta: vocab type = SPM\n",
"llm_load_print_meta: n_vocab = 32001\n",
"llm_load_print_meta: n_merges = 0\n",
"llm_load_print_meta: n_ctx_train = 32768\n",
"llm_load_print_meta: n_embd = 4096\n",
"llm_load_print_meta: n_head = 32\n",
"llm_load_print_meta: n_head_kv = 8\n",
"llm_load_print_meta: n_layer = 32\n",
"llm_load_print_meta: n_rot = 128\n",
"llm_load_print_meta: n_embd_head_k = 128\n",
"llm_load_print_meta: n_embd_head_v = 128\n",
"llm_load_print_meta: n_gqa = 4\n",
"llm_load_print_meta: n_embd_k_gqa = 1024\n",
"llm_load_print_meta: n_embd_v_gqa = 1024\n",
"llm_load_print_meta: f_norm_eps = 0.0e+00\n",
"llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
"llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
"llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
"llm_load_print_meta: n_ff = 14336\n",
"llm_load_print_meta: n_expert = 0\n",
"llm_load_print_meta: n_expert_used = 0\n",
"llm_load_print_meta: rope scaling = linear\n",
"llm_load_print_meta: freq_base_train = 10000.0\n",
"llm_load_print_meta: freq_scale_train = 1\n",
"llm_load_print_meta: n_yarn_orig_ctx = 32768\n",
"llm_load_print_meta: rope_finetuned = unknown\n",
"llm_load_print_meta: model type = 7B\n",
"llm_load_print_meta: model ftype = Q3_K - Medium\n",
"llm_load_print_meta: model params = 7.24 B\n",
"llm_load_print_meta: model size = 3.28 GiB (3.89 BPW) \n",
"llm_load_print_meta: general.name = cognitivecomputations_dolphin-2.6-mistral-7b-dpo-laser\n",
"llm_load_print_meta: BOS token = 1 '<s>'\n",
"llm_load_print_meta: EOS token = 2 '</s>'\n",
"llm_load_print_meta: UNK token = 0 '<unk>'\n",
"llm_load_print_meta: PAD token = 2 '</s>'\n",
"llm_load_print_meta: LF token = 13 '<0x0A>'\n",
"llm_load_tensors: ggml ctx size = 0.11 MiB\n",
"llm_load_tensors: offloading 0 repeating layers to GPU\n",
"llm_load_tensors: offloaded 0/33 layers to GPU\n",
"llm_load_tensors: CPU buffer size = 3355.27 MiB\n",
"..................................................................................................\n",
"llama_new_context_with_model: n_ctx = 512\n",
"llama_new_context_with_model: freq_base = 10000.0\n",
"llama_new_context_with_model: freq_scale = 1\n",
"llama_kv_cache_init: Vulkan_Host KV buffer size = 64.00 MiB\n",
"llama_new_context_with_model: KV self size = 64.00 MiB, K (f16): 32.00 MiB, V (f16): 32.00 MiB\n",
"llama_new_context_with_model: Vulkan_Host input buffer size = 9.01 MiB\n",
"llama_new_context_with_model: Vulkan_Host compute buffer size = 79.20 MiB\n",
"llama_new_context_with_model: graph splits (measure): 1\n",
"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | \n",
"Model metadata: {'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '2', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '10000.000000', 'llama.context_length': '32768', 'general.name': 'cognitivecomputations_dolphin-2.6-mistral-7b-dpo-laser', 'tokenizer.ggml.add_bos_token': 'true', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '14336', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'tokenizer.ggml.bos_token_id': '1', 'llama.attention.head_count': '32', 'llama.block_count': '32', 'llama.attention.head_count_kv': '8', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '12'}\n"
]
}
],
"source": [
"# Reload the same model CPU-only (n_gpu_layers=0 — the loader output above\n",
"# confirms 0/33 layers offloaded) with the same seed, so the following\n",
"# completion can be timed against the GPU run.\n",
"model = llama_cpp.Llama(\n",
"    model_path=\"dolphin-2.6-mistral-7b-dpo-laser.Q3_K_M.gguf\",\n",
"    chat_format=\"chatml\",\n",
"    seed=123123123,\n",
"    n_gpu_layers=0\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "1d106f5c-0cd0-49fa-9eaa-b5aadfd5c02b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Llama.generate: prefix-match hit\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'id': 'chatcmpl-0296ae59-7115-4fa2-96bd-596f21b00888', 'object': 'chat.completion', 'created': 1706567793, 'model': 'dolphin-2.6-mistral-7b-dpo-laser.Q3_K_M.gguf', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': \" The meaning of life is a philosophical question that has been pondered by humans throughout history. Different people and cultures have their own interpretations, but there isn't one universally accepted answer. Some believe that the purpose of life is to find happiness, love, or fulfillment through relationships, personal growth, or spiritual enlightenment. Others think that our purpose is to contribute positively to society by helping others, pursuing a career, or making a difference in the world.\\n\\nUltimately, the meaning of life is subjective and can vary greatly from person to person. It's up to each individual to find their own sense of purpose and fulfillment through their experiences, relationships, and personal values.\"}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 34, 'completion_tokens': 149, 'total_tokens': 183}}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"llama_print_timings: load time = 2045.48 ms\n",
"llama_print_timings: sample time = 18.05 ms / 150 runs ( 0.12 ms per token, 8308.41 tokens per second)\n",
"llama_print_timings: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second)\n",
"llama_print_timings: eval time = 13261.52 ms / 150 runs ( 88.41 ms per token, 11.31 tokens per second)\n",
"llama_print_timings: total time = 13523.55 ms / 151 tokens\n"
]
}
],
"source": [
"# Repeat the identical prompt on the CPU-only model; compare the\n",
"# llama_print_timings output below with the earlier GPU run.\n",
"# NOTE(review): 'Llama.generate: prefix-match hit' in stderr means the\n",
"# prompt was served from cache, so prompt-eval timing here is not comparable.\n",
"print(model.create_chat_completion(\n",
"    messages=[{\n",
"        \"role\": \"user\",\n",
"        \"content\": \"what is the meaning of life?\"\n",
"    }]\n",
"))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment