@viniciusarruda
Last active July 27, 2023 22:19
Compare llama.cpp vs Meta tokenization for LLaMA v1.
{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n"
]
}
],
"source": [
"import os\n",
"import llama_cpp\n",
"from llama_cpp import Llama\n",
"from tokenizer import Tokenizer\n",
"\n",
"model_path = os.path.join(os.environ[\"MODELS_PATH\"], \"Meta\", \"LLaMA\", \"7B\", \"ggml-model-f16.bin\")\n",
"tokenizer_path = os.path.join(os.environ[\"MODELS_PATH\"], \"Meta\", \"LLaMA2\", \"tokenizer.model\")\n",
"\n",
"llama_cpp_llm = Llama(model_path=model_path)\n",
"meta_tokenizer = Tokenizer(model_path=tokenizer_path)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def get_llama_ccp_vocab_id_to_piece():\n",
" lparams = llama_cpp.llama_context_default_params()\n",
"\n",
" ctx = llama_cpp.llama_init_from_file(str(model_path).encode(\"utf8\"), lparams)\n",
"\n",
" n_vocab = llama_cpp.llama_n_vocab(ctx)\n",
"\n",
" strings = (llama_cpp.c_char_p * n_vocab)()\n",
" scores = (llama_cpp.c_float * n_vocab)()\n",
" n_vocab = llama_cpp.c_int(n_vocab)\n",
"\n",
" assert llama_cpp.llama_get_vocab(llama_cpp.llama_context_p(ctx), strings, scores, n_vocab) == n_vocab.value\n",
"\n",
" return strings[:]\n",
"\n",
"llama_ccp_id_to_piece = get_llama_ccp_vocab_id_to_piece()\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"For prompt \"Hello world\":\n",
"llama.cpp tokenizer: [10994, 3186]\n",
"llama.cpp tokenizer + space: [15043, 3186]\n",
"Meta tokenizer: [15043, 3186]\n",
"\n",
"For prompt \" Hello world\":\n",
"llama.cpp tokenizer: [15043, 3186]\n",
"llama.cpp tokenizer + space: [29871, 15043, 3186]\n",
"Meta tokenizer: [29871, 15043, 3186]\n",
"\n"
]
}
],
"source": [
"for prompt in [\"Hello world\", \" Hello world\"]:\n",
" print(f\"For prompt \\\"{prompt}\\\":\")\n",
" print(\"llama.cpp tokenizer:\", llama_cpp_llm.tokenize(text=prompt.encode(\"utf-8\"), add_bos=False))\n",
" print(\"llama.cpp tokenizer + space:\", llama_cpp_llm.tokenize(text=b' ' + prompt.encode(\"utf-8\"), add_bos=False))\n",
" print(\"Meta tokenizer:\", meta_tokenizer.encode(prompt, bos=False, eos=False))\n",
" print()"
]
},
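{
"cell_type": "markdown",
"metadata": {},
"source": [
"The mismatch above comes from sentencepiece's `add_dummy_prefix` option: Meta's tokenizer always prepends the whitespace symbol `▁` (\"\\u2581\") before encoding, so `Hello` becomes the piece `▁Hello` (id 15043), while this version of llama.cpp does not. Below is a minimal sketch to make the prefix visible, assuming `meta_tokenizer.sp_model` is the underlying `SentencePieceProcessor` (it is used that way further down):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Encode to pieces instead of ids so the dummy prefix is visible.\n",
"# out_type=str is standard sentencepiece API; expected: ['▁Hello', '▁world']\n",
"print(meta_tokenizer.sp_model.encode(\"Hello world\", out_type=str))"
]
},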
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Doing the reverse process:\n",
"For tokens \"[10994, 3186]\":\n",
"llama.cpp tokenizer: |b'Hello world'|\n",
"Meta tokenizer: |Hello world|\n",
"\n",
"For tokens \"[15043, 3186]\":\n",
"llama.cpp tokenizer: |b' Hello world'|\n",
"Meta tokenizer: |Hello world|\n",
"\n",
"For tokens \"[29871, 15043, 3186]\":\n",
"llama.cpp tokenizer: |b' Hello world'|\n",
"Meta tokenizer: | Hello world|\n",
"\n",
"*Adding | to ease visualization.\n"
]
}
],
"source": [
"print(\"Doing the reverse process:\")\n",
"for tokens in [[10994, 3186], [15043, 3186], [29871, 15043, 3186]]:\n",
" print(f\"For tokens \\\"{tokens}\\\":\")\n",
" print(\"llama.cpp tokenizer:\", f\"|{llama_cpp_llm.detokenize(tokens)}|\")\n",
" print(\"Meta tokenizer:\", f\"|{meta_tokenizer.decode(tokens)}|\")\n",
" print()\n",
"\n",
"print(\"*Adding | to ease visualization.\")"
]
},
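{
"cell_type": "markdown",
"metadata": {},
"source": [
"Why does Meta's decode drop the leading space? sentencepiece `decode` reverses `add_dummy_prefix`: it maps `▁` back to a space but strips the one it inserted at the start of the text. A minimal sketch comparing a manual piece join against `decode` (the manual join keeps the leading space, `decode` removes it):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokens = [15043, 3186]\n",
"pieces = [meta_tokenizer.sp_model.id_to_piece(t) for t in tokens]\n",
"manual = \"\".join(pieces).replace(\"\\u2581\", \" \")\n",
"print(pieces)  # expected: ['▁Hello', '▁world']\n",
"print(f\"|{manual}|\")  # expected: | Hello world| (leading space kept)\n",
"print(f\"|{meta_tokenizer.decode(tokens)}|\")  # expected: |Hello world| (leading space stripped)"
]
},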
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking the id_to_piece for llama.cpp:\n",
"id 10994 |b'Hello'|\n",
"id 3186 |b' world'|\n",
"id 15043 |b' Hello'|\n",
"id 29871 |b' '|\n",
"\n",
"Looking the id_to_piece for Meta:\n",
"id 10994 |Hello|\n",
"id 3186 |▁world|\n",
"id 15043 |▁Hello|\n",
"id 29871 |▁|\n",
"\n",
"*Adding | to ease visualization.\n",
"Note, the 29871 token is not the underline character but \"\\u2581\" (https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol).\n",
"True\n"
]
}
],
"source": [
"print(\"Looking the id_to_piece for llama.cpp:\")\n",
"for i in [10994, 3186, 15043, 29871]:\n",
" print(f\"id {i}\", f\"|{llama_ccp_id_to_piece[i]}|\")\n",
"\n",
"print(\"\\nLooking the id_to_piece for Meta:\")\n",
"for i in [10994, 3186, 15043, 29871]:\n",
" print(f\"id {i}\", f\"|{meta_tokenizer.sp_model.id_to_piece(i)}|\")\n",
"\n",
"print(\"\\n*Adding | to ease visualization.\")\n",
"\n",
"print(\"Note, the 29871 token is not the underline character but \\\"\\\\u2581\\\" (https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol).\")\n",
"print(meta_tokenizer.sp_model.id_to_piece(29871) == \"\\u2581\")"
]
},
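{
"cell_type": "markdown",
"metadata": {},
"source": [
"The two vocabularies appear to store the same pieces, with llama.cpp replacing `▁` by a plain ASCII space. A hedged check of that relationship (only these four ids are verified here; extending it to the full vocabulary is an assumption):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for i in [10994, 3186, 15043, 29871]:\n",
"    meta_piece = meta_tokenizer.sp_model.id_to_piece(i).replace(\"\\u2581\", \" \")\n",
"    assert llama_cpp_id_to_piece[i] == meta_piece.encode(\"utf-8\")\n",
"print(\"For these ids, llama.cpp stores the Meta piece with \\u2581 replaced by ' '\")"
]
},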
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using the llama.cpp detokenizer:\n",
"id 10994 |b'Hello'|\n",
"id 3186 |b' world'|\n",
"id 15043 |b' Hello'|\n",
"id 29871 |b' '|\n",
"\n",
"Using the Meta detokenizer:\n",
"id 10994 |Hello|\n",
"id 3186 |world|\n",
"id 15043 |Hello|\n",
"id 29871 ||\n"
]
}
],
"source": [
"print(\"Using the llama.cpp detokenizer:\")\n",
"for i in [10994, 3186, 15043, 29871]:\n",
" print(f\"id {i}\", f\"|{llama_cpp_llm.detokenize([i])}|\")\n",
"\n",
"print(\"\\nUsing the Meta detokenizer:\")\n",
"for i in [10994, 3186, 15043, 29871]:\n",
" print(f\"id {i}\", f\"|{meta_tokenizer.decode([i])}|\")"
]
}
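,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Summary: at this llama.cpp version, `tokenize` does not add sentencepiece's dummy-prefix space and `detokenize` does not strip it. A minimal sketch of a workaround built on `llama_cpp_llm` (the helper names `tokenize_like_meta` / `detokenize_like_meta` are hypothetical, not llama.cpp API):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical helpers (not llama.cpp API) mimicking Meta's tokenizer behavior.\n",
"def tokenize_like_meta(text):\n",
"    # Prepend a space so llama.cpp matches Meta's add_dummy_prefix\n",
"    # (see the \"llama.cpp tokenizer + space\" rows above).\n",
"    return llama_cpp_llm.tokenize(text=b\" \" + text.encode(\"utf-8\"), add_bos=False)\n",
"\n",
"def detokenize_like_meta(tokens):\n",
"    # Strip the single leading space the dummy prefix re-introduces.\n",
"    text = llama_cpp_llm.detokenize(tokens).decode(\"utf-8\")\n",
"    return text[1:] if text.startswith(\" \") else text\n",
"\n",
"print(tokenize_like_meta(\"Hello world\"))  # expected: [15043, 3186]\n",
"print(f\"|{detokenize_like_meta([15043, 3186])}|\")  # expected: |Hello world|"
]
}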
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}