Compare llama.cpp vs Meta tokenization for LLaMA v1.
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import llama_cpp\n",
    "from llama_cpp import Llama\n",
    "from tokenizer import Tokenizer\n",
    "\n",
    "model_path = os.path.join(os.environ[\"MODELS_PATH\"], \"Meta\", \"LLaMA\", \"7B\", \"ggml-model-f16.bin\")\n",
    "tokenizer_path = os.path.join(os.environ[\"MODELS_PATH\"], \"Meta\", \"LLaMA2\", \"tokenizer.model\")\n",
    "\n",
    "llama_cpp_llm = Llama(model_path=model_path)\n",
    "meta_tokenizer = Tokenizer(model_path=tokenizer_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_llama_cpp_vocab_id_to_piece():\n",
    "    lparams = llama_cpp.llama_context_default_params()\n",
    "\n",
    "    ctx = llama_cpp.llama_init_from_file(str(model_path).encode(\"utf8\"), lparams)\n",
    "\n",
    "    n_vocab = llama_cpp.llama_n_vocab(ctx)\n",
    "\n",
    "    strings = (llama_cpp.c_char_p * n_vocab)()\n",
    "    scores = (llama_cpp.c_float * n_vocab)()\n",
    "    n_vocab = llama_cpp.c_int(n_vocab)\n",
    "\n",
    "    assert llama_cpp.llama_get_vocab(llama_cpp.llama_context_p(ctx), strings, scores, n_vocab) == n_vocab.value\n",
    "\n",
    "    return strings[:]\n",
    "\n",
    "llama_cpp_id_to_piece = get_llama_cpp_vocab_id_to_piece()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "For prompt \"Hello world\":\n",
      "llama.cpp tokenizer: [10994, 3186]\n",
      "llama.cpp tokenizer + space: [15043, 3186]\n",
      "Meta tokenizer: [15043, 3186]\n",
      "\n",
      "For prompt \" Hello world\":\n",
      "llama.cpp tokenizer: [15043, 3186]\n",
      "llama.cpp tokenizer + space: [29871, 15043, 3186]\n",
      "Meta tokenizer: [29871, 15043, 3186]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for prompt in [\"Hello world\", \" Hello world\"]:\n",
    "    print(f\"For prompt \\\"{prompt}\\\":\")\n",
    "    print(\"llama.cpp tokenizer:\", llama_cpp_llm.tokenize(text=prompt.encode(\"utf-8\"), add_bos=False))\n",
    "    print(\"llama.cpp tokenizer + space:\", llama_cpp_llm.tokenize(text=b' ' + prompt.encode(\"utf-8\"), add_bos=False))\n",
    "    print(\"Meta tokenizer:\", meta_tokenizer.encode(prompt, bos=False, eos=False))\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Doing the reverse process:\n",
      "For tokens \"[10994, 3186]\":\n",
      "llama.cpp tokenizer: |b'Hello world'|\n",
      "Meta tokenizer: |Hello world|\n",
      "\n",
      "For tokens \"[15043, 3186]\":\n",
      "llama.cpp tokenizer: |b' Hello world'|\n",
      "Meta tokenizer: |Hello world|\n",
      "\n",
      "For tokens \"[29871, 15043, 3186]\":\n",
      "llama.cpp tokenizer: |b' Hello world'|\n",
      "Meta tokenizer: | Hello world|\n",
      "\n",
      "*Adding | to ease visualization.\n"
     ]
    }
   ],
   "source": [
    "print(\"Doing the reverse process:\")\n",
    "for tokens in [[10994, 3186], [15043, 3186], [29871, 15043, 3186]]:\n",
    "    print(f\"For tokens \\\"{tokens}\\\":\")\n",
    "    print(\"llama.cpp tokenizer:\", f\"|{llama_cpp_llm.detokenize(tokens)}|\")\n",
    "    print(\"Meta tokenizer:\", f\"|{meta_tokenizer.decode(tokens)}|\")\n",
    "    print()\n",
    "\n",
    "print(\"*Adding | to ease visualization.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking at the id_to_piece for llama.cpp:\n",
      "id 10994 |b'Hello'|\n",
      "id 3186 |b' world'|\n",
      "id 15043 |b' Hello'|\n",
      "id 29871 |b' '|\n",
      "\n",
      "Looking at the id_to_piece for Meta:\n",
      "id 10994 |Hello|\n",
      "id 3186 |▁world|\n",
      "id 15043 |▁Hello|\n",
      "id 29871 |▁|\n",
      "\n",
      "*Adding | to ease visualization.\n",
      "Note: the 29871 token is not the underscore character but \"\\u2581\" (https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol).\n",
      "True\n"
     ]
    }
   ],
   "source": [
    "print(\"Looking at the id_to_piece for llama.cpp:\")\n",
    "for i in [10994, 3186, 15043, 29871]:\n",
    "    print(f\"id {i}\", f\"|{llama_cpp_id_to_piece[i]}|\")\n",
    "\n",
    "print(\"\\nLooking at the id_to_piece for Meta:\")\n",
    "for i in [10994, 3186, 15043, 29871]:\n",
    "    print(f\"id {i}\", f\"|{meta_tokenizer.sp_model.id_to_piece(i)}|\")\n",
    "\n",
    "print(\"\\n*Adding | to ease visualization.\")\n",
    "\n",
    "print(\"Note: the 29871 token is not the underscore character but \\\"\\\\u2581\\\" (https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol).\")\n",
    "print(meta_tokenizer.sp_model.id_to_piece(29871) == \"\\u2581\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using the llama.cpp detokenizer:\n",
      "id 10994 |b'Hello'|\n",
      "id 3186 |b' world'|\n",
      "id 15043 |b' Hello'|\n",
      "id 29871 |b' '|\n",
      "\n",
      "Using the Meta detokenizer:\n",
      "id 10994 |Hello|\n",
      "id 3186 |world|\n",
      "id 15043 |Hello|\n",
      "id 29871 ||\n"
     ]
    }
   ],
   "source": [
    "print(\"Using the llama.cpp detokenizer:\")\n",
    "for i in [10994, 3186, 15043, 29871]:\n",
    "    print(f\"id {i}\", f\"|{llama_cpp_llm.detokenize([i])}|\")\n",
    "\n",
    "print(\"\\nUsing the Meta detokenizer:\")\n",
    "for i in [10994, 3186, 15043, 29871]:\n",
    "    print(f\"id {i}\", f\"|{meta_tokenizer.decode([i])}|\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
} |
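
The mismatch in the first comparison comes from how Meta's tokenizer treats whitespace: it goes through SentencePiece, which by default prepends a "▁" (U+2581) dummy-prefix piece to the input before segmenting, whereas llama.cpp's tokenize (as of this gist) does not, which is why manually prepending a space makes the two agree. Below is a minimal sketch of the same check using the sentencepiece package directly; it assumes sentencepiece is installed and reuses the tokenizer_path from the first cell, and the values in the comments are the ones reported in the notebook outputs above.

import os
import sentencepiece as spm

# Same tokenizer.model path as in the first cell above.
tokenizer_path = os.path.join(os.environ["MODELS_PATH"], "Meta", "LLaMA2", "tokenizer.model")

sp = spm.SentencePieceProcessor(model_file=tokenizer_path)

# SentencePiece prepends "\u2581" before segmenting, so "Hello world"
# becomes "\u2581Hello \u2581world" -> the same ids Meta's encode returns.
print(sp.encode("Hello world", out_type=int))    # [15043, 3186]

# An explicit leading space yields one extra "\u2581" piece, id 29871.
print(sp.encode(" Hello world", out_type=int))   # [29871, 15043, 3186]

# The pieces behind the ids compared in the notebook.
for i in [10994, 3186, 15043, 29871]:
    print(i, sp.id_to_piece(i))                  # Hello, ▁world, ▁Hello, ▁

# Decoding drops the leading dummy-prefix space again, which is why Meta's
# decode of [15043, 3186] has no leading space while llama.cpp's detokenize
# keeps one.
print(repr(sp.decode([15043, 3186])))            # 'Hello world'

This is also what the "llama.cpp tokenizer + space" variant in the notebook mimics: prepending a single space to the prompt before calling llama.cpp's tokenize reproduces the ids that Meta's tokenizer produces.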