Compare llama.cpp vs Meta tokenization for LLaMA v1.
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import llama_cpp\n",
    "from llama_cpp import Llama\n",
    "from tokenizer import Tokenizer\n",
    "\n",
    "model_path = os.path.join(os.environ[\"MODELS_PATH\"], \"Meta\", \"LLaMA\", \"7B\", \"ggml-model-f16.bin\")\n",
    "tokenizer_path = os.path.join(os.environ[\"MODELS_PATH\"], \"Meta\", \"LLaMA2\", \"tokenizer.model\")\n",
    "\n",
    "llama_cpp_llm = Llama(model_path=model_path)\n",
    "meta_tokenizer = Tokenizer(model_path=tokenizer_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_llama_cpp_vocab_id_to_piece():\n",
    "    lparams = llama_cpp.llama_context_default_params()\n",
    "\n",
    "    ctx = llama_cpp.llama_init_from_file(str(model_path).encode(\"utf8\"), lparams)\n",
    "\n",
    "    n_vocab = llama_cpp.llama_n_vocab(ctx)\n",
    "\n",
    "    strings = (llama_cpp.c_char_p * n_vocab)()\n",
    "    scores = (llama_cpp.c_float * n_vocab)()\n",
    "    n_vocab = llama_cpp.c_int(n_vocab)\n",
    "\n",
    "    assert llama_cpp.llama_get_vocab(llama_cpp.llama_context_p(ctx), strings, scores, n_vocab) == n_vocab.value\n",
    "\n",
    "    return strings[:]\n",
    "\n",
    "llama_cpp_id_to_piece = get_llama_cpp_vocab_id_to_piece()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "For prompt \"Hello world\":\n",
      "llama.cpp tokenizer: [10994, 3186]\n",
      "llama.cpp tokenizer + space: [15043, 3186]\n",
      "Meta tokenizer: [15043, 3186]\n",
      "\n",
      "For prompt \" Hello world\":\n",
      "llama.cpp tokenizer: [15043, 3186]\n",
      "llama.cpp tokenizer + space: [29871, 15043, 3186]\n",
      "Meta tokenizer: [29871, 15043, 3186]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for prompt in [\"Hello world\", \" Hello world\"]:\n",
    "    print(f\"For prompt \\\"{prompt}\\\":\")\n",
    "    print(\"llama.cpp tokenizer:\", llama_cpp_llm.tokenize(text=prompt.encode(\"utf-8\"), add_bos=False))\n",
    "    print(\"llama.cpp tokenizer + space:\", llama_cpp_llm.tokenize(text=b' ' + prompt.encode(\"utf-8\"), add_bos=False))\n",
    "    print(\"Meta tokenizer:\", meta_tokenizer.encode(prompt, bos=False, eos=False))\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Doing the reverse process:\n",
      "For tokens \"[10994, 3186]\":\n",
      "llama.cpp tokenizer: |b'Hello world'|\n",
      "Meta tokenizer: |Hello world|\n",
      "\n",
      "For tokens \"[15043, 3186]\":\n",
      "llama.cpp tokenizer: |b' Hello world'|\n",
      "Meta tokenizer: |Hello world|\n",
      "\n",
      "For tokens \"[29871, 15043, 3186]\":\n",
      "llama.cpp tokenizer: |b' Hello world'|\n",
      "Meta tokenizer: | Hello world|\n",
      "\n",
      "*Adding | to ease visualization.\n"
     ]
    }
   ],
   "source": [
    "print(\"Doing the reverse process:\")\n",
    "for tokens in [[10994, 3186], [15043, 3186], [29871, 15043, 3186]]:\n",
    "    print(f\"For tokens \\\"{tokens}\\\":\")\n",
    "    print(\"llama.cpp tokenizer:\", f\"|{llama_cpp_llm.detokenize(tokens)}|\")\n",
    "    print(\"Meta tokenizer:\", f\"|{meta_tokenizer.decode(tokens)}|\")\n",
    "    print()\n",
    "\n",
    "print(\"*Adding | to ease visualization.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking at the id_to_piece for llama.cpp:\n",
      "id 10994 |b'Hello'|\n",
      "id 3186 |b' world'|\n",
      "id 15043 |b' Hello'|\n",
      "id 29871 |b' '|\n",
      "\n",
      "Looking at the id_to_piece for Meta:\n",
      "id 10994 |Hello|\n",
      "id 3186 |▁world|\n",
      "id 15043 |▁Hello|\n",
      "id 29871 |▁|\n",
      "\n",
      "*Adding | to ease visualization.\n",
      "Note: the 29871 token is not the underscore character but \"\\u2581\" (https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol).\n",
      "True\n"
     ]
    }
   ],
   "source": [
    "print(\"Looking at the id_to_piece for llama.cpp:\")\n",
    "for i in [10994, 3186, 15043, 29871]:\n",
    "    print(f\"id {i}\", f\"|{llama_cpp_id_to_piece[i]}|\")\n",
    "\n",
    "print(\"\\nLooking at the id_to_piece for Meta:\")\n",
    "for i in [10994, 3186, 15043, 29871]:\n",
    "    print(f\"id {i}\", f\"|{meta_tokenizer.sp_model.id_to_piece(i)}|\")\n",
    "\n",
    "print(\"\\n*Adding | to ease visualization.\")\n",
    "\n",
    "print(\"Note: the 29871 token is not the underscore character but \\\"\\\\u2581\\\" (https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol).\")\n",
    "print(meta_tokenizer.sp_model.id_to_piece(29871) == \"\\u2581\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using the llama.cpp detokenizer:\n",
      "id 10994 |b'Hello'|\n",
      "id 3186 |b' world'|\n",
      "id 15043 |b' Hello'|\n",
      "id 29871 |b' '|\n",
      "\n",
      "Using the Meta detokenizer:\n",
      "id 10994 |Hello|\n",
      "id 3186 |world|\n",
      "id 15043 |Hello|\n",
      "id 29871 ||\n"
     ]
    }
   ],
   "source": [
    "print(\"Using the llama.cpp detokenizer:\")\n",
    "for i in [10994, 3186, 15043, 29871]:\n",
    "    print(f\"id {i}\", f\"|{llama_cpp_llm.detokenize([i])}|\")\n",
    "\n",
    "print(\"\\nUsing the Meta detokenizer:\")\n",
    "for i in [10994, 3186, 15043, 29871]:\n",
    "    print(f\"id {i}\", f\"|{meta_tokenizer.decode([i])}|\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
} |
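
The mismatch in the first comparison comes from how Meta's tokenizer treats whitespace: it goes through SentencePiece, which by default prepends a "▁" (U+2581) dummy-prefix piece to the input before segmenting, whereas llama.cpp's tokenize (as of this gist) does not, which is why manually prepending a space makes the two agree. Below is a minimal sketch of the same check using the sentencepiece package directly; it assumes sentencepiece is installed and reuses the tokenizer_path from the first cell, and the values in the comments are the ones reported in the notebook outputs above.

import os
import sentencepiece as spm

# Same tokenizer.model path as in the first cell above.
tokenizer_path = os.path.join(os.environ["MODELS_PATH"], "Meta", "LLaMA2", "tokenizer.model")

sp = spm.SentencePieceProcessor(model_file=tokenizer_path)

# SentencePiece prepends "\u2581" before segmenting, so "Hello world"
# becomes "\u2581Hello \u2581world" -> the same ids Meta's encode returns.
print(sp.encode("Hello world", out_type=int))    # [15043, 3186]

# An explicit leading space yields one extra "\u2581" piece, id 29871.
print(sp.encode(" Hello world", out_type=int))   # [29871, 15043, 3186]

# The pieces behind the ids compared in the notebook.
for i in [10994, 3186, 15043, 29871]:
    print(i, sp.id_to_piece(i))                  # Hello, ▁world, ▁Hello, ▁

# Decoding drops the leading dummy-prefix space again, which is why Meta's
# decode of [15043, 3186] has no leading space while llama.cpp's detokenize
# keeps one.
print(repr(sp.decode([15043, 3186])))            # 'Hello world'

This is also what the "llama.cpp tokenizer + space" variant in the notebook mimics: prepending a single space to the prompt before calling llama.cpp's tokenize reproduces the ids that Meta's tokenizer produces.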