{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from epochraft import CheckpointableDataset\n",
"from transformers import AutoTokenizer, LlamaTokenizer\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataset "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def build_dataset(tokenizer):\n",
" return (\n",
" CheckpointableDataset.from_files(\"s3://polyglot-ja-west/2_quality_filter/v2/cc-100/cc-100_00.jsonl\")\n",
" .tokenize(tokenizer, parallel=False)\n",
" .take(1000) # Using first 10000 samples\n",
" )"
]
},
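{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `s3://polyglot-ja-west/...` path above points at a private bucket, so it is only readable from the original environment. A minimal sketch of the same pipeline on local data, assuming `CheckpointableDataset.from_files` also accepts local paths (the file name below is a hypothetical placeholder; records are expected to carry a `text` field):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def build_dataset_local(tokenizer, path=\"data/sample.jsonl\"):\n",
"    # Hypothetical local JSONL file with one {\"text\": ...} record per line;\n",
"    # reuses the same tokenize / take pipeline as build_dataset above.\n",
"    return (\n",
"        CheckpointableDataset.from_files(path)\n",
"        .tokenize(tokenizer, parallel=False)\n",
"        .take(1000)\n",
"    )"
]
},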
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dict_keys(['text', 'input_ids', 'attention_mask'])\n"
]
}
],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Llama-2-7b-hf\")\n",
"dataset = build_dataset(tokenizer)\n",
"sample = next(iter(dataset)) # Getting the first sample from the dataset\n",
"print(sample.keys())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"52 56\n"
]
}
],
"source": [
"print(len(sample[\"text\"]), len(sample[\"input_ids\"]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compression rates"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def evaluate(tokenizer):\n",
" dataset = build_dataset(tokenizer)\n",
"\n",
" n_chars = 0\n",
" n_tokens = 0\n",
" for sample in dataset:\n",
" n_chars += len(sample[\"text\"])\n",
" n_tokens += len(sample[\"input_ids\"])\n",
"\n",
" print(f\"Compression rate: {n_chars / n_tokens} chars / token ({n_chars} / {n_tokens})\")\n",
" return n_chars / n_tokens"
]
},
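{
"cell_type": "markdown",
"metadata": {},
"source": [
"Worked example of the metric: the single sample above was 52 characters long and became 56 Llama-2 tokens, i.e. 52 / 56 ≈ 0.93 chars/token; aggregated over the 1000-sample subset, GPT-2 yields 92048 / 131238 ≈ 0.70 (first line of the run below). Higher is better: each token covers more of the Japanese text. As a quick sanity check, a single string can also be scored directly through the standard Hugging Face tokenizer call. A minimal sketch (the sentence is arbitrary; `add_special_tokens=False` keeps BOS/EOS from skewing the short-string count):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def chars_per_token(tokenizer, text):\n",
"    # Compression rate for a single string: characters per token.\n",
"    ids = tokenizer(text, add_special_tokens=False)[\"input_ids\"]\n",
"    return len(text) / len(ids)\n",
"\n",
"chars_per_token(tokenizer, \"吾輩は猫である。名前はまだ無い。\")"
]
},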
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Compression rate: 0.7013822216126426 chars / token (92048 / 131238)\n",
"Compression rate: 0.983397790645499 chars / token (92048 / 93602)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Compression rate: 0.8213291455492897 chars / token (92048 / 112072)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Compression rate: 2.048105377923147 chars / token (92048 / 44943)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/fsx/home-takiba/jp-sandbox/takiba/venv/lib/python3.11/site-packages/transformers/convert_slow_tokenizer.py:473: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Compression rate: 1.8418440851608773 chars / token (92048 / 49976)\n",
"Compression rate: 2.0203687445127305 chars / token (92048 / 45560)\n",
"Compression rate: 1.6374277328115272 chars / token (92048 / 56215)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Tokenizer</th>\n",
" <th>Vocab Size</th>\n",
" <th>Chars / Token</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>GPT-2</td>\n",
" <td>50257</td>\n",
" <td>0.701382</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>GPT-NeoX-20B</td>\n",
" <td>50254</td>\n",
" <td>0.983398</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Llama-2</td>\n",
" <td>32000</td>\n",
" <td>0.821329</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Nerdstash (JStableLM)</td>\n",
" <td>65535</td>\n",
" <td>2.048105</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Rinna Japanese</td>\n",
" <td>32000</td>\n",
" <td>1.841844</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Rinna Bilingual</td>\n",
" <td>65536</td>\n",
" <td>2.020369</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Elyza Llama-2</td>\n",
" <td>45043</td>\n",
" <td>1.637428</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Tokenizer Vocab Size Chars / Token\n",
"0 GPT-2 50257 0.701382\n",
"1 GPT-NeoX-20B 50254 0.983398\n",
"2 Llama-2 32000 0.821329\n",
"3 Nerdstash (JStableLM) 65535 2.048105\n",
"4 Rinna Japanese 32000 1.841844\n",
"5 Rinna Bilingual 65536 2.020369\n",
"6 Elyza Llama-2 45043 1.637428"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"TOKENIZERS = [\n",
" (\"gpt2\", AutoTokenizer, \"GPT-2\"),\n",
" (\"EleutherAI/gpt-neox-20b\", AutoTokenizer, \"GPT-NeoX-20B\"),\n",
" (\"meta-llama/Llama-2-7b-hf\", AutoTokenizer, \"Llama-2\"),\n",
" (\"novelai/nerdstash-tokenizer-v1\", LlamaTokenizer, \"Nerdstash (JStableLM)\"),\n",
" (\"rinna/japanese-gpt-neox-3.6b\", AutoTokenizer, \"Rinna Japanese\"),\n",
" (\"rinna/bilingual-gpt-neox-4b\", AutoTokenizer, \"Rinna Bilingual\"),\n",
" (\"elyza/ELYZA-japanese-Llama-2-7b-fast\", AutoTokenizer, \"Elyza Llama-2\"),\n",
"]\n",
"\n",
"def generate_row(tokenizer_url, tokenizer_cls, tokenizer_name):\n",
" tokenizer = tokenizer_cls.from_pretrained(tokenizer_url)\n",
" return {\n",
" \"Tokenizer\": tokenizer_name,\n",
" \"Vocab Size\": tokenizer.vocab_size,\n",
" \"Chars / Token\": evaluate(tokenizer)\n",
" }\n",
"\n",
"pd.DataFrame(\n",
" [\n",
" generate_row(*args)\n",
" for args in TOKENIZERS\n",
" ]\n",
")\n"
]
}
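,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Adding another tokenizer to the comparison only requires appending a `(repo_id, tokenizer_class, display_name)` tuple and rebuilding the table. A minimal sketch (the repo id below is a placeholder, not part of the original run):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical entry: replace the repo id with any tokenizer to be scored.\n",
"TOKENIZERS.append((\"your-org/your-tokenizer\", AutoTokenizer, \"My Tokenizer\"))\n",
"pd.DataFrame([generate_row(*args) for args in TOKENIZERS])"
]
}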
],
"metadata": {
"kernelspec": {
"display_name": "tinypar",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}