BPE Encoding Tutorial from Hugging Face
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "4d0db2a2",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-08-30 10:13:27.900353: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4424ba5c3b5442f2932858d518ded7a5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/665 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a380316b33674902878f918db676cbbf",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/1.04M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "906fe17299b147da946166486d4f6f63",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/456k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "601bc6160bdd447882ec2769dafa6606",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/1.36M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")"
]
},
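{
"cell_type": "markdown",
"id": "0e1a2b3c",
"metadata": {},
"source": [
"We only borrow the GPT-2 tokenizer for its byte-level pre-tokenizer; the BPE merges themselves are trained from scratch below. As a quick sanity check, the pretrained tokenizer can already segment text with its own, much larger, learned vocabulary:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1f2a3b4c",
"metadata": {},
"outputs": [],
"source": [
"# Sanity check (not part of the training algorithm): the pretrained GPT-2\n",
"# tokenizer applies its own byte-level BPE; run it to see tokens like 'Ġis'.\n",
"tokenizer.tokenize(\"This is the Hugging Face Course.\")"
]
},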
{
"cell_type": "code",
"execution_count": 3,
"id": "7d429e99",
"metadata": {},
"outputs": [],
"source": [
"corpus = [\n",
" \"This is the Hugging Face Course.\",\n",
" \"This chapter is about tokenization.\",\n",
" \"This section shows several tokenizer algorithbms.\",\n",
" \"Hopefully, you will be able to understand how they are trained and generate tokens.\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "862da2c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Gunnar', (0, 6)),\n",
" ('Ġ', (6, 7)),\n",
" ('Ġeats', (7, 12)),\n",
" ('Ċ', (12, 13)),\n",
" ('cake', (13, 17)),\n",
" ('.', (17, 18))]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Gunnar eats\\ncake.\")"
]
},
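{
"cell_type": "markdown",
"id": "2a3b4c5d",
"metadata": {},
"source": [
"In the output above, `Ġ` stands for a space and `Ċ` for a newline: the byte-level pre-tokenizer maps every raw byte to a printable Unicode character, and each tuple is the character offset span in the original string. For bytes 0 through 32, which include the newline and the space, the standard GPT-2 mapping happens to coincide with a shift of 256 code points; the sketch below illustrates this (it is an aside about the mapping, not part of the training code):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b4c5d6e",
"metadata": {},
"outputs": [],
"source": [
"# For bytes 0-32, GPT-2's byte-to-unicode table coincides with chr(byte + 256).\n",
"print(chr(ord(\" \") + 256))   # 'Ġ', the space marker\n",
"print(chr(ord(\"\\n\") + 256))  # 'Ċ', the newline marker"
]
},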
{
"cell_type": "code",
"execution_count": 4,
"id": "be1a4896",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"defaultdict(<class 'int'>, {'This': 3, 'Ġis': 2, 'Ġthe': 1, 'ĠHugging': 1, 'ĠFace': 1, 'ĠCourse': 1, '.': 4, 'Ġchapter': 1, 'Ġabout': 1, 'Ġtokenization': 1, 'Ġsection': 1, 'Ġshows': 1, 'Ġseveral': 1, 'Ġtokenizer': 1, 'Ġalgorithms': 1, 'Hopefully': 1, ',': 1, 'Ġyou': 1, 'Ġwill': 1, 'Ġbe': 1, 'Ġable': 1, 'Ġto': 1, 'Ġunderstand': 1, 'Ġhow': 1, 'Ġthey': 1, 'Ġare': 1, 'Ġtrained': 1, 'Ġand': 1, 'Ġgenerate': 1, 'Ġtokens': 1})\n"
]
}
],
"source": [
"from collections import defaultdict\n",
"\n",
"word_freqs = defaultdict(int)\n",
"\n",
"for text in corpus:\n",
" words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n",
" new_words = [word for word, offset in words_with_offsets]\n",
" for word in new_words:\n",
" word_freqs[word] += 1\n",
"\n",
"print(word_freqs)"
]
},
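{
"cell_type": "markdown",
"id": "4c5d6e7f",
"metadata": {},
"source": [
"The same counts can be collected more compactly with `collections.Counter`; this is just an equivalent sketch of the loop above, not a different algorithm:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d6e7f80",
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"\n",
"# Equivalent word counting in one expression (same pre-tokenizer, same counts).\n",
"word_freqs_check = Counter(\n",
"    word\n",
"    for text in corpus\n",
"    for word, _ in tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n",
")\n",
"assert dict(word_freqs_check) == dict(word_freqs)"
]
},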
{
"cell_type": "code",
"execution_count": 5,
"id": "263b8335",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']\n"
]
}
],
"source": [
"alphabet = []\n",
"\n",
"for word in word_freqs.keys():\n",
" for letter in word:\n",
" if letter not in alphabet:\n",
" alphabet.append(letter)\n",
"alphabet.sort()\n",
"\n",
"print(alphabet)"
]
},
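{
"cell_type": "markdown",
"id": "6e7f8091",
"metadata": {},
"source": [
"The same alphabet can be built in one line with a set comprehension; the assertion below just confirms the two approaches agree:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7f8091a2",
"metadata": {},
"outputs": [],
"source": [
"# Equivalent: collect the unique characters, then sort them.\n",
"assert alphabet == sorted({c for word in word_freqs for c in word})"
]
},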
{
"cell_type": "code",
"execution_count": 26,
"id": "39678c7c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"31"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab = [\"<|endoftext|>\"] + alphabet.copy()\n",
"len(vocab)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "16c8ecf2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'This': ['T', 'h', 'i', 's'],\n",
" 'Ġis': ['Ġ', 'i', 's'],\n",
" 'Ġthe': ['Ġ', 't', 'h', 'e'],\n",
" 'ĠHugging': ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'],\n",
" 'ĠFace': ['Ġ', 'F', 'a', 'c', 'e'],\n",
" 'ĠCourse': ['Ġ', 'C', 'o', 'u', 'r', 's', 'e'],\n",
" '.': ['.'],\n",
" 'Ġchapter': ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'],\n",
" 'Ġabout': ['Ġ', 'a', 'b', 'o', 'u', 't'],\n",
" 'Ġtokenization': ['Ġ',\n",
" 't',\n",
" 'o',\n",
" 'k',\n",
" 'e',\n",
" 'n',\n",
" 'i',\n",
" 'z',\n",
" 'a',\n",
" 't',\n",
" 'i',\n",
" 'o',\n",
" 'n'],\n",
" 'Ġsection': ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'],\n",
" 'Ġshows': ['Ġ', 's', 'h', 'o', 'w', 's'],\n",
" 'Ġseveral': ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'],\n",
" 'Ġtokenizer': ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'],\n",
" 'Ġalgorithms': ['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'],\n",
" 'Hopefully': ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'],\n",
" ',': [','],\n",
" 'Ġyou': ['Ġ', 'y', 'o', 'u'],\n",
" 'Ġwill': ['Ġ', 'w', 'i', 'l', 'l'],\n",
" 'Ġbe': ['Ġ', 'b', 'e'],\n",
" 'Ġable': ['Ġ', 'a', 'b', 'l', 'e'],\n",
" 'Ġto': ['Ġ', 't', 'o'],\n",
" 'Ġunderstand': ['Ġ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd'],\n",
" 'Ġhow': ['Ġ', 'h', 'o', 'w'],\n",
" 'Ġthey': ['Ġ', 't', 'h', 'e', 'y'],\n",
" 'Ġare': ['Ġ', 'a', 'r', 'e'],\n",
" 'Ġtrained': ['Ġ', 't', 'r', 'a', 'i', 'n', 'e', 'd'],\n",
" 'Ġand': ['Ġ', 'a', 'n', 'd'],\n",
" 'Ġgenerate': ['Ġ', 'g', 'e', 'n', 'e', 'r', 'a', 't', 'e'],\n",
" 'Ġtokens': ['Ġ', 't', 'o', 'k', 'e', 'n', 's']}"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"splits = {word: [c for c in word] for word in word_freqs.keys()}\n",
"splits"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "8bc3aaaf",
"metadata": {},
"outputs": [],
"source": [
"def compute_pair_freqs(splits):\n",
" pair_freqs = defaultdict(int)\n",
" for word, freq in word_freqs.items():\n",
" split = splits[word]\n",
" if len(split) == 1:\n",
" continue\n",
" for i in range(len(split) - 1):\n",
" pair = (split[i], split[i + 1])\n",
" pair_freqs[pair] += freq\n",
" return pair_freqs"
]
},
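{
"cell_type": "markdown",
"id": "8091a2b3",
"metadata": {},
"source": [
"A quick hand-check before using the function: `'This'` occurs three times in the corpus and is the only word containing adjacent `'T'` and `'h'`, so the pair `('T', 'h')` should be counted exactly 3 times, matching the printout in the next cell:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "91a2b3c4",
"metadata": {},
"outputs": [],
"source": [
"# 'This' appears 3 times, so each pair inside its split contributes 3.\n",
"assert compute_pair_freqs(splits)[(\"T\", \"h\")] == 3"
]
},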
{
"cell_type": "code",
"execution_count": 24,
"id": "bf4729a6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('T', 'h'): 3\n",
"('h', 'i'): 3\n",
"('i', 's'): 5\n",
"('Ġ', 'i'): 2\n",
"('Ġ', 't'): 7\n",
"('t', 'h'): 3\n"
]
}
],
"source": [
"pair_freqs = compute_pair_freqs(splits)\n",
"\n",
"for i, key in enumerate(pair_freqs.keys()):\n",
" print(f\"{key}: {pair_freqs[key]}\")\n",
" if i >= 5:\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "b728166b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('Ġ', 't') 7\n"
]
}
],
"source": [
"best_pair = \"\"\n",
"max_freq = None\n",
"\n",
"for pair, freq in pair_freqs.items():\n",
" if max_freq is None or max_freq < freq:\n",
" best_pair = pair\n",
" max_freq = freq\n",
"\n",
"print(best_pair, max_freq)"
]
},
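{
"cell_type": "markdown",
"id": "a2b3c4d5",
"metadata": {},
"source": [
"The same argmax can be written in one line with `max` and a key function; ties resolve to the first pair in iteration order in both versions:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3c4d5e6",
"metadata": {},
"outputs": [],
"source": [
"# One-line equivalent of the loop above (same tie-breaking: first maximum wins).\n",
"best_pair_check = max(pair_freqs, key=pair_freqs.get)\n",
"assert best_pair_check == best_pair"
]
},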
{
"cell_type": "code",
"execution_count": 11,
"id": "1a5c67f7",
"metadata": {},
"outputs": [],
"source": [
"def merge_pair(a, b, splits):\n",
" for word in word_freqs:\n",
" split = splits[word]\n",
" if len(split) == 1:\n",
" continue\n",
"\n",
" i = 0\n",
" while i < len(split) - 1:\n",
" if split[i] == a and split[i + 1] == b:\n",
" split = split[:i] + [a + b] + split[i + 2 :]\n",
" else:\n",
" i += 1\n",
" splits[word] = split\n",
" return splits"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a246f6a5",
"metadata": {},
"outputs": [],
"source": [
"merges = {(\"Ġ\", \"t\"): \"Ġt\"}\n",
"vocab.append(\"Ġt\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "622788eb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']\n"
]
}
],
"source": [
"splits = merge_pair(\"Ġ\", \"t\", splits)\n",
"print(splits[\"Ġtrained\"])"
]
},
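{
"cell_type": "markdown",
"id": "c4d5e6f7",
"metadata": {},
"source": [
"After this first merge, no split should contain `'Ġ'` immediately followed by `'t'` any more; a small assertion makes that explicit:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5e6f708",
"metadata": {},
"outputs": [],
"source": [
"# Every ('Ġ', 't') occurrence has been replaced by the merged symbol 'Ġt'.\n",
"assert not any(\n",
"    split[i] == \"Ġ\" and split[i + 1] == \"t\"\n",
"    for split in splits.values()\n",
"    for i in range(len(split) - 1)\n",
")"
]
},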
{
"cell_type": "code",
"execution_count": 14,
"id": "6d857449",
"metadata": {},
"outputs": [],
"source": [
"vocab_size = 50\n",
"\n",
"while len(vocab) < vocab_size:\n",
" pair_freqs = compute_pair_freqs(splits)\n",
" best_pair = \"\"\n",
" max_freq = None\n",
" for pair, freq in pair_freqs.items():\n",
" if max_freq is None or max_freq < freq:\n",
" best_pair = pair\n",
" max_freq = freq\n",
" splits = merge_pair(*best_pair, splits)\n",
" merges[best_pair] = best_pair[0] + best_pair[1]\n",
" vocab.append(best_pair[0] + best_pair[1])"
]
},
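{
"cell_type": "markdown",
"id": "e6f70819",
"metadata": {},
"source": [
"The vocabulary started from 31 symbols (the `<|endoftext|>` special token plus the 30-character alphabet), so reaching the target size of 50 means learning 50 - 31 = 19 merge rules in total, including the `('Ġ', 't')` merge done by hand above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f708192a",
"metadata": {},
"outputs": [],
"source": [
"# 31 initial symbols + 19 learned merge rules = the target vocabulary size.\n",
"print(len(vocab), len(merges))\n",
"assert len(vocab) == vocab_size and len(merges) == vocab_size - 31"
]
},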
{
"cell_type": "code",
"execution_count": 15,
"id": "5bc2dcfd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{('Ġ', 't'): 'Ġt',\n",
" ('i', 's'): 'is',\n",
" ('e', 'r'): 'er',\n",
" ('Ġ', 'a'): 'Ġa',\n",
" ('Ġt', 'o'): 'Ġto',\n",
" ('e', 'n'): 'en',\n",
" ('T', 'h'): 'Th',\n",
" ('Th', 'is'): 'This',\n",
" ('o', 'u'): 'ou',\n",
" ('s', 'e'): 'se',\n",
" ('Ġto', 'k'): 'Ġtok',\n",
" ('Ġtok', 'en'): 'Ġtoken',\n",
" ('n', 'd'): 'nd',\n",
" ('Ġ', 'is'): 'Ġis',\n",
" ('Ġt', 'h'): 'Ġth',\n",
" ('Ġth', 'e'): 'Ġthe',\n",
" ('i', 'n'): 'in',\n",
" ('Ġa', 'b'): 'Ġab',\n",
" ('Ġtoken', 'i'): 'Ġtokeni'}"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merges"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "44b48140",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['<|endoftext|>',\n",
" ',',\n",
" '.',\n",
" 'C',\n",
" 'F',\n",
" 'H',\n",
" 'T',\n",
" 'a',\n",
" 'b',\n",
" 'c',\n",
" 'd',\n",
" 'e',\n",
" 'f',\n",
" 'g',\n",
" 'h',\n",
" 'i',\n",
" 'k',\n",
" 'l',\n",
" 'm',\n",
" 'n',\n",
" 'o',\n",
" 'p',\n",
" 'r',\n",
" 's',\n",
" 't',\n",
" 'u',\n",
" 'v',\n",
" 'w',\n",
" 'y',\n",
" 'z',\n",
" 'Ġ',\n",
" 'Ġt',\n",
" 'is',\n",
" 'er',\n",
" 'Ġa',\n",
" 'Ġto',\n",
" 'en',\n",
" 'Th',\n",
" 'This',\n",
" 'ou',\n",
" 'se',\n",
" 'Ġtok',\n",
" 'Ġtoken',\n",
" 'nd',\n",
" 'Ġis',\n",
" 'Ġth',\n",
" 'Ġthe',\n",
" 'in',\n",
" 'Ġab',\n",
" 'Ġtokeni']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "cf629b1a",
"metadata": {},
"outputs": [],
"source": [
"def tokenize(text):\n",
" pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)\n",
" pre_tokenized_text = [word for word, offset in pre_tokenize_result]\n",
" splits = [[l for l in word] for word in pre_tokenized_text]\n",
" for pair, merge in merges.items():\n",
" for idx, split in enumerate(splits):\n",
" i = 0\n",
" while i < len(split) - 1:\n",
" if split[i] == pair[0] and split[i + 1] == pair[1]:\n",
" split = split[:i] + [merge] + split[i + 2 :]\n",
" else:\n",
" i += 1\n",
" splits[idx] = split\n",
"\n",
" return sum(splits, [])"
]
},
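{
"cell_type": "markdown",
"id": "08192a3b",
"metadata": {},
"source": [
"`tokenize` replays the merge rules in exactly the order they were learned (Python dicts preserve insertion order), and a character the corpus never contained simply stays as a single-character token; byte-level BPE has no unknown-token failure mode at this stage:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "192a3b4c",
"metadata": {},
"outputs": [],
"source": [
"# 'x' never appears in the training corpus, and none of the learned merges\n",
"# apply inside 'extra', so every character stays un-merged.\n",
"assert tokenize(\"extra\") == [\"e\", \"x\", \"t\", \"r\", \"a\"]"
]
},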
{
"cell_type": "code",
"execution_count": 18,
"id": "e3bea0aa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['This', 'Ġis', 'Ġ', 'n', 'o', 't', 'Ġa', 'Ġtoken', '.']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenize(\"This is not a token.\")\n"
]
},
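{
"cell_type": "markdown",
"id": "2a3b4c5e",
"metadata": {},
"source": [
"For comparison, the pretrained GPT-2 tokenizer, trained on a vastly larger corpus, has far more merge rules and is expected to segment the same sentence into fewer, longer tokens:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b4c5e6f",
"metadata": {},
"outputs": [],
"source": [
"# The pretrained tokenizer has many more merges than our toy vocabulary of 50.\n",
"tokenizer.tokenize(\"This is not a token.\")"
]
}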
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}