Unigram Tokenizer Tutorial from Hugging Face
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "1ddf9f86",
"metadata": {},
"outputs": [],
"source": [
"corpus = [\n",
" \"This is the Hugging Face Course.\",\n",
" \"This chapter is about tokenization.\",\n",
" \"This section shows several tokenizer algorithms.\",\n",
" \"Hopefully, you will be able to understand how they are trained and generate tokens.\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "eab45427",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-08-30 10:25:33.079067: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "50f14aa634744fa7a997fd55af6d0cc5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/760 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "21c4921369f84c14b187b9bfec36d933",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/798k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "484f586a5acb4d7da04d7a3f10cc99c1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/1.38M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"xlnet-base-cased\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "534bc241",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(int,\n",
" {'▁This': 3,\n",
" '▁is': 2,\n",
" '▁the': 1,\n",
" '▁Hugging': 1,\n",
" '▁Face': 1,\n",
" '▁Course.': 1,\n",
" '▁chapter': 1,\n",
" '▁about': 1,\n",
" '▁tokenization.': 1,\n",
" '▁section': 1,\n",
" '▁shows': 1,\n",
" '▁several': 1,\n",
" '▁tokenizer': 1,\n",
" '▁algorithms.': 1,\n",
" '▁Hopefully,': 1,\n",
" '▁you': 1,\n",
" '▁will': 1,\n",
" '▁be': 1,\n",
" '▁able': 1,\n",
" '▁to': 1,\n",
" '▁understand': 1,\n",
" '▁how': 1,\n",
" '▁they': 1,\n",
" '▁are': 1,\n",
" '▁trained': 1,\n",
" '▁and': 1,\n",
" '▁generate': 1,\n",
" '▁tokens.': 1})"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from collections import defaultdict\n",
"\n",
"word_freqs = defaultdict(int)\n",
"for text in corpus:\n",
" words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n",
" new_words = [word for word, offset in words_with_offsets]\n",
" for word in new_words:\n",
" word_freqs[word] += 1\n",
"\n",
"word_freqs"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "328b80c2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('▁t', 7),\n",
" ('is', 5),\n",
" ('er', 5),\n",
" ('▁a', 5),\n",
" ('▁to', 4),\n",
" ('to', 4),\n",
" ('en', 4),\n",
" ('▁T', 3),\n",
" ('▁Th', 3),\n",
" ('▁Thi', 3)]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"char_freqs = defaultdict(int)\n",
"subwords_freqs = defaultdict(int)\n",
"for word, freq in word_freqs.items():\n",
" for i in range(len(word)):\n",
" char_freqs[word[i]] += freq\n",
" # Loop through the subwords of length at least 2\n",
" for j in range(i + 2, len(word) + 1):\n",
" subwords_freqs[word[i:j]] += freq\n",
"\n",
"# Sort subwords by frequency\n",
"sorted_subwords = sorted(subwords_freqs.items(), key=lambda x: x[1], reverse=True)\n",
"sorted_subwords[:10]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "d70bb8d1",
"metadata": {},
"outputs": [],
"source": [
"token_freqs = list(char_freqs.items()) + sorted_subwords[: 300 - len(char_freqs)]\n",
"token_freqs = {token: freq for token, freq in token_freqs}"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "ec931454",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'▁': 31,\n",
" 'T': 3,\n",
" 'h': 9,\n",
" 'i': 13,\n",
" 's': 13,\n",
" 't': 14,\n",
" 'e': 21,\n",
" 'H': 2,\n",
" 'u': 6,\n",
" 'g': 5,\n",
" 'n': 11,\n",
" 'F': 1,\n",
" 'a': 12,\n",
" 'c': 3,\n",
" 'C': 1,\n",
" 'o': 13,\n",
" 'r': 9,\n",
" '.': 4,\n",
" 'p': 2,\n",
" 'b': 3,\n",
" 'k': 3,\n",
" 'z': 2,\n",
" 'w': 3,\n",
" 'v': 1,\n",
" 'l': 7,\n",
" 'm': 1,\n",
" 'f': 1,\n",
" 'y': 3,\n",
" ',': 1,\n",
" 'd': 4,\n",
" '▁t': 7,\n",
" 'is': 5,\n",
" 'er': 5,\n",
" '▁a': 5,\n",
" '▁to': 4,\n",
" 'to': 4,\n",
" 'en': 4,\n",
" '▁T': 3,\n",
" '▁Th': 3,\n",
" '▁Thi': 3,\n",
" '▁This': 3,\n",
" 'Th': 3,\n",
" 'Thi': 3,\n",
" 'This': 3,\n",
" 'hi': 3,\n",
" 'his': 3,\n",
" 'th': 3,\n",
" 'ou': 3,\n",
" 'se': 3,\n",
" '▁tok': 3,\n",
" '▁toke': 3,\n",
" '▁token': 3,\n",
" 'tok': 3,\n",
" 'toke': 3,\n",
" 'token': 3,\n",
" 'ok': 3,\n",
" 'oke': 3,\n",
" 'oken': 3,\n",
" 'ke': 3,\n",
" 'ken': 3,\n",
" '▁s': 3,\n",
" 'ra': 3,\n",
" 'nd': 3,\n",
" '▁i': 2,\n",
" '▁is': 2,\n",
" '▁th': 2,\n",
" '▁the': 2,\n",
" 'the': 2,\n",
" 'he': 2,\n",
" '▁H': 2,\n",
" 'in': 2,\n",
" 'rs': 2,\n",
" 'te': 2,\n",
" '▁ab': 2,\n",
" 'ab': 2,\n",
" '▁tokeni': 2,\n",
" '▁tokeniz': 2,\n",
" 'tokeni': 2,\n",
" 'tokeniz': 2,\n",
" 'okeni': 2,\n",
" 'okeniz': 2,\n",
" 'keni': 2,\n",
" 'keniz': 2,\n",
" 'eni': 2,\n",
" 'eniz': 2,\n",
" 'ni': 2,\n",
" 'niz': 2,\n",
" 'iz': 2,\n",
" 'at': 2,\n",
" 'ti': 2,\n",
" 'tio': 2,\n",
" 'tion': 2,\n",
" 'io': 2,\n",
" 'ion': 2,\n",
" 'on': 2,\n",
" '▁se': 2,\n",
" 'ho': 2,\n",
" 'how': 2,\n",
" 'ow': 2,\n",
" 'era': 2,\n",
" 'al': 2,\n",
" 's.': 2,\n",
" 'll': 2,\n",
" 'an': 2,\n",
" 'and': 2,\n",
" 'ne': 2,\n",
" '▁Hu': 1,\n",
" '▁Hug': 1,\n",
" '▁Hugg': 1,\n",
" '▁Huggi': 1,\n",
" '▁Huggin': 1,\n",
" '▁Hugging': 1,\n",
" 'Hu': 1,\n",
" 'Hug': 1,\n",
" 'Hugg': 1,\n",
" 'Huggi': 1,\n",
" 'Huggin': 1,\n",
" 'Hugging': 1,\n",
" 'ug': 1,\n",
" 'ugg': 1,\n",
" 'uggi': 1,\n",
" 'uggin': 1,\n",
" 'ugging': 1,\n",
" 'gg': 1,\n",
" 'ggi': 1,\n",
" 'ggin': 1,\n",
" 'gging': 1,\n",
" 'gi': 1,\n",
" 'gin': 1,\n",
" 'ging': 1,\n",
" 'ing': 1,\n",
" 'ng': 1,\n",
" '▁F': 1,\n",
" '▁Fa': 1,\n",
" '▁Fac': 1,\n",
" '▁Face': 1,\n",
" 'Fa': 1,\n",
" 'Fac': 1,\n",
" 'Face': 1,\n",
" 'ac': 1,\n",
" 'ace': 1,\n",
" 'ce': 1,\n",
" '▁C': 1,\n",
" '▁Co': 1,\n",
" '▁Cou': 1,\n",
" '▁Cour': 1,\n",
" '▁Cours': 1,\n",
" '▁Course': 1,\n",
" '▁Course.': 1,\n",
" 'Co': 1,\n",
" 'Cou': 1,\n",
" 'Cour': 1,\n",
" 'Cours': 1,\n",
" 'Course': 1,\n",
" 'Course.': 1,\n",
" 'our': 1,\n",
" 'ours': 1,\n",
" 'ourse': 1,\n",
" 'ourse.': 1,\n",
" 'ur': 1,\n",
" 'urs': 1,\n",
" 'urse': 1,\n",
" 'urse.': 1,\n",
" 'rse': 1,\n",
" 'rse.': 1,\n",
" 'se.': 1,\n",
" 'e.': 1,\n",
" '▁c': 1,\n",
" '▁ch': 1,\n",
" '▁cha': 1,\n",
" '▁chap': 1,\n",
" '▁chapt': 1,\n",
" '▁chapte': 1,\n",
" '▁chapter': 1,\n",
" 'ch': 1,\n",
" 'cha': 1,\n",
" 'chap': 1,\n",
" 'chapt': 1,\n",
" 'chapte': 1,\n",
" 'chapter': 1,\n",
" 'ha': 1,\n",
" 'hap': 1,\n",
" 'hapt': 1,\n",
" 'hapte': 1,\n",
" 'hapter': 1,\n",
" 'ap': 1,\n",
" 'apt': 1,\n",
" 'apte': 1,\n",
" 'apter': 1,\n",
" 'pt': 1,\n",
" 'pte': 1,\n",
" 'pter': 1,\n",
" 'ter': 1,\n",
" '▁abo': 1,\n",
" '▁abou': 1,\n",
" '▁about': 1,\n",
" 'abo': 1,\n",
" 'abou': 1,\n",
" 'about': 1,\n",
" 'bo': 1,\n",
" 'bou': 1,\n",
" 'bout': 1,\n",
" 'out': 1,\n",
" 'ut': 1,\n",
" '▁tokeniza': 1,\n",
" '▁tokenizat': 1,\n",
" '▁tokenizati': 1,\n",
" '▁tokenizatio': 1,\n",
" '▁tokenization': 1,\n",
" '▁tokenization.': 1,\n",
" 'tokeniza': 1,\n",
" 'tokenizat': 1,\n",
" 'tokenizati': 1,\n",
" 'tokenizatio': 1,\n",
" 'tokenization': 1,\n",
" 'tokenization.': 1,\n",
" 'okeniza': 1,\n",
" 'okenizat': 1,\n",
" 'okenizati': 1,\n",
" 'okenizatio': 1,\n",
" 'okenization': 1,\n",
" 'okenization.': 1,\n",
" 'keniza': 1,\n",
" 'kenizat': 1,\n",
" 'kenizati': 1,\n",
" 'kenizatio': 1,\n",
" 'kenization': 1,\n",
" 'kenization.': 1,\n",
" 'eniza': 1,\n",
" 'enizat': 1,\n",
" 'enizati': 1,\n",
" 'enizatio': 1,\n",
" 'enization': 1,\n",
" 'enization.': 1,\n",
" 'niza': 1,\n",
" 'nizat': 1,\n",
" 'nizati': 1,\n",
" 'nizatio': 1,\n",
" 'nization': 1,\n",
" 'nization.': 1,\n",
" 'iza': 1,\n",
" 'izat': 1,\n",
" 'izati': 1,\n",
" 'izatio': 1,\n",
" 'ization': 1,\n",
" 'ization.': 1,\n",
" 'za': 1,\n",
" 'zat': 1,\n",
" 'zati': 1,\n",
" 'zatio': 1,\n",
" 'zation': 1,\n",
" 'zation.': 1,\n",
" 'ati': 1,\n",
" 'atio': 1,\n",
" 'ation': 1,\n",
" 'ation.': 1,\n",
" 'tion.': 1,\n",
" 'ion.': 1,\n",
" 'on.': 1,\n",
" 'n.': 1,\n",
" '▁sec': 1,\n",
" '▁sect': 1,\n",
" '▁secti': 1,\n",
" '▁sectio': 1,\n",
" '▁section': 1,\n",
" 'sec': 1,\n",
" 'sect': 1,\n",
" 'secti': 1,\n",
" 'sectio': 1,\n",
" 'section': 1,\n",
" 'ec': 1,\n",
" 'ect': 1,\n",
" 'ecti': 1,\n",
" 'ectio': 1,\n",
" 'ection': 1,\n",
" 'ct': 1,\n",
" 'cti': 1,\n",
" 'ctio': 1,\n",
" 'ction': 1,\n",
" '▁sh': 1,\n",
" '▁sho': 1,\n",
" '▁show': 1,\n",
" '▁shows': 1,\n",
" 'sh': 1,\n",
" 'sho': 1,\n",
" 'show': 1,\n",
" 'shows': 1,\n",
" 'hows': 1,\n",
" 'ows': 1,\n",
" 'ws': 1,\n",
" '▁sev': 1,\n",
" '▁seve': 1,\n",
" '▁sever': 1,\n",
" '▁severa': 1,\n",
" '▁several': 1,\n",
" 'sev': 1,\n",
" 'seve': 1,\n",
" 'sever': 1,\n",
" 'severa': 1,\n",
" 'several': 1}"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"token_freqs"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c8622b4f",
"metadata": {},
"outputs": [],
"source": [
"from math import log\n",
"\n",
"total_sum = sum([freq for token, freq in token_freqs.items()])\n",
"model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "65abfe44",
"metadata": {},
"outputs": [],
"source": [
"def encode_word(word, model):\n",
" best_segmentations = [{\"start\": 0, \"score\": 1}] + [\n",
" {\"start\": None, \"score\": None} for _ in range(len(word))\n",
" ]\n",
" for start_idx in range(len(word)):\n",
" # This should be properly filled by the previous steps of the loop\n",
" best_score_at_start = best_segmentations[start_idx][\"score\"]\n",
" for end_idx in range(start_idx + 1, len(word) + 1):\n",
" token = word[start_idx:end_idx]\n",
" if token in model and best_score_at_start is not None:\n",
" score = model[token] + best_score_at_start\n",
" # If we have found a better segmentation ending at end_idx, we update\n",
" if (\n",
" best_segmentations[end_idx][\"score\"] is None\n",
" or best_segmentations[end_idx][\"score\"] > score\n",
" ):\n",
" best_segmentations[end_idx] = {\"start\": start_idx, \"score\": score}\n",
"\n",
" segmentation = best_segmentations[-1]\n",
" if segmentation[\"score\"] is None:\n",
" # We did not find a tokenization of the word -> unknown\n",
" return [\"<unk>\"], None\n",
"\n",
" score = segmentation[\"score\"]\n",
" start = segmentation[\"start\"]\n",
" end = len(word)\n",
" tokens = []\n",
" while start != 0:\n",
" tokens.insert(0, word[start:end])\n",
" next_start = best_segmentations[start][\"start\"]\n",
" end = start\n",
" start = next_start\n",
" tokens.insert(0, word[start:end])\n",
" return tokens, score"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "b3f4d087",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(['H', 'o', 'p', 'e', 'f', 'u', 'll', 'y'], 41.5157494601402)\n",
"(['This'], 6.288267030694535)\n"
]
}
],
"source": [
"print(encode_word(\"Hopefully\", model))\n",
"print(encode_word(\"This\", model))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d0e6991d",
"metadata": {},
"outputs": [],
"source": [
"def compute_loss(model):\n",
" loss = 0\n",
" for word, freq in word_freqs.items():\n",
" _, word_loss = encode_word(word, model)\n",
" loss += freq * word_loss\n",
" return loss"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "8681ebbb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"413.10377642940875"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"compute_loss(model)\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "2cd27f4c",
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"\n",
"\n",
"def compute_scores(model):\n",
" scores = {}\n",
" model_loss = compute_loss(model)\n",
" for token, score in model.items():\n",
" # We always keep tokens of length 1\n",
" if len(token) == 1:\n",
" continue\n",
" model_without_token = copy.deepcopy(model)\n",
" _ = model_without_token.pop(token)\n",
" scores[token] = compute_loss(model_without_token) - model_loss\n",
" return scores"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "3a42bd3c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6.376412403623874\n",
"0.0\n"
]
}
],
"source": [
"scores = compute_scores(model)\n",
"print(scores[\"ll\"])\n",
"print(scores[\"his\"])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "53c09fbc",
"metadata": {},
"outputs": [],
"source": [
"percent_to_remove = 0.1\n",
"while len(model) > 100:\n",
" scores = compute_scores(model)\n",
" sorted_scores = sorted(scores.items(), key=lambda x: x[1])\n",
" # Remove percent_to_remove tokens with the lowest scores.\n",
" for i in range(int(len(model) * percent_to_remove)):\n",
" _ = token_freqs.pop(sorted_scores[i][0])\n",
"\n",
" total_sum = sum([freq for token, freq in token_freqs.items()])\n",
" model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "84633d43",
"metadata": {},
"outputs": [],
"source": [
"def tokenize(text, model):\n",
" words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n",
" pre_tokenized_text = [word for word, offset in words_with_offsets]\n",
" encoded_words = [encode_word(word, model)[0] for word in pre_tokenized_text]\n",
" return sum(encoded_words, [])\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "de95cf77",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['▁This',\n",
" '▁is',\n",
" '▁the',\n",
" '▁Hugging',\n",
" '▁Face',\n",
" '▁',\n",
" 'c',\n",
" 'ou',\n",
" 'r',\n",
" 's',\n",
" 'e',\n",
" '.']"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenize(\"This is the Hugging Face course.\", model)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "03270dbe",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}