Unigram Tokenizer Tutorial from Hugging Face
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "1ddf9f86",
"metadata": {},
"outputs": [],
"source": [
"corpus = [\n",
" \"This is the Hugging Face Course.\",\n",
" \"This chapter is about tokenization.\",\n",
" \"This section shows several tokenizer algorithms.\",\n",
" \"Hopefully, you will be able to understand how they are trained and generate tokens.\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "eab45427",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-08-30 10:25:33.079067: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "50f14aa634744fa7a997fd55af6d0cc5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/760 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "21c4921369f84c14b187b9bfec36d933",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/798k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "484f586a5acb4d7da04d7a3f10cc99c1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/1.38M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"xlnet-base-cased\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "534bc241",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(int,\n",
" {'▁This': 3,\n",
" '▁is': 2,\n",
" '▁the': 1,\n",
" '▁Hugging': 1,\n",
" '▁Face': 1,\n",
" '▁Course.': 1,\n",
" '▁chapter': 1,\n",
" '▁about': 1,\n",
" '▁tokenization.': 1,\n",
" '▁section': 1,\n",
" '▁shows': 1,\n",
" '▁several': 1,\n",
" '▁tokenizer': 1,\n",
" '▁algorithms.': 1,\n",
" '▁Hopefully,': 1,\n",
" '▁you': 1,\n",
" '▁will': 1,\n",
" '▁be': 1,\n",
" '▁able': 1,\n",
" '▁to': 1,\n",
" '▁understand': 1,\n",
" '▁how': 1,\n",
" '▁they': 1,\n",
" '▁are': 1,\n",
" '▁trained': 1,\n",
" '▁and': 1,\n",
" '▁generate': 1,\n",
" '▁tokens.': 1})"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from collections import defaultdict\n",
"\n",
"word_freqs = defaultdict(int)\n",
"for text in corpus:\n",
" words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n",
" new_words = [word for word, offset in words_with_offsets]\n",
" for word in new_words:\n",
" word_freqs[word] += 1\n",
"\n",
"word_freqs"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "328b80c2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('▁t', 7),\n",
" ('is', 5),\n",
" ('er', 5),\n",
" ('▁a', 5),\n",
" ('▁to', 4),\n",
" ('to', 4),\n",
" ('en', 4),\n",
" ('▁T', 3),\n",
" ('▁Th', 3),\n",
" ('▁Thi', 3)]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"char_freqs = defaultdict(int)\n",
"subwords_freqs = defaultdict(int)\n",
"for word, freq in word_freqs.items():\n",
" for i in range(len(word)):\n",
" char_freqs[word[i]] += freq\n",
" # Loop through the subwords of length at least 2\n",
" for j in range(i + 2, len(word) + 1):\n",
" subwords_freqs[word[i:j]] += freq\n",
"\n",
"# Sort subwords by frequency\n",
"sorted_subwords = sorted(subwords_freqs.items(), key=lambda x: x[1], reverse=True)\n",
"sorted_subwords[:10]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "d70bb8d1",
"metadata": {},
"outputs": [],
"source": [
"token_freqs = list(char_freqs.items()) + sorted_subwords[: 300 - len(char_freqs)]\n",
"token_freqs = {token: freq for token, freq in token_freqs}"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "ec931454",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'▁': 31,\n",
" 'T': 3,\n",
" 'h': 9,\n",
" 'i': 13,\n",
" 's': 13,\n",
" 't': 14,\n",
" 'e': 21,\n",
" 'H': 2,\n",
" 'u': 6,\n",
" 'g': 5,\n",
" 'n': 11,\n",
" 'F': 1,\n",
" 'a': 12,\n",
" 'c': 3,\n",
" 'C': 1,\n",
" 'o': 13,\n",
" 'r': 9,\n",
" '.': 4,\n",
" 'p': 2,\n",
" 'b': 3,\n",
" 'k': 3,\n",
" 'z': 2,\n",
" 'w': 3,\n",
" 'v': 1,\n",
" 'l': 7,\n",
" 'm': 1,\n",
" 'f': 1,\n",
" 'y': 3,\n",
" ',': 1,\n",
" 'd': 4,\n",
" '▁t': 7,\n",
" 'is': 5,\n",
" 'er': 5,\n",
" '▁a': 5,\n",
" '▁to': 4,\n",
" 'to': 4,\n",
" 'en': 4,\n",
" '▁T': 3,\n",
" '▁Th': 3,\n",
" '▁Thi': 3,\n",
" '▁This': 3,\n",
" 'Th': 3,\n",
" 'Thi': 3,\n",
" 'This': 3,\n",
" 'hi': 3,\n",
" 'his': 3,\n",
" 'th': 3,\n",
" 'ou': 3,\n",
" 'se': 3,\n",
" '▁tok': 3,\n",
" '▁toke': 3,\n",
" '▁token': 3,\n",
" 'tok': 3,\n",
" 'toke': 3,\n",
" 'token': 3,\n",
" 'ok': 3,\n",
" 'oke': 3,\n",
" 'oken': 3,\n",
" 'ke': 3,\n",
" 'ken': 3,\n",
" '▁s': 3,\n",
" 'ra': 3,\n",
" 'nd': 3,\n",
" '▁i': 2,\n",
" '▁is': 2,\n",
" '▁th': 2,\n",
" '▁the': 2,\n",
" 'the': 2,\n",
" 'he': 2,\n",
" '▁H': 2,\n",
" 'in': 2,\n",
" 'rs': 2,\n",
" 'te': 2,\n",
" '▁ab': 2,\n",
" 'ab': 2,\n",
" '▁tokeni': 2,\n",
" '▁tokeniz': 2,\n",
" 'tokeni': 2,\n",
" 'tokeniz': 2,\n",
" 'okeni': 2,\n",
" 'okeniz': 2,\n",
" 'keni': 2,\n",
" 'keniz': 2,\n",
" 'eni': 2,\n",
" 'eniz': 2,\n",
" 'ni': 2,\n",
" 'niz': 2,\n",
" 'iz': 2,\n",
" 'at': 2,\n",
" 'ti': 2,\n",
" 'tio': 2,\n",
" 'tion': 2,\n",
" 'io': 2,\n",
" 'ion': 2,\n",
" 'on': 2,\n",
" '▁se': 2,\n",
" 'ho': 2,\n",
" 'how': 2,\n",
" 'ow': 2,\n",
" 'era': 2,\n",
" 'al': 2,\n",
" 's.': 2,\n",
" 'll': 2,\n",
" 'an': 2,\n",
" 'and': 2,\n",
" 'ne': 2,\n",
" '▁Hu': 1,\n",
" '▁Hug': 1,\n",
" '▁Hugg': 1,\n",
" '▁Huggi': 1,\n",
" '▁Huggin': 1,\n",
" '▁Hugging': 1,\n",
" 'Hu': 1,\n",
" 'Hug': 1,\n",
" 'Hugg': 1,\n",
" 'Huggi': 1,\n",
" 'Huggin': 1,\n",
" 'Hugging': 1,\n",
" 'ug': 1,\n",
" 'ugg': 1,\n",
" 'uggi': 1,\n",
" 'uggin': 1,\n",
" 'ugging': 1,\n",
" 'gg': 1,\n",
" 'ggi': 1,\n",
" 'ggin': 1,\n",
" 'gging': 1,\n",
" 'gi': 1,\n",
" 'gin': 1,\n",
" 'ging': 1,\n",
" 'ing': 1,\n",
" 'ng': 1,\n",
" '▁F': 1,\n",
" '▁Fa': 1,\n",
" '▁Fac': 1,\n",
" '▁Face': 1,\n",
" 'Fa': 1,\n",
" 'Fac': 1,\n",
" 'Face': 1,\n",
" 'ac': 1,\n",
" 'ace': 1,\n",
" 'ce': 1,\n",
" '▁C': 1,\n",
" '▁Co': 1,\n",
" '▁Cou': 1,\n",
" '▁Cour': 1,\n",
" '▁Cours': 1,\n",
" '▁Course': 1,\n",
" '▁Course.': 1,\n",
" 'Co': 1,\n",
" 'Cou': 1,\n",
" 'Cour': 1,\n",
" 'Cours': 1,\n",
" 'Course': 1,\n",
" 'Course.': 1,\n",
" 'our': 1,\n",
" 'ours': 1,\n",
" 'ourse': 1,\n",
" 'ourse.': 1,\n",
" 'ur': 1,\n",
" 'urs': 1,\n",
" 'urse': 1,\n",
" 'urse.': 1,\n",
" 'rse': 1,\n",
" 'rse.': 1,\n",
" 'se.': 1,\n",
" 'e.': 1,\n",
" '▁c': 1,\n",
" '▁ch': 1,\n",
" '▁cha': 1,\n",
" '▁chap': 1,\n",
" '▁chapt': 1,\n",
" '▁chapte': 1,\n",
" '▁chapter': 1,\n",
" 'ch': 1,\n",
" 'cha': 1,\n",
" 'chap': 1,\n",
" 'chapt': 1,\n",
" 'chapte': 1,\n",
" 'chapter': 1,\n",
" 'ha': 1,\n",
" 'hap': 1,\n",
" 'hapt': 1,\n",
" 'hapte': 1,\n",
" 'hapter': 1,\n",
" 'ap': 1,\n",
" 'apt': 1,\n",
" 'apte': 1,\n",
" 'apter': 1,\n",
" 'pt': 1,\n",
" 'pte': 1,\n",
" 'pter': 1,\n",
" 'ter': 1,\n",
" '▁abo': 1,\n",
" '▁abou': 1,\n",
" '▁about': 1,\n",
" 'abo': 1,\n",
" 'abou': 1,\n",
" 'about': 1,\n",
" 'bo': 1,\n",
" 'bou': 1,\n",
" 'bout': 1,\n",
" 'out': 1,\n",
" 'ut': 1,\n",
" '▁tokeniza': 1,\n",
" '▁tokenizat': 1,\n",
" '▁tokenizati': 1,\n",
" '▁tokenizatio': 1,\n",
" '▁tokenization': 1,\n",
" '▁tokenization.': 1,\n",
" 'tokeniza': 1,\n",
" 'tokenizat': 1,\n",
" 'tokenizati': 1,\n",
" 'tokenizatio': 1,\n",
" 'tokenization': 1,\n",
" 'tokenization.': 1,\n",
" 'okeniza': 1,\n",
" 'okenizat': 1,\n",
" 'okenizati': 1,\n",
" 'okenizatio': 1,\n",
" 'okenization': 1,\n",
" 'okenization.': 1,\n",
" 'keniza': 1,\n",
" 'kenizat': 1,\n",
" 'kenizati': 1,\n",
" 'kenizatio': 1,\n",
" 'kenization': 1,\n",
" 'kenization.': 1,\n",
" 'eniza': 1,\n",
" 'enizat': 1,\n",
" 'enizati': 1,\n",
" 'enizatio': 1,\n",
" 'enization': 1,\n",
" 'enization.': 1,\n",
" 'niza': 1,\n",
" 'nizat': 1,\n",
" 'nizati': 1,\n",
" 'nizatio': 1,\n",
" 'nization': 1,\n",
" 'nization.': 1,\n",
" 'iza': 1,\n",
" 'izat': 1,\n",
" 'izati': 1,\n",
" 'izatio': 1,\n",
" 'ization': 1,\n",
" 'ization.': 1,\n",
" 'za': 1,\n",
" 'zat': 1,\n",
" 'zati': 1,\n",
" 'zatio': 1,\n",
" 'zation': 1,\n",
" 'zation.': 1,\n",
" 'ati': 1,\n",
" 'atio': 1,\n",
" 'ation': 1,\n",
" 'ation.': 1,\n",
" 'tion.': 1,\n",
" 'ion.': 1,\n",
" 'on.': 1,\n",
" 'n.': 1,\n",
" '▁sec': 1,\n",
" '▁sect': 1,\n",
" '▁secti': 1,\n",
" '▁sectio': 1,\n",
" '▁section': 1,\n",
" 'sec': 1,\n",
" 'sect': 1,\n",
" 'secti': 1,\n",
" 'sectio': 1,\n",
" 'section': 1,\n",
" 'ec': 1,\n",
" 'ect': 1,\n",
" 'ecti': 1,\n",
" 'ectio': 1,\n",
" 'ection': 1,\n",
" 'ct': 1,\n",
" 'cti': 1,\n",
" 'ctio': 1,\n",
" 'ction': 1,\n",
" '▁sh': 1,\n",
" '▁sho': 1,\n",
" '▁show': 1,\n",
" '▁shows': 1,\n",
" 'sh': 1,\n",
" 'sho': 1,\n",
" 'show': 1,\n",
" 'shows': 1,\n",
" 'hows': 1,\n",
" 'ows': 1,\n",
" 'ws': 1,\n",
" '▁sev': 1,\n",
" '▁seve': 1,\n",
" '▁sever': 1,\n",
" '▁severa': 1,\n",
" '▁several': 1,\n",
" 'sev': 1,\n",
" 'seve': 1,\n",
" 'sever': 1,\n",
" 'severa': 1,\n",
" 'several': 1}"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"token_freqs"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c8622b4f",
"metadata": {},
"outputs": [],
"source": [
"from math import log\n",
"\n",
"total_sum = sum([freq for token, freq in token_freqs.items()])\n",
"model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "65abfe44",
"metadata": {},
"outputs": [],
"source": [
"def encode_word(word, model):\n",
" best_segmentations = [{\"start\": 0, \"score\": 1}] + [\n",
" {\"start\": None, \"score\": None} for _ in range(len(word))\n",
" ]\n",
" for start_idx in range(len(word)):\n",
" # This should be properly filled by the previous steps of the loop\n",
" best_score_at_start = best_segmentations[start_idx][\"score\"]\n",
" for end_idx in range(start_idx + 1, len(word) + 1):\n",
" token = word[start_idx:end_idx]\n",
" if token in model and best_score_at_start is not None:\n",
" score = model[token] + best_score_at_start\n",
" # If we have found a better segmentation ending at end_idx, we update\n",
" if (\n",
" best_segmentations[end_idx][\"score\"] is None\n",
" or best_segmentations[end_idx][\"score\"] > score\n",
" ):\n",
" best_segmentations[end_idx] = {\"start\": start_idx, \"score\": score}\n",
"\n",
" segmentation = best_segmentations[-1]\n",
" if segmentation[\"score\"] is None:\n",
" # We did not find a tokenization of the word -> unknown\n",
" return [\"<unk>\"], None\n",
"\n",
" score = segmentation[\"score\"]\n",
" start = segmentation[\"start\"]\n",
" end = len(word)\n",
" tokens = []\n",
" while start != 0:\n",
" tokens.insert(0, word[start:end])\n",
" next_start = best_segmentations[start][\"start\"]\n",
" end = start\n",
" start = next_start\n",
" tokens.insert(0, word[start:end])\n",
" return tokens, score"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "b3f4d087",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(['H', 'o', 'p', 'e', 'f', 'u', 'll', 'y'], 41.5157494601402)\n",
"(['This'], 6.288267030694535)\n"
]
}
],
"source": [
"print(encode_word(\"Hopefully\", model))\n",
"print(encode_word(\"This\", model))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d0e6991d",
"metadata": {},
"outputs": [],
"source": [
"def compute_loss(model):\n",
" loss = 0\n",
" for word, freq in word_freqs.items():\n",
" _, word_loss = encode_word(word, model)\n",
" loss += freq * word_loss\n",
" return loss"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "8681ebbb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"413.10377642940875"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"compute_loss(model)\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "2cd27f4c",
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"\n",
"\n",
"def compute_scores(model):\n",
" scores = {}\n",
" model_loss = compute_loss(model)\n",
" for token, score in model.items():\n",
" # We always keep tokens of length 1\n",
" if len(token) == 1:\n",
" continue\n",
" model_without_token = copy.deepcopy(model)\n",
" _ = model_without_token.pop(token)\n",
" scores[token] = compute_loss(model_without_token) - model_loss\n",
" return scores"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "3a42bd3c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6.376412403623874\n",
"0.0\n"
]
}
],
"source": [
"scores = compute_scores(model)\n",
"print(scores[\"ll\"])\n",
"print(scores[\"his\"])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "53c09fbc",
"metadata": {},
"outputs": [],
"source": [
"percent_to_remove = 0.1\n",
"while len(model) > 100:\n",
" scores = compute_scores(model)\n",
" sorted_scores = sorted(scores.items(), key=lambda x: x[1])\n",
" # Remove percent_to_remove tokens with the lowest scores.\n",
" for i in range(int(len(model) * percent_to_remove)):\n",
" _ = token_freqs.pop(sorted_scores[i][0])\n",
"\n",
" total_sum = sum([freq for token, freq in token_freqs.items()])\n",
" model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "84633d43",
"metadata": {},
"outputs": [],
"source": [
"def tokenize(text, model):\n",
" words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n",
" pre_tokenized_text = [word for word, offset in words_with_offsets]\n",
" encoded_words = [encode_word(word, model)[0] for word in pre_tokenized_text]\n",
" return sum(encoded_words, [])\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "de95cf77",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['▁This',\n",
" '▁is',\n",
" '▁the',\n",
" '▁Hugging',\n",
" '▁Face',\n",
" '▁',\n",
" 'c',\n",
" 'ou',\n",
" 'r',\n",
" 's',\n",
" 'e',\n",
" '.']"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenize(\"This is the Hugging Face course.\", model)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "03270dbe",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}