BPE Encoding Tutorial from Hugging Face
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "4d0db2a2",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-08-30 10:13:27.900353: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4424ba5c3b5442f2932858d518ded7a5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/665 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a380316b33674902878f918db676cbbf",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/1.04M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "906fe17299b147da946166486d4f6f63",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/456k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "601bc6160bdd447882ec2769dafa6606",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/1.36M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")"
]
},
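{
"cell_type": "markdown",
"id": "0e1a2b3c",
"metadata": {},
"source": [
"We only borrow the GPT-2 tokenizer for its byte-level pre-tokenizer; the BPE merges themselves are trained from scratch below. As a quick sanity check, the pretrained tokenizer can already segment text with its own, much larger, learned vocabulary:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1f2a3b4c",
"metadata": {},
"outputs": [],
"source": [
"# Sanity check (not part of the training algorithm): the pretrained GPT-2\n",
"# tokenizer applies its own byte-level BPE; run it to see tokens like 'Ġis'.\n",
"tokenizer.tokenize(\"This is the Hugging Face Course.\")"
]
},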
{
"cell_type": "code",
"execution_count": 3,
"id": "7d429e99",
"metadata": {},
"outputs": [],
"source": [
"corpus = [\n",
" \"This is the Hugging Face Course.\",\n",
" \"This chapter is about tokenization.\",\n",
" \"This section shows several tokenizer algorithbms.\",\n",
" \"Hopefully, you will be able to understand how they are trained and generate tokens.\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "862da2c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Gunnar', (0, 6)),\n",
" ('Ġ', (6, 7)),\n",
" ('Ġeats', (7, 12)),\n",
" ('Ċ', (12, 13)),\n",
" ('cake', (13, 17)),\n",
" ('.', (17, 18))]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Gunnar eats\\ncake.\")"
]
},
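{
"cell_type": "markdown",
"id": "2a3b4c5d",
"metadata": {},
"source": [
"In the output above, `Ġ` stands for a space and `Ċ` for a newline: the byte-level pre-tokenizer maps every raw byte to a printable Unicode character, and each tuple is the character offset span in the original string. For bytes 0 through 32, which include the newline and the space, the standard GPT-2 mapping happens to coincide with a shift of 256 code points; the sketch below illustrates this (it is an aside about the mapping, not part of the training code):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b4c5d6e",
"metadata": {},
"outputs": [],
"source": [
"# For bytes 0-32, GPT-2's byte-to-unicode table coincides with chr(byte + 256).\n",
"print(chr(ord(\" \") + 256))   # 'Ġ', the space marker\n",
"print(chr(ord(\"\\n\") + 256))  # 'Ċ', the newline marker"
]
},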
{
"cell_type": "code",
"execution_count": 4,
"id": "be1a4896",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"defaultdict(<class 'int'>, {'This': 3, 'Ġis': 2, 'Ġthe': 1, 'ĠHugging': 1, 'ĠFace': 1, 'ĠCourse': 1, '.': 4, 'Ġchapter': 1, 'Ġabout': 1, 'Ġtokenization': 1, 'Ġsection': 1, 'Ġshows': 1, 'Ġseveral': 1, 'Ġtokenizer': 1, 'Ġalgorithms': 1, 'Hopefully': 1, ',': 1, 'Ġyou': 1, 'Ġwill': 1, 'Ġbe': 1, 'Ġable': 1, 'Ġto': 1, 'Ġunderstand': 1, 'Ġhow': 1, 'Ġthey': 1, 'Ġare': 1, 'Ġtrained': 1, 'Ġand': 1, 'Ġgenerate': 1, 'Ġtokens': 1})\n"
]
}
],
"source": [
"from collections import defaultdict\n",
"\n",
"word_freqs = defaultdict(int)\n",
"\n",
"for text in corpus:\n",
" words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n",
" new_words = [word for word, offset in words_with_offsets]\n",
" for word in new_words:\n",
" word_freqs[word] += 1\n",
"\n",
"print(word_freqs)"
]
},
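{
"cell_type": "markdown",
"id": "4c5d6e7f",
"metadata": {},
"source": [
"The same counts can be collected more compactly with `collections.Counter`; this is just an equivalent sketch of the loop above, not a different algorithm:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d6e7f80",
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"\n",
"# Equivalent word counting in one expression (same pre-tokenizer, same counts).\n",
"word_freqs_check = Counter(\n",
"    word\n",
"    for text in corpus\n",
"    for word, _ in tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n",
")\n",
"assert dict(word_freqs_check) == dict(word_freqs)"
]
},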
{
"cell_type": "code",
"execution_count": 5,
"id": "263b8335",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']\n"
]
}
],
"source": [
"alphabet = []\n",
"\n",
"for word in word_freqs.keys():\n",
" for letter in word:\n",
" if letter not in alphabet:\n",
" alphabet.append(letter)\n",
"alphabet.sort()\n",
"\n",
"print(alphabet)"
]
},
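{
"cell_type": "markdown",
"id": "6e7f8091",
"metadata": {},
"source": [
"The same alphabet can be built in one line with a set comprehension; the assertion below just confirms the two approaches agree:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7f8091a2",
"metadata": {},
"outputs": [],
"source": [
"# Equivalent: collect the unique characters, then sort them.\n",
"assert alphabet == sorted({c for word in word_freqs for c in word})"
]
},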
{
"cell_type": "code",
"execution_count": 26,
"id": "39678c7c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"31"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab = [\"<|endoftext|>\"] + alphabet.copy()\n",
"len(vocab)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "16c8ecf2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'This': ['T', 'h', 'i', 's'],\n",
" 'Ġis': ['Ġ', 'i', 's'],\n",
" 'Ġthe': ['Ġ', 't', 'h', 'e'],\n",
" 'ĠHugging': ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'],\n",
" 'ĠFace': ['Ġ', 'F', 'a', 'c', 'e'],\n",
" 'ĠCourse': ['Ġ', 'C', 'o', 'u', 'r', 's', 'e'],\n",
" '.': ['.'],\n",
" 'Ġchapter': ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'],\n",
" 'Ġabout': ['Ġ', 'a', 'b', 'o', 'u', 't'],\n",
" 'Ġtokenization': ['Ġ',\n",
" 't',\n",
" 'o',\n",
" 'k',\n",
" 'e',\n",
" 'n',\n",
" 'i',\n",
" 'z',\n",
" 'a',\n",
" 't',\n",
" 'i',\n",
" 'o',\n",
" 'n'],\n",
" 'Ġsection': ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'],\n",
" 'Ġshows': ['Ġ', 's', 'h', 'o', 'w', 's'],\n",
" 'Ġseveral': ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'],\n",
" 'Ġtokenizer': ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'],\n",
" 'Ġalgorithms': ['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'],\n",
" 'Hopefully': ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'],\n",
" ',': [','],\n",
" 'Ġyou': ['Ġ', 'y', 'o', 'u'],\n",
" 'Ġwill': ['Ġ', 'w', 'i', 'l', 'l'],\n",
" 'Ġbe': ['Ġ', 'b', 'e'],\n",
" 'Ġable': ['Ġ', 'a', 'b', 'l', 'e'],\n",
" 'Ġto': ['Ġ', 't', 'o'],\n",
" 'Ġunderstand': ['Ġ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd'],\n",
" 'Ġhow': ['Ġ', 'h', 'o', 'w'],\n",
" 'Ġthey': ['Ġ', 't', 'h', 'e', 'y'],\n",
" 'Ġare': ['Ġ', 'a', 'r', 'e'],\n",
" 'Ġtrained': ['Ġ', 't', 'r', 'a', 'i', 'n', 'e', 'd'],\n",
" 'Ġand': ['Ġ', 'a', 'n', 'd'],\n",
" 'Ġgenerate': ['Ġ', 'g', 'e', 'n', 'e', 'r', 'a', 't', 'e'],\n",
" 'Ġtokens': ['Ġ', 't', 'o', 'k', 'e', 'n', 's']}"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"splits = {word: [c for c in word] for word in word_freqs.keys()}\n",
"splits"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "8bc3aaaf",
"metadata": {},
"outputs": [],
"source": [
"def compute_pair_freqs(splits):\n",
" pair_freqs = defaultdict(int)\n",
" for word, freq in word_freqs.items():\n",
" split = splits[word]\n",
" if len(split) == 1:\n",
" continue\n",
" for i in range(len(split) - 1):\n",
" pair = (split[i], split[i + 1])\n",
" pair_freqs[pair] += freq\n",
" return pair_freqs"
]
},
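{
"cell_type": "markdown",
"id": "8091a2b3",
"metadata": {},
"source": [
"A quick hand-check before using the function: `'This'` occurs three times in the corpus and is the only word containing adjacent `'T'` and `'h'`, so the pair `('T', 'h')` should be counted exactly 3 times, matching the printout in the next cell:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "91a2b3c4",
"metadata": {},
"outputs": [],
"source": [
"# 'This' appears 3 times, so each pair inside its split contributes 3.\n",
"assert compute_pair_freqs(splits)[(\"T\", \"h\")] == 3"
]
},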
{
"cell_type": "code",
"execution_count": 24,
"id": "bf4729a6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('T', 'h'): 3\n",
"('h', 'i'): 3\n",
"('i', 's'): 5\n",
"('Ġ', 'i'): 2\n",
"('Ġ', 't'): 7\n",
"('t', 'h'): 3\n"
]
}
],
"source": [
"pair_freqs = compute_pair_freqs(splits)\n",
"\n",
"for i, key in enumerate(pair_freqs.keys()):\n",
" print(f\"{key}: {pair_freqs[key]}\")\n",
" if i >= 5:\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "b728166b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('Ġ', 't') 7\n"
]
}
],
"source": [
"best_pair = \"\"\n",
"max_freq = None\n",
"\n",
"for pair, freq in pair_freqs.items():\n",
" if max_freq is None or max_freq < freq:\n",
" best_pair = pair\n",
" max_freq = freq\n",
"\n",
"print(best_pair, max_freq)"
]
},
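{
"cell_type": "markdown",
"id": "a2b3c4d5",
"metadata": {},
"source": [
"The same argmax can be written in one line with `max` and a key function; ties resolve to the first pair in iteration order in both versions:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3c4d5e6",
"metadata": {},
"outputs": [],
"source": [
"# One-line equivalent of the loop above (same tie-breaking: first maximum wins).\n",
"best_pair_check = max(pair_freqs, key=pair_freqs.get)\n",
"assert best_pair_check == best_pair"
]
},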
{
"cell_type": "code",
"execution_count": 11,
"id": "1a5c67f7",
"metadata": {},
"outputs": [],
"source": [
"def merge_pair(a, b, splits):\n",
" for word in word_freqs:\n",
" split = splits[word]\n",
" if len(split) == 1:\n",
" continue\n",
"\n",
" i = 0\n",
" while i < len(split) - 1:\n",
" if split[i] == a and split[i + 1] == b:\n",
" split = split[:i] + [a + b] + split[i + 2 :]\n",
" else:\n",
" i += 1\n",
" splits[word] = split\n",
" return splits"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a246f6a5",
"metadata": {},
"outputs": [],
"source": [
"merges = {(\"Ġ\", \"t\"): \"Ġt\"}\n",
"vocab.append(\"Ġt\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "622788eb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']\n"
]
}
],
"source": [
"splits = merge_pair(\"Ġ\", \"t\", splits)\n",
"print(splits[\"Ġtrained\"])"
]
},
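{
"cell_type": "markdown",
"id": "c4d5e6f7",
"metadata": {},
"source": [
"After this first merge, no split should contain `'Ġ'` immediately followed by `'t'` any more; a small assertion makes that explicit:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5e6f708",
"metadata": {},
"outputs": [],
"source": [
"# Every ('Ġ', 't') occurrence has been replaced by the merged symbol 'Ġt'.\n",
"assert not any(\n",
"    split[i] == \"Ġ\" and split[i + 1] == \"t\"\n",
"    for split in splits.values()\n",
"    for i in range(len(split) - 1)\n",
")"
]
},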
{
"cell_type": "code",
"execution_count": 14,
"id": "6d857449",
"metadata": {},
"outputs": [],
"source": [
"vocab_size = 50\n",
"\n",
"while len(vocab) < vocab_size:\n",
" pair_freqs = compute_pair_freqs(splits)\n",
" best_pair = \"\"\n",
" max_freq = None\n",
" for pair, freq in pair_freqs.items():\n",
" if max_freq is None or max_freq < freq:\n",
" best_pair = pair\n",
" max_freq = freq\n",
" splits = merge_pair(*best_pair, splits)\n",
" merges[best_pair] = best_pair[0] + best_pair[1]\n",
" vocab.append(best_pair[0] + best_pair[1])"
]
},
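{
"cell_type": "markdown",
"id": "e6f70819",
"metadata": {},
"source": [
"The vocabulary started from 31 symbols (the `<|endoftext|>` special token plus the 30-character alphabet), so reaching the target size of 50 means learning 50 - 31 = 19 merge rules in total, including the `('Ġ', 't')` merge done by hand above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f708192a",
"metadata": {},
"outputs": [],
"source": [
"# 31 initial symbols + 19 learned merge rules = the target vocabulary size.\n",
"print(len(vocab), len(merges))\n",
"assert len(vocab) == vocab_size and len(merges) == vocab_size - 31"
]
},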
{
"cell_type": "code",
"execution_count": 15,
"id": "5bc2dcfd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{('Ġ', 't'): 'Ġt',\n",
" ('i', 's'): 'is',\n",
" ('e', 'r'): 'er',\n",
" ('Ġ', 'a'): 'Ġa',\n",
" ('Ġt', 'o'): 'Ġto',\n",
" ('e', 'n'): 'en',\n",
" ('T', 'h'): 'Th',\n",
" ('Th', 'is'): 'This',\n",
" ('o', 'u'): 'ou',\n",
" ('s', 'e'): 'se',\n",
" ('Ġto', 'k'): 'Ġtok',\n",
" ('Ġtok', 'en'): 'Ġtoken',\n",
" ('n', 'd'): 'nd',\n",
" ('Ġ', 'is'): 'Ġis',\n",
" ('Ġt', 'h'): 'Ġth',\n",
" ('Ġth', 'e'): 'Ġthe',\n",
" ('i', 'n'): 'in',\n",
" ('Ġa', 'b'): 'Ġab',\n",
" ('Ġtoken', 'i'): 'Ġtokeni'}"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merges"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "44b48140",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['<|endoftext|>',\n",
" ',',\n",
" '.',\n",
" 'C',\n",
" 'F',\n",
" 'H',\n",
" 'T',\n",
" 'a',\n",
" 'b',\n",
" 'c',\n",
" 'd',\n",
" 'e',\n",
" 'f',\n",
" 'g',\n",
" 'h',\n",
" 'i',\n",
" 'k',\n",
" 'l',\n",
" 'm',\n",
" 'n',\n",
" 'o',\n",
" 'p',\n",
" 'r',\n",
" 's',\n",
" 't',\n",
" 'u',\n",
" 'v',\n",
" 'w',\n",
" 'y',\n",
" 'z',\n",
" 'Ġ',\n",
" 'Ġt',\n",
" 'is',\n",
" 'er',\n",
" 'Ġa',\n",
" 'Ġto',\n",
" 'en',\n",
" 'Th',\n",
" 'This',\n",
" 'ou',\n",
" 'se',\n",
" 'Ġtok',\n",
" 'Ġtoken',\n",
" 'nd',\n",
" 'Ġis',\n",
" 'Ġth',\n",
" 'Ġthe',\n",
" 'in',\n",
" 'Ġab',\n",
" 'Ġtokeni']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "cf629b1a",
"metadata": {},
"outputs": [],
"source": [
"def tokenize(text):\n",
" pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)\n",
" pre_tokenized_text = [word for word, offset in pre_tokenize_result]\n",
" splits = [[l for l in word] for word in pre_tokenized_text]\n",
" for pair, merge in merges.items():\n",
" for idx, split in enumerate(splits):\n",
" i = 0\n",
" while i < len(split) - 1:\n",
" if split[i] == pair[0] and split[i + 1] == pair[1]:\n",
" split = split[:i] + [merge] + split[i + 2 :]\n",
" else:\n",
" i += 1\n",
" splits[idx] = split\n",
"\n",
" return sum(splits, [])"
]
},
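{
"cell_type": "markdown",
"id": "08192a3b",
"metadata": {},
"source": [
"`tokenize` replays the merge rules in exactly the order they were learned (Python dicts preserve insertion order), and a character the corpus never contained simply stays as a single-character token; byte-level BPE has no unknown-token failure mode at this stage:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "192a3b4c",
"metadata": {},
"outputs": [],
"source": [
"# 'x' never appears in the training corpus, and none of the learned merges\n",
"# apply inside 'extra', so every character stays un-merged.\n",
"assert tokenize(\"extra\") == [\"e\", \"x\", \"t\", \"r\", \"a\"]"
]
},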
{
"cell_type": "code",
"execution_count": 18,
"id": "e3bea0aa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['This', 'Ġis', 'Ġ', 'n', 'o', 't', 'Ġa', 'Ġtoken', '.']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenize(\"This is not a token.\")\n"
]
},
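{
"cell_type": "markdown",
"id": "2a3b4c5e",
"metadata": {},
"source": [
"For comparison, the pretrained GPT-2 tokenizer, trained on a vastly larger corpus, has far more merge rules and is expected to segment the same sentence into fewer, longer tokens:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b4c5e6f",
"metadata": {},
"outputs": [],
"source": [
"# The pretrained tokenizer has many more merges than our toy vocabulary of 50.\n",
"tokenizer.tokenize(\"This is not a token.\")"
]
}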
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}