Skip to content

Instantly share code, notes, and snippets.

@benathi
Last active October 1, 2023 02:21
Show Gist options
  • Save benathi/90fe8be8c939d0c2baf9412204bbd7a8 to your computer and use it in GitHub Desktop.
Save benathi/90fe8be8c939d0c2baf9412204bbd7a8 to your computer and use it in GitHub Desktop.
Tiktoken demo
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d9d06852-51f8-4891-b2b6-e5f9e7c9f5b0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting with individual bytes of the input (parts) = [b'h', b'e', b'l', b'l', b'o']\n",
"\u001b[48;5;167mh\u001b[48;5;179me\u001b[48;5;185ml\u001b[48;5;77ml\u001b[48;5;80mo\u001b[0m\n",
"Merging b'e' and b'l' since b'el' has the lowest rank (301) out of all pairs\n",
"\n",
"\u001b[48;5;167mh\u001b[48;5;179mel\u001b[48;5;77ml\u001b[48;5;80mo\u001b[0m\n",
"Merging b'l' and b'o' since b'lo' has the lowest rank (385) out of all pairs\n",
"\n",
"\u001b[48;5;167mh\u001b[48;5;179mel\u001b[48;5;77mlo\u001b[0m\n",
"Merging b'el' and b'lo' since b'ello' has the lowest rank (4896) out of all pairs\n",
"\n",
"\u001b[48;5;167mh\u001b[48;5;179mello\u001b[0m\n",
"Merging b'h' and b'ello' since b'hello' has the lowest rank (15339) out of all pairs\n",
"\n",
"\u001b[48;5;167mhello\u001b[0m\n",
"No pair is found in the vocabulary. BPE step finished for pre-token b'hello'!\n",
"\n",
"\n",
"\n",
"Starting with individual bytes of the input (parts) = [b' ', b'w', b'o', b'r', b'l', b'd', b'd', b'd', b'd', b'd']\n",
"\u001b[48;5;167m \u001b[48;5;179mw\u001b[48;5;185mo\u001b[48;5;77mr\u001b[48;5;80ml\u001b[48;5;68md\u001b[48;5;134md\u001b[48;5;167md\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
"Merging b'o' and b'r' since b'or' has the lowest rank (269) out of all pairs\n",
"\n",
"\u001b[48;5;167m \u001b[48;5;179mw\u001b[48;5;185mor\u001b[48;5;80ml\u001b[48;5;68md\u001b[48;5;134md\u001b[48;5;167md\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
"Merging b' ' and b'w' since b' w' has the lowest rank (289) out of all pairs\n",
"\n",
"\u001b[48;5;167m w\u001b[48;5;185mor\u001b[48;5;80ml\u001b[48;5;68md\u001b[48;5;134md\u001b[48;5;167md\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
"Merging b'l' and b'd' since b'ld' has the lowest rank (509) out of all pairs\n",
"\n",
"\u001b[48;5;167m w\u001b[48;5;185mor\u001b[48;5;80mld\u001b[48;5;134md\u001b[48;5;167md\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
"Merging b'd' and b'd' since b'dd' has the lowest rank (634) out of all pairs\n",
"\n",
"\u001b[48;5;167m w\u001b[48;5;185mor\u001b[48;5;80mld\u001b[48;5;134mdd\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
"Merging b'd' and b'd' since b'dd' has the lowest rank (634) out of all pairs\n",
"\n",
"\u001b[48;5;167m w\u001b[48;5;185mor\u001b[48;5;80mld\u001b[48;5;134mdd\u001b[48;5;179mdd\u001b[0m\n",
"Merging b'or' and b'ld' since b'orld' has the lowest rank (1410) out of all pairs\n",
"\n",
"\u001b[48;5;167m w\u001b[48;5;185morld\u001b[48;5;134mdd\u001b[48;5;179mdd\u001b[0m\n",
"Merging b' w' and b'orld' since b' world' has the lowest rank (1917) out of all pairs\n",
"\n",
"\u001b[48;5;167m world\u001b[48;5;134mdd\u001b[48;5;179mdd\u001b[0m\n",
"Merging b'dd' and b'dd' since b'dddd' has the lowest rank (65200) out of all pairs\n",
"\n",
"\u001b[48;5;167m world\u001b[48;5;134mdddd\u001b[0m\n",
"No pair is found in the vocabulary. BPE step finished for pre-token b' worlddddd'!\n",
"\n",
"\n",
"\n"
]
},
{
"data": {
"text/plain": [
"['hello', ' world', 'dddd']"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Import only the class this notebook uses instead of a wildcard import.\n",
"from tiktoken._educational import SimpleBytePairEncoding\n",
"\n",
"enc = SimpleBytePairEncoding.from_tiktoken(\"cl100k_base\")\n",
"# Decode each token id on its own so the result lists the individual token strings.\n",
"[enc.decode([x]) for x in enc.encode(\"hello worlddddd\")]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "77246607-5ec4-4c7f-ac47-e670dda21fbf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting with individual bytes of the input (parts) = [b'\\xf0', b'\\x9f', b'\\x90', b'\\xb1']\n",
"\u001b[48;5;167m�\u001b[48;5;179m�\u001b[48;5;185m�\u001b[48;5;77m�\u001b[0m\n",
"Merging b'\\xf0' and b'\\x9f' since b'\\xf0\\x9f' has the lowest rank (9468) out of all pairs\n",
"\n",
"\u001b[48;5;167m�\u001b[48;5;179m�\u001b[48;5;185m�\u001b[0m\n",
"No pair is found in the vocabulary. BPE step finished for pre-token b'\\xf0\\x9f\\x90\\xb1'!\n",
"\n",
"\n",
"\n",
"[9468, 238, 109]\n",
"🐱\n"
]
}
],
"source": [
"# Encode one emoji, print its token ids, then decode them back to text.\n",
"emoji_token_ids = enc.encode(\"🐱\")\n",
"print(emoji_token_ids)\n",
"print(enc.decode(emoji_token_ids))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "883e3267-d19e-4bdd-ba2d-fe060a1c0826",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting with individual bytes of the input (parts) = [b'\\xe3', b'\\x82', b'\\xab']\n",
"\u001b[48;5;167m�\u001b[48;5;179m�\u001b[48;5;185m�\u001b[0m\n",
"Merging b'\\xe3' and b'\\x82' since b'\\xe3\\x82' has the lowest rank (3484) out of all pairs\n",
"\n",
"\u001b[48;5;167m�\u001b[48;5;179m�\u001b[0m\n",
"Merging b'\\xe3\\x82' and b'\\xab' since b'\\xe3\\x82\\xab' has the lowest rank (71493) out of all pairs\n",
"\n",
"\u001b[48;5;167mカ\u001b[0m\n",
"No pair is found in the vocabulary. BPE step finished for pre-token b'\\xe3\\x82\\xab'!\n",
"\n",
"\n",
"\n",
"[71493]\n",
"カ\n"
]
}
],
"source": [
"# Encode one katakana character, print its token ids, then decode them back to text.\n",
"kana_token_ids = enc.encode(\"カ\")\n",
"print(kana_token_ids)\n",
"print(enc.decode(kana_token_ids))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b93cb459-4234-43c3-ac13-490f9f624555",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment