tiktoken_conversion_huggingface.ipynb
{
"cells": [
{
"cell_type": "markdown",
"source": [
"### Install requirements"
],
"metadata": {
"id": "2krxXyYOEsAj"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qD4__VYRE9ep"
},
"outputs": [],
"source": [
"!pip install -q tiktoken transformers"
]
},
{
"cell_type": "markdown",
"source": [
"### Setup"
],
"metadata": {
"id": "OcCezqFbEvVN"
}
},
{
"cell_type": "code",
"source": [
"\n",
"# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb\n",
"MODEL_INFO = {\n",
"    # GPT-2 and GPT-3 models (r50k_base)\n",
"    'gpt2': {\n",
"        'tokenizer_class': 'GPT2Tokenizer',\n",
"        'model_max_length': 1024,\n",
"    },\n",
"    'davinci': {  # (gpt-3)\n",
"        'tokenizer_class': 'GPT3Tokenizer',\n",
"        'model_max_length': 2048,\n",
"    },\n",
"\n",
"    # GPT-3.5 and GPT-4 models (cl100k_base)\n",
"    'gpt-3.5-turbo': {\n",
"        'tokenizer_class': 'GPT3_5Tokenizer',\n",
"        'model_max_length': 4096,\n",
"    },\n",
"    'gpt-3.5-turbo-16k': {\n",
"        'tokenizer_class': 'GPT3_5Tokenizer',\n",
"        'model_max_length': 16384,\n",
"    },\n",
"    'gpt-4': {\n",
"        'tokenizer_class': 'GPT4Tokenizer',\n",
"        'model_max_length': 8192,\n",
"    },\n",
"    'text-embedding-ada-002': {\n",
"        'tokenizer_class': 'GPT4Tokenizer',\n",
"        'model_max_length': 8192,\n",
"    },\n",
"\n",
"    # Codex models (p50k_base)\n",
"    'text-davinci-002': {\n",
"        'tokenizer_class': 'CodexTokenizer',\n",
"        'model_max_length': 4096,\n",
"    },\n",
"    'text-davinci-003': {\n",
"        'tokenizer_class': 'CodexTokenizer',\n",
"        'model_max_length': 4096,\n",
"    },\n",
"}\n"
],
"metadata": {
"id": "UuNt2kwgFWbN"
},
"execution_count": null,
"outputs": []
},
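{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check: the cell below prints which tiktoken encoding backs each model in `MODEL_INFO`; the names should line up with the `r50k_base` / `p50k_base` / `cl100k_base` groupings noted in the comments above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check: print the tiktoken encoding behind each model in MODEL_INFO.\n",
"import tiktoken\n",
"\n",
"for name in MODEL_INFO:\n",
"    print(f'{name}: {tiktoken.encoding_for_model(name).name}')"
]
},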
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZypJVeIMFQGQ"
},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"\n",
"import tiktoken\n",
"from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode\n",
"from typing import Dict, Optional\n",
"\n",
"byte_encoder = bytes_to_unicode()\n",
"\n",
"def token_bytes_to_string(b):\n",
"    return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])\n",
"\n",
"# Adapted from https://github.com/openai/tiktoken/issues/60#issuecomment-1499977960\n",
"def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:\n",
"    parts = [bytes([b]) for b in token]\n",
"    while True:\n",
"        min_idx = None\n",
"        min_rank = None\n",
"        for i, pair in enumerate(zip(parts[:-1], parts[1:])):\n",
"            rank = mergeable_ranks.get(pair[0] + pair[1])\n",
"            if rank is not None and (min_rank is None or rank < min_rank):\n",
"                min_idx = i\n",
"                min_rank = rank\n",
"        if min_rank is None or (max_rank is not None and min_rank >= max_rank):\n",
"            break\n",
"        assert min_idx is not None\n",
"        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]\n",
"    return parts\n",
"\n",
"def generate_vocab_and_merges(encoder):\n",
"    mergeable_ranks = encoder._mergeable_ranks\n",
"\n",
"    merges = []\n",
"    vocab = {}\n",
"    for token, rank in mergeable_ranks.items():\n",
"        vocab[token_bytes_to_string(token)] = rank\n",
"\n",
"        if len(token) == 1:\n",
"            continue\n",
"        merged = tuple(bpe(mergeable_ranks, token, max_rank=rank))\n",
"        assert len(merged) == 2\n",
"\n",
"        merges.append(' '.join(map(token_bytes_to_string, merged)))\n",
"\n",
"    # Also add special tokens\n",
"    vocab.update(encoder._special_tokens)\n",
"\n",
"    return vocab, merges\n",
"\n",
"def convert_tiktoken(model_name, output_dir=None):\n",
"    if output_dir is None:\n",
"        output_dir = model_name\n",
"\n",
"    encoder = tiktoken.encoding_for_model(model_name)\n",
"\n",
"    vocab, merges = generate_vocab_and_merges(encoder)\n",
"\n",
"    added_tokens = [\n",
"        {\n",
"            \"id\": id,\n",
"            \"content\": content,\n",
"            \"single_word\": False,\n",
"            \"lstrip\": False,\n",
"            \"rstrip\": False,\n",
"            \"normalized\": False,\n",
"            \"special\": True,\n",
"        }\n",
"        for content, id in encoder._special_tokens.items()\n",
"    ]\n",
"\n",
"    # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer_config.json\n",
"    tokenizer_config_template = {\n",
"        \"add_prefix_space\": False,\n",
"        \"bos_token\": \"<|endoftext|>\",\n",
"        \"clean_up_tokenization_spaces\": False,\n",
"        \"eos_token\": \"<|endoftext|>\",\n",
"        \"unk_token\": \"<|endoftext|>\",\n",
"    }\n",
"    tokenizer_config_template.update(MODEL_INFO[model_name])  # Adds `model_max_length` and `tokenizer_class`\n",
"    tokenizer_config_template = dict(sorted(tokenizer_config_template.items(), key=lambda x: x[0]))\n",
"\n",
"    os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"    if MODEL_INFO[model_name]['tokenizer_class'] in ('GPT3_5Tokenizer', 'GPT4Tokenizer'):\n",
"        pre_tokenizer = {\n",
"            \"type\": \"Sequence\",\n",
"            \"pretokenizers\": [\n",
"                {\n",
"                    \"type\": \"Split\",\n",
"                    \"pattern\": {\n",
"                        \"Regex\": \"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\\\r\\\\n\\\\p{L}\\\\p{N}]?\\\\p{L}+|\\\\p{N}{1,3}| ?[^\\\\s\\\\p{L}\\\\p{N}]+[\\\\r\\\\n]*|\\\\s*[\\\\r\\\\n]+|\\\\s+(?!\\\\S)|\\\\s+\"\n",
"                    },\n",
"                    \"behavior\": \"Removed\",\n",
"                    \"invert\": True,\n",
"                },\n",
"                {\n",
"                    \"type\": \"ByteLevel\",\n",
"                    \"add_prefix_space\": False,\n",
"                    \"trim_offsets\": True,\n",
"                    \"use_regex\": False,\n",
"                }\n",
"            ]\n",
"        }\n",
"    else:\n",
"        pre_tokenizer = {\n",
"            \"type\": \"ByteLevel\",\n",
"            \"add_prefix_space\": False,\n",
"            \"trim_offsets\": True,\n",
"            \"use_regex\": True,\n",
"        }\n",
"\n",
"    # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer.json\n",
"    tokenizer_template = {\n",
"        \"version\": \"1.0\",\n",
"        \"truncation\": None,\n",
"        \"padding\": None,\n",
"        \"added_tokens\": added_tokens,\n",
"        \"normalizer\": None,\n",
"        \"pre_tokenizer\": pre_tokenizer,\n",
"        \"post_processor\": None,\n",
"        \"decoder\": {\n",
"            \"type\": \"ByteLevel\",\n",
"            \"add_prefix_space\": True,\n",
"            \"trim_offsets\": True,\n",
"            \"use_regex\": True,\n",
"        },\n",
"        \"model\": {\n",
"            \"type\": \"BPE\",\n",
"            \"dropout\": None,\n",
"            \"unk_token\": None,\n",
"            \"continuing_subword_prefix\": \"\",\n",
"            \"end_of_word_suffix\": \"\",\n",
"            \"fuse_unk\": False,\n",
"            \"byte_fallback\": False,\n",
"            \"vocab\": vocab,\n",
"            \"merges\": merges,\n",
"        },\n",
"    }\n",
"\n",
"\n",
"    # Save to files\n",
"    with open(os.path.join(output_dir, 'vocab.json'), 'w', encoding='utf-8') as fp:\n",
"        json.dump(vocab, fp, indent=2, ensure_ascii=False)\n",
"\n",
"    with open(os.path.join(output_dir, 'tokenizer.json'), 'w', encoding='utf-8') as fp:\n",
"        json.dump(tokenizer_template, fp, indent=2, ensure_ascii=False)\n",
"\n",
"    with open(os.path.join(output_dir, 'tokenizer_config.json'), 'w', encoding='utf-8') as fp:\n",
"        json.dump(tokenizer_config_template, fp, indent=2, ensure_ascii=False)\n",
"\n",
"    with open(os.path.join(output_dir, 'special_tokens_map.json'), 'w', encoding='utf-8') as fp:\n",
"        json.dump({\n",
"            \"bos_token\": \"<|endoftext|>\",\n",
"            \"eos_token\": \"<|endoftext|>\",\n",
"            \"unk_token\": \"<|endoftext|>\",\n",
"        }, fp, indent=2, ensure_ascii=False)\n",
"\n",
"    with open(os.path.join(output_dir, 'merges.txt'), 'w', encoding='utf-8') as fp:\n",
"        fp.write('#version: 0.2\\n')\n",
"        fp.write('\\n'.join(merges))"
]
},
{
"cell_type": "markdown",
"source": [
"### Run conversion"
],
"metadata": {
"id": "wfuFCZRbFMT_"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "O87Zz6Vzhb5C"
},
"outputs": [],
"source": [
"output = 'models'\n",
"for model_name in MODEL_INFO:\n",
"    convert_tiktoken(model_name, os.path.join(output, model_name))"
]
},
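{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, peek at what the conversion wrote out (this assumes the loop above has completed; `gpt2` is used as an example). Each model directory should contain `vocab.json`, `merges.txt`, `tokenizer.json`, `tokenizer_config.json` and `special_tokens_map.json`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List the files written for one converted model (assumes the conversion loop above has run).\n",
"import os\n",
"\n",
"print(sorted(os.listdir(os.path.join('models', 'gpt2'))))"
]
},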
{
"cell_type": "markdown",
"metadata": {
"id": "qx6tfE_UwFNB"
},
"source": [
"### Validation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "oSRUBMLmwatB"
},
"outputs": [],
"source": [
"# Tests adapted from https://github.com/openai/tiktoken/blob/1b9faf2779855124f05174adf1383e53689ed94b/tests/test_encoding.py\n",
"TESTS = [\n",
" \"\\n\\n\\n\\n\\ns1232\", \"hello world\", \"hello <|endoftext|>\", \"hello world\", \"hello <|endoftext|>\", \"0\", \"00\", \"000\", \"0000\", \"00000\", \"000000\", \"0000000\", \"00000000\", \"000000000\", \"0000000000\", \"00000000000\", \"000000000000\", \"0000000000000\", \"00000000000000\", \"000000000000000\", \"0000000000000000\", \"00000000000000000\", \"rer\", \"'rer\", \"today\\n \", \"today\\n \\n\", \"today\\n \\n\", \"hello world\", \"hello world\", \"hello world\", \" \\x850\", \"\", \"👍\", \" .\",\n",
"]"
]
},
{
"cell_type": "code",
"source": [
"from transformers import GPT2TokenizerFast, logging\n",
"\n",
"# Hide warning messages\n",
"logging.set_verbosity_error()\n",
"\n",
"output = 'models'\n",
"for model_name in MODEL_INFO:\n",
"    print('Testing', model_name)\n",
"    og_tokenizer = tiktoken.encoding_for_model(model_name)\n",
"    hf_tokenizer = GPT2TokenizerFast.from_pretrained(os.path.join(output, model_name))\n",
"\n",
"    for test in TESTS:\n",
"        # Test encoding\n",
"        og_tokens = og_tokenizer.encode(test, allowed_special={'<|endoftext|>'})\n",
"        hf_tokens = hf_tokenizer.encode(test)\n",
"        assert og_tokens == hf_tokens, f'ENCODE FAIL: \"{test}\". {og_tokens} != {hf_tokens}'\n",
"\n",
"        # Test decoding\n",
"        og_decoded = og_tokenizer.decode(og_tokens)\n",
"        hf_decoded = hf_tokenizer.decode(hf_tokens)\n",
"        assert og_decoded == hf_decoded, f'DECODE FAIL: \"{og_tokens}\". {og_decoded} != {hf_decoded}'\n"
],
"metadata": {
"id": "ELyGSJM0-yA4"
},
"execution_count": null,
"outputs": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}