tiktoken_conversion_huggingface.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "### Install requirements"
   ],
   "metadata": {
    "id": "2krxXyYOEsAj"
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "qD4__VYRE9ep"
   },
   "outputs": [],
   "source": [
    "!pip install -q tiktoken transformers"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Setup"
   ],
   "metadata": {
    "id": "OcCezqFbEvVN"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "\n",
    "# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb\n",
    "MODEL_INFO = {\n",
    "    # GPT-2 and GPT-3 models (r50k_base)\n",
    "    'gpt2': {\n",
    "        'tokenizer_class': 'GPT2Tokenizer',\n",
    "        'model_max_length': 1024,\n",
    "    },\n",
    "    'davinci': { # (gpt-3)\n",
    "        'tokenizer_class': 'GPT3Tokenizer',\n",
    "        'model_max_length': 2048,\n",
    "    },\n",
    "\n",
    "    # GPT-3.5 and GPT-4 models (cl100k_base)\n",
    "    'gpt-3.5-turbo': {\n",
    "        'tokenizer_class': 'GPT3_5Tokenizer',\n",
    "        'model_max_length': 4096,\n",
    "    },\n",
    "    'gpt-3.5-turbo-16k': {\n",
    "        'tokenizer_class': 'GPT3_5Tokenizer',\n",
    "        'model_max_length': 16384,\n",
    "    },\n",
    "    'gpt-4': {\n",
    "        'tokenizer_class': 'GPT4Tokenizer',\n",
    "        'model_max_length': 8192,\n",
    "    },\n",
    "    'text-embedding-ada-002': {\n",
    "        'tokenizer_class': 'GPT4Tokenizer',\n",
    "        'model_max_length': 8192,\n",
    "    },\n",
    "\n",
    "    # Codex models (p50k_base)\n",
    "    'text-davinci-002': {\n",
    "        'tokenizer_class': 'CodexTokenizer',\n",
    "        'model_max_length': 4096,\n",
    "    },\n",
    "    'text-davinci-003': {\n",
    "        'tokenizer_class': 'CodexTokenizer',\n",
    "        'model_max_length': 4096,\n",
    "    },\n",
    "}\n"
   ],
   "metadata": {
    "id": "UuNt2kwgFWbN"
   },
   "execution_count": null,
   "outputs": []
  },
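  {
   "cell_type": "markdown",
   "source": [
    "Optional check (added note, not part of the original gist): `tiktoken.encoding_for_model` maps each entry in `MODEL_INFO` to the tiktoken encoding that actually gets converted below (e.g. `cl100k_base` for the GPT-3.5/GPT-4 family)."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sanity check (not in the original gist): show which tiktoken encoding\n",
    "# backs each model listed in MODEL_INFO.\n",
    "import tiktoken\n",
    "\n",
    "for name in MODEL_INFO:\n",
    "    print(name, '->', tiktoken.encoding_for_model(name).name)"
   ]
  },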
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ZypJVeIMFQGQ"
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "import tiktoken\n",
    "from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode\n",
    "from typing import Dict, Optional\n",
    "\n",
    "byte_encoder = bytes_to_unicode()\n",
    "\n",
    "# Map raw token bytes to the printable characters used in GPT-2-style\n",
    "# Hugging Face vocab.json / merges.txt files.\n",
    "def token_bytes_to_string(b):\n",
    "    return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])\n",
    "\n",
    "# Adapted from https://github.com/openai/tiktoken/issues/60#issuecomment-1499977960\n",
    "def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:\n",
    "    # Greedily apply BPE merges (lowest rank first) to the token's bytes,\n",
    "    # skipping any merge whose rank is >= max_rank.\n",
    "    parts = [bytes([b]) for b in token]\n",
    "    while True:\n",
    "        min_idx = None\n",
    "        min_rank = None\n",
    "        for i, pair in enumerate(zip(parts[:-1], parts[1:])):\n",
    "            rank = mergeable_ranks.get(pair[0] + pair[1])\n",
    "            if rank is not None and (min_rank is None or rank < min_rank):\n",
    "                min_idx = i\n",
    "                min_rank = rank\n",
    "        if min_rank is None or (max_rank is not None and min_rank >= max_rank):\n",
    "            break\n",
    "        assert min_idx is not None\n",
    "        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]\n",
    "    return parts\n",
    "\n",
    "def generate_vocab_and_merges(encoder):\n",
    "    mergeable_ranks = encoder._mergeable_ranks\n",
    "\n",
    "    merges = []\n",
    "    vocab = {}\n",
    "    for token, rank in mergeable_ranks.items():\n",
    "        vocab[token_bytes_to_string(token)] = rank\n",
    "\n",
    "        if len(token) == 1:\n",
    "            continue\n",
    "        # Running BPE with max_rank=rank stops just before this token's own merge,\n",
    "        # leaving the two halves that form its merge rule.\n",
    "        merged = tuple(bpe(mergeable_ranks, token, max_rank=rank))\n",
    "        assert len(merged) == 2\n",
    "\n",
    "        merges.append(' '.join(map(token_bytes_to_string, merged)))\n",
    "\n",
    "    # Also add special tokens\n",
    "    vocab.update(encoder._special_tokens)\n",
    "\n",
    "    return vocab, merges\n",
    "\n",
    "def convert_tiktoken(model_name, output_dir=None):\n",
    "    if output_dir is None:\n",
    "        output_dir = model_name\n",
    "\n",
    "    encoder = tiktoken.encoding_for_model(model_name)\n",
    "\n",
    "    vocab, merges = generate_vocab_and_merges(encoder)\n",
    "\n",
    "    added_tokens = [\n",
    "        {\n",
    "            \"id\": id,\n",
    "            \"content\": content,\n",
    "            \"single_word\": False,\n",
    "            \"lstrip\": False,\n",
    "            \"rstrip\": False,\n",
    "            \"normalized\": False,\n",
    "            \"special\": True,\n",
    "        }\n",
    "        for content, id in encoder._special_tokens.items()\n",
    "    ]\n",
    "\n",
    "    # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer_config.json\n",
    "    tokenizer_config_template = {\n",
    "        \"add_prefix_space\": False,\n",
    "        \"bos_token\": \"<|endoftext|>\",\n",
    "        \"clean_up_tokenization_spaces\": False,\n",
    "        \"eos_token\": \"<|endoftext|>\",\n",
    "        \"unk_token\": \"<|endoftext|>\",\n",
    "    }\n",
    "    tokenizer_config_template.update(MODEL_INFO[model_name]) # Adds `model_max_length` and `tokenizer_class`\n",
    "    tokenizer_config_template = dict(sorted(tokenizer_config_template.items(), key=lambda x: x[0]))\n",
    "\n",
    "    os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "    # cl100k_base models (GPT-3.5/GPT-4) need their own split regex; the older\n",
    "    # r50k/p50k models can use the default GPT-2 byte-level pre-tokenizer.\n",
    "    if MODEL_INFO[model_name]['tokenizer_class'] in ('GPT3_5Tokenizer', 'GPT4Tokenizer'):\n",
    "        pre_tokenizer = {\n",
    "            \"type\": \"Sequence\",\n",
    "            \"pretokenizers\": [\n",
    "                {\n",
    "                    \"type\": \"Split\",\n",
    "                    \"pattern\": {\n",
    "                        \"Regex\": \"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\\\r\\\\n\\\\p{L}\\\\p{N}]?\\\\p{L}+|\\\\p{N}{1,3}| ?[^\\\\s\\\\p{L}\\\\p{N}]+[\\\\r\\\\n]*|\\\\s*[\\\\r\\\\n]+|\\\\s+(?!\\\\S)|\\\\s+\"\n",
    "                    },\n",
    "                    \"behavior\": \"Removed\",\n",
    "                    \"invert\": True,\n",
    "                },\n",
    "                {\n",
    "                    \"type\": \"ByteLevel\",\n",
    "                    \"add_prefix_space\": False,\n",
    "                    \"trim_offsets\": True,\n",
    "                    \"use_regex\": False,\n",
    "                }\n",
    "            ]\n",
    "        }\n",
    "    else:\n",
    "        pre_tokenizer = {\n",
    "            \"type\": \"ByteLevel\",\n",
    "            \"add_prefix_space\": False,\n",
    "            \"trim_offsets\": True,\n",
    "            \"use_regex\": True,\n",
    "        }\n",
    "\n",
    "    # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer.json\n",
    "    tokenizer_template = {\n",
    "        \"version\": \"1.0\",\n",
    "        \"truncation\": None,\n",
    "        \"padding\": None,\n",
    "        \"added_tokens\": added_tokens,\n",
    "        \"normalizer\": None,\n",
    "        \"pre_tokenizer\": pre_tokenizer,\n",
    "        \"post_processor\": None,\n",
    "        \"decoder\": {\n",
    "            \"type\": \"ByteLevel\",\n",
    "            \"add_prefix_space\": True,\n",
    "            \"trim_offsets\": True,\n",
    "            \"use_regex\": True,\n",
    "        },\n",
    "        \"model\": {\n",
    "            \"type\": \"BPE\",\n",
    "            \"dropout\": None,\n",
    "            \"unk_token\": None,\n",
    "            \"continuing_subword_prefix\": \"\",\n",
    "            \"end_of_word_suffix\": \"\",\n",
    "            \"fuse_unk\": False,\n",
    "            \"byte_fallback\": False,\n",
    "            \"vocab\": vocab,\n",
    "            \"merges\": merges,\n",
    "        },\n",
    "    }\n",
    "\n",
    "\n",
    "    # Save to files\n",
    "    with open(os.path.join(output_dir, 'vocab.json'), 'w', encoding='utf-8') as fp:\n",
    "        json.dump(vocab, fp, indent=2, ensure_ascii=False)\n",
    "\n",
    "    with open(os.path.join(output_dir, 'tokenizer.json'), 'w', encoding='utf-8') as fp:\n",
    "        json.dump(tokenizer_template, fp, indent=2, ensure_ascii=False)\n",
    "\n",
    "    with open(os.path.join(output_dir, 'tokenizer_config.json'), 'w', encoding='utf-8') as fp:\n",
    "        json.dump(tokenizer_config_template, fp, indent=2, ensure_ascii=False)\n",
    "\n",
    "    with open(os.path.join(output_dir, 'special_tokens_map.json'), 'w', encoding='utf-8') as fp:\n",
    "        json.dump({\n",
    "            \"bos_token\": \"<|endoftext|>\",\n",
    "            \"eos_token\": \"<|endoftext|>\",\n",
    "            \"unk_token\": \"<|endoftext|>\",\n",
    "        }, fp, indent=2, ensure_ascii=False)\n",
    "\n",
    "    with open(os.path.join(output_dir, 'merges.txt'), 'w', encoding='utf-8') as fp:\n",
    "        fp.write('#version: 0.2\\n')\n",
    "        fp.write('\\n'.join(merges))"
   ]
  },
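  {
   "cell_type": "markdown",
   "source": [
    "Added note (not part of the original gist): `generate_vocab_and_merges` recovers each BPE merge rule by re-running `bpe` on a token's bytes with `max_rank` set to the token's own rank, which stops one step early and leaves the two halves that merge into it. The optional cell below spot-checks this on a single `cl100k_base` token."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sanity check: recover the merge pair for one multi-byte cl100k_base token.\n",
    "enc = tiktoken.get_encoding('cl100k_base')\n",
    "token, rank = next((t, r) for t, r in enc._mergeable_ranks.items() if len(t) > 1)\n",
    "parts = bpe(enc._mergeable_ranks, token, max_rank=rank)\n",
    "assert len(parts) == 2 and b''.join(parts) == token\n",
    "print(parts, '->', token)"
   ]
  },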
  {
   "cell_type": "markdown",
   "source": [
    "### Run conversion"
   ],
   "metadata": {
    "id": "wfuFCZRbFMT_"
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "O87Zz6Vzhb5C"
   },
   "outputs": [],
   "source": [
    "output = 'models'\n",
    "for model_name in MODEL_INFO:\n",
    "    convert_tiktoken(model_name, os.path.join(output, model_name))"
   ]
  },
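  {
   "cell_type": "markdown",
   "source": [
    "Quick usage check (added example, assumes the conversion cell above has been run): each output folder loads directly with `transformers`."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added example: load one converted tokenizer and encode a sample string.\n",
    "from transformers import GPT2TokenizerFast\n",
    "\n",
    "tok = GPT2TokenizerFast.from_pretrained('models/gpt-4')\n",
    "print(tok('hello world')['input_ids'])"
   ]
  },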
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "qx6tfE_UwFNB"
   },
   "source": [
    "### Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "oSRUBMLmwatB"
   },
   "outputs": [],
   "source": [
    "# Tests adapted from https://github.com/openai/tiktoken/blob/1b9faf2779855124f05174adf1383e53689ed94b/tests/test_encoding.py\n",
    "TESTS = [\n",
    "    \"\\n\\n\\n\\n\\ns1232\", \"hello world\", \"hello <|endoftext|>\", \"hello world\", \"hello <|endoftext|>\", \"0\", \"00\", \"000\", \"0000\", \"00000\", \"000000\", \"0000000\", \"00000000\", \"000000000\", \"0000000000\", \"00000000000\", \"000000000000\", \"0000000000000\", \"00000000000000\", \"000000000000000\", \"0000000000000000\", \"00000000000000000\", \"rer\", \"'rer\", \"today\\n \", \"today\\n \\n\", \"today\\n \\n\", \"hello world\", \"hello world\", \"hello world\", \" \\x850\", \"\", \"👍\", \" .\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "from transformers import GPT2TokenizerFast, logging\n",
    "\n",
    "# Hide warning messages\n",
    "logging.set_verbosity_error()\n",
    "\n",
    "output = 'models'\n",
    "for model_name in MODEL_INFO:\n",
    "    print('Testing', model_name)\n",
    "    og_tokenizer = tiktoken.encoding_for_model(model_name)\n",
    "    hf_tokenizer = GPT2TokenizerFast.from_pretrained(os.path.join(output, model_name))\n",
    "\n",
    "    for test in TESTS:\n",
    "        # Test encoding\n",
    "        # tiktoken raises if the text contains special tokens that are not explicitly\n",
    "        # allowed, so permit <|endoftext|> to match the Hugging Face tokenizer.\n",
    "        og_tokens = og_tokenizer.encode(test, allowed_special={'<|endoftext|>'})\n",
    "        hf_tokens = hf_tokenizer.encode(test)\n",
    "        assert og_tokens == hf_tokens, f'ENCODE FAIL: \"{test}\". {og_tokens} != {hf_tokens}'\n",
    "\n",
    "        # Test decoding\n",
    "        og_decoded = og_tokenizer.decode(og_tokens)\n",
    "        hf_decoded = hf_tokenizer.decode(hf_tokens)\n",
    "        assert og_decoded == hf_decoded, f'DECODE FAIL: \"{og_tokens}\". {og_decoded} != {hf_decoded}'\n"
   ],
   "metadata": {
    "id": "ELyGSJM0-yA4"
   },
   "execution_count": null,
   "outputs": []
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}