Skip to content

Instantly share code, notes, and snippets.

@ebsmothers
Created April 4, 2024 20:19
Show Gist options
  • Save ebsmothers/54b133dd87db6679b14318545aaa2de4 to your computer and use it in GitHub Desktop.
Save ebsmothers/54b133dd87db6679b14318545aaa2de4 to your computer and use it in GitHub Desktop.
from tiktoken._educational import SimpleBytePairEncoding
from tiktoken.load import dump_tiktoken_bpe
# Train a small byte-pair-encoding vocabulary on a text corpus with tiktoken's
# educational trainer, then write the learned merge ranks to `out_path`.
#
# NOTE: `in_path` and `out_path` are not defined in this snippet; assign both
# before running it.

# copy-paste from torchtune/modules/tokenizers/_tiktoken.py
pattern = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""

# in_path corresponds to https://github.com/google/sentencepiece/blob/master/data/botchan.txt
# Decode explicitly as UTF-8 so the training text does not depend on the
# platform's default locale encoding.
with open(in_path, "r", encoding="utf-8") as f:
    data = f.read()

# Learn merges until the vocabulary reaches 2000 entries, splitting the text
# with the regex above.
enc = SimpleBytePairEncoding.train(data, vocab_size=2000, pat_str=pattern)

# Persist the trained merge table to out_path.
dump_tiktoken_bpe(enc.mergeable_ranks, out_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment