Hugging Face BERT tokenizer from scratch
# Hugging Face Tokenizers 0.9 - pip install tokenizers==0.9
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

# Create a Tokenizer with an empty WordPiece model
bert_tokenizer = Tokenizer(WordPiece())

# Our tokenizer should start by applying NFD, Lowercase and StripAccents normalization
bert_tokenizer.normalizer = normalizers.Sequence(
    [NFD(), Lowercase(), StripAccents()])
bert_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Then we prepare our training parameters
files = [
    f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
trainer = WordPieceTrainer(
    vocab_size=30522,
    # We set the special tokens we will use later
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)
bert_tokenizer.train(trainer, files)

# Finally we set up a template to apply the relevant special tokens
bert_tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    # The ids match the order of the trainer's special_tokens list:
    # [UNK]=0, [CLS]=1, [SEP]=2
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
    ],
)

# And we save our tokenizer: first the model files, then the full tokenizer config
model_files = bert_tokenizer.model.save("data", "bert-wiki")
bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")
bert_tokenizer.save("data/bert-wiki.json")

# And voilà
bert_tokenizer = Tokenizer.from_file("data/bert-wiki.json")
print(bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.").tokens)
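
As a quick check (not part of the original gist, just a sketch), encoding a sentence pair exercises the pair template defined above; the returned Encoding exposes both tokens and type_ids:

# Sketch: verify the [CLS]/[SEP] pair template and the segment type ids
pair_output = bert_tokenizer.encode("This is one sentence.", "And this is a second one.")
print(pair_output.tokens)    # [CLS] ... [SEP] ... [SEP]
print(pair_output.type_ids)  # 0 for [CLS] and the first segment, 1 for the second segment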