# thomwolf/add_special_tokens.py

## add_special_tokens.py
# We will use 5 special tokens:
# - <bos> to indicate the start of the sequence
# - <eos> to indicate the end of the sequence
# - <speaker1> to indicate the beginning and the tokens of an utterance from the user
# - <speaker2> to indicate the beginning and the tokens of an utterance from the bot
# - <pad> as a padding token to build batches of sequences
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]

# We can add these special tokens to the vocabulary and the embeddings of the model.
# NOTE: `tokenizer` and `model` are not created in this snippet; both must already
# exist in scope when these lines run.
# The count handed to the model is computed from the list itself, so it always
# matches the number of tokens registered on the tokenizer.
tokenizer.set_special_tokens(SPECIAL_TOKENS)
model.set_num_special_tokens(len(SPECIAL_TOKENS))