@a7v8x
Created May 5, 2020 18:33
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
max_length_test = 20
test_sentence = 'Test tokenization sentence. Followed by another sentence'
# add special tokens
test_sentence_with_special_tokens = '[CLS]' + test_sentence + '[SEP]'
tokenized = tokenizer.tokenize(test_sentence_with_special_tokens)
print('tokenized', tokenized)
# convert tokens to their ids in the WordPiece vocabulary
input_ids = tokenizer.convert_tokens_to_ids(tokenized)
# precompute the pad length so we can reuse it below
padding_length = max_length_test - len(input_ids)
# attention should focus only on the real (non-padded) tokens,
# so build the mask before padding is appended
attention_mask = [1] * len(input_ids)
# pad sequences shorter than our max length with the [PAD] token id (0 for bert-base-uncased)
input_ids = input_ids + ([0] * padding_length)
# mask out the padded positions so attention ignores them
attention_mask = attention_mask + ([0] * padding_length)
# token type ids distinguish the two segments of a pair (needed e.g. for question answering);
# with a single sequence they are all 0
token_type_ids = [0] * max_length_test
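# sanity check (added for illustration): after padding, all three sequences
# must share the same fixed length
assert len(input_ids) == len(attention_mask) == len(token_type_ids) == max_length_test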
bert_input = {
    "input_ids": input_ids,
    "token_type_ids": token_type_ids,
    "attention_mask": attention_mask
}
print(bert_input)
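
# For comparison: the tokenizer can perform all of the steps above (special
# tokens, WordPiece ids, padding, attention mask, token type ids) in a single
# call. A minimal sketch, assuming a transformers version (v3+) where
# encode_plus accepts padding='max_length':
encoded = tokenizer.encode_plus(
    test_sentence,
    add_special_tokens=True,       # adds [CLS] and [SEP] for us
    max_length=max_length_test,
    padding='max_length',          # pad with [PAD] (id 0) up to max_length
    truncation=True,               # cut off anything longer than max_length
    return_attention_mask=True,
    return_token_type_ids=True,
)
print(encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask'])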