# kusal1990/bert_model_ tokenizer.py

## bert_model_ tokenizer.py
import re

import numpy as np
from transformers import BertTokenizer, TFBertForMultipleChoice
# Module-level tokenizer built from the 'bert-base-uncased' pretrained
# checkpoint with do_lower_case=True. This runs at import time.
# arc_preprocessor takes its tokenizer as a parameter, so it does not read
# this global directly; callers pass it in explicitly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def arc_preprocessor(dataset, tokenizer, max_len=None):
    '''
    Tokenize a multiple-choice dataset into per-question feature arrays.

    Every row is expanded into one (context, question + option) sequence pair
    per answer option. With a BERT-style tokenizer the pairs look like:

    [CLS] context [SEP] question 0 choices[0] [SEP]
    [CLS] context [SEP] question 1 choices[1] [SEP]
    [CLS] context [SEP] question 2 choices[2] [SEP]
    [CLS] context [SEP] question 3 choices[3] [SEP]

    The zero-based option index is inserted between the question and the
    option text, and every run of whitespace in that second segment is
    collapsed to a single space before tokenization.

    Parameters:
        dataset: table indexable by column name (e.g. a pandas DataFrame) with
            the columns 'context', 'only_question' and 'options_list'. Each
            'options_list' cell is a sequence of option strings.
        tokenizer: callable invoked as
            tokenizer(context, ques_opt, max_length=..., truncation='only_first',
            padding='max_length') that returns a mapping with the keys
            'input_ids', 'attention_mask' and 'token_type_ids'.
        max_len: length every tokenized sequence is padded/truncated to.
            When None (the default) the module-level MAX_LEN constant is used.

    Returns:
        A tuple (all_input_ids, all_attention_mask, all_token_type_ids) of
        three lists. Each list holds one int32 array per dataset row, of shape
        [num_choices, max_len]. Labels are NOT produced by this function.

    Raises:
        AssertionError: if the tokenizer returns a sequence whose length
            differs from max_len.
    '''
    if max_len is None:
        # Fall back to the module-level constant this function has always used.
        max_len = MAX_LEN

    # Compiled once instead of on every option.
    whitespace_run = re.compile(r'\s+')
    feature_keys = ('input_ids', 'attention_mask', 'token_type_ids')

    all_input_ids = []
    all_attention_mask = []
    all_token_type_ids = []

    rows = zip(dataset['context'], dataset['only_question'], dataset['options_list'])
    for context, question, options in rows:
        encoded_choices = []

        for j, option in enumerate(options):
            ques_opt = question + ' ' + str(j) + ' ' + option
            ques_opt = whitespace_run.sub(' ', ques_opt)

            # With Hugging Face tokenizers 'only_first' truncates only the
            # first segment (the context), so the question and option text
            # are kept intact.
            inputs = tokenizer(context,
                               ques_opt,
                               max_length=max_len,
                               truncation='only_first',
                               padding='max_length')

            # Sanity check: the arrays built below need rows of equal length.
            for key in feature_keys:
                assert len(inputs[key]) == max_len, (
                    f'{key} has length {len(inputs[key])}, expected {max_len}')

            encoded_choices.append(inputs)

        all_input_ids.append(
            np.asarray([enc['input_ids'] for enc in encoded_choices], dtype='int32'))
        all_attention_mask.append(
            np.asarray([enc['attention_mask'] for enc in encoded_choices], dtype='int32'))
        all_token_type_ids.append(
            np.asarray([enc['token_type_ids'] for enc in encoded_choices], dtype='int32'))

    return all_input_ids, all_attention_mask, all_token_type_ids
	from transformers import BertTokenizer, TFBertForMultipleChoice
	# NOTE(review): this repeats the tokenizer load at the top of the file
	# (same 'bert-base-uncased' checkpoint, do_lower_case=True); confirm
	# whether the duplicate is intended.
	tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

	def arc_preprocessor(dataset, tokenizer, max_len=None):
	    '''
	    Tokenize a multiple-choice dataset into per-question feature arrays.

	    Every row is expanded into one (context, question + option) sequence pair
	    per answer option. With a BERT-style tokenizer the pairs look like:

	    [CLS] context [SEP] question 0 choices[0] [SEP]
	    [CLS] context [SEP] question 1 choices[1] [SEP]
	    [CLS] context [SEP] question 2 choices[2] [SEP]
	    [CLS] context [SEP] question 3 choices[3] [SEP]

	    The zero-based option index is inserted between the question and the
	    option text, and every run of whitespace in that second segment is
	    collapsed to a single space before tokenization.

	    Parameters:
	        dataset: table indexable by column name (e.g. a pandas DataFrame) with
	            the columns 'context', 'only_question' and 'options_list'. Each
	            'options_list' cell is a sequence of option strings.
	        tokenizer: callable invoked as
	            tokenizer(context, ques_opt, max_length=..., truncation='only_first',
	            padding='max_length') that returns a mapping with the keys
	            'input_ids', 'attention_mask' and 'token_type_ids'.
	        max_len: length every tokenized sequence is padded/truncated to.
	            When None (the default) the module-level MAX_LEN constant is used.

	    Returns:
	        A tuple (all_input_ids, all_attention_mask, all_token_type_ids) of
	        three lists. Each list holds one int32 array per dataset row, of shape
	        [num_choices, max_len]. Labels are NOT produced by this function.

	    Raises:
	        AssertionError: if the tokenizer returns a sequence whose length
	            differs from max_len.
	    '''
	    if max_len is None:
	        # Fall back to the module-level constant this function has always used.
	        max_len = MAX_LEN

	    # Compiled once instead of on every option.
	    whitespace_run = re.compile(r'\s+')
	    feature_keys = ('input_ids', 'attention_mask', 'token_type_ids')

	    all_input_ids = []
	    all_attention_mask = []
	    all_token_type_ids = []

	    rows = zip(dataset['context'], dataset['only_question'], dataset['options_list'])
	    for context, question, options in rows:
	        encoded_choices = []

	        for j, option in enumerate(options):
	            ques_opt = question + ' ' + str(j) + ' ' + option
	            ques_opt = whitespace_run.sub(' ', ques_opt)

	            # With Hugging Face tokenizers 'only_first' truncates only the
	            # first segment (the context), so the question and option text
	            # are kept intact.
	            inputs = tokenizer(context,
	                               ques_opt,
	                               max_length=max_len,
	                               truncation='only_first',
	                               padding='max_length')

	            # Sanity check: the arrays built below need rows of equal length.
	            for key in feature_keys:
	                assert len(inputs[key]) == max_len, (
	                    f'{key} has length {len(inputs[key])}, expected {max_len}')

	            encoded_choices.append(inputs)

	        all_input_ids.append(
	            np.asarray([enc['input_ids'] for enc in encoded_choices], dtype='int32'))
	        all_attention_mask.append(
	            np.asarray([enc['attention_mask'] for enc in encoded_choices], dtype='int32'))
	        all_token_type_ids.append(
	            np.asarray([enc['token_type_ids'] for enc in encoded_choices], dtype='int32'))

	    return all_input_ids, all_attention_mask, all_token_type_ids