Skip to content

Instantly share code, notes, and snippets.

@kusal1990
Created June 25, 2022 06:31
Show Gist options
  • Save kusal1990/793eeb06cc6094815e766987f0182900 to your computer and use it in GitHub Desktop.
Save kusal1990/793eeb06cc6094815e766987f0182900 to your computer and use it in GitHub Desktop.
from transformers import BertTokenizer, TFBertForMultipleChoice
# Module-level tokenizer built from the 'bert-base-uncased' checkpoint with
# lowercasing switched on; suitable for passing to arc_preprocessor.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)
def arc_preprocessor(dataset, tokenizer, *, max_len=None):
    """Tokenize every (context, question + option) pair of a dataset.

    For each row, and for each option ``j`` of that row, the text pair

        (context, "<question> <j> <option>")

    is passed to ``tokenizer``; runs of whitespace in the second element
    are collapsed to a single space first. With a BERT-style tokenizer this
    yields one sequence per option of the form:

        [CLS] context [SEP] question j option [SEP]

    The tokenizer is asked to pad to ``max_len`` and to truncate only the
    first sequence (the context), so the question/option text is kept.

    Args:
        dataset: A pandas DataFrame (or any mapping of column name to
            equal-length sequences) with the columns ``'context'``,
            ``'only_question'`` and ``'options_list'``. Each entry of
            ``'options_list'`` is a sequence of option strings.
        tokenizer: Callable invoked as ``tokenizer(context, text,
            max_length=..., truncation='only_first', padding='max_length')``
            and returning a mapping with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'token_type_ids'``.
        max_len: Sequence length to pad/truncate to. When omitted, the
            module-level ``MAX_LEN`` constant is used.

    Returns:
        A tuple ``(all_input_ids, all_attention_mask, all_token_type_ids)``.
        Each element is a list with one int32 array per dataset row, of
        shape ``[num_choices, max_len]``. No labels are produced.

    Raises:
        ValueError: If the tokenizer returns a feature whose length differs
            from ``max_len`` (e.g. the question/option text alone is too
            long for context-only truncation to fit).
    """
    # Imported locally so the function does not depend on module-level
    # imports of `re` / `numpy` being present.
    import re

    import numpy as np

    if max_len is None:
        # Backward-compatible default: the module-level constant.
        max_len = MAX_LEN

    whitespace_run = re.compile(r'\s+')
    feature_keys = ('input_ids', 'attention_mask', 'token_type_ids')

    all_input_ids = []
    all_attention_mask = []
    all_token_type_ids = []

    rows = zip(dataset['context'],
               dataset['only_question'],
               dataset['options_list'])
    for context, question, options in rows:
        # One list of per-option sequences for each feature of this row.
        per_choice = {key: [] for key in feature_keys}

        for index, option in enumerate(options):
            # The option's position is embedded in the text, between the
            # question and the option itself.
            ques_opt = ' '.join((question, str(index), option))
            ques_opt = whitespace_run.sub(' ', ques_opt)

            encoded = tokenizer(context,
                                ques_opt,
                                max_length=max_len,
                                truncation='only_first',
                                padding='max_length')

            for key in feature_keys:
                values = encoded[key]
                # Explicit check (not `assert`) so it survives `python -O`.
                if len(values) != max_len:
                    raise ValueError(
                        f'{key} has length {len(values)}, '
                        f'expected {max_len}'
                    )
                per_choice[key].append(values)

        all_input_ids.append(
            np.asarray(per_choice['input_ids'], dtype='int32'))
        all_attention_mask.append(
            np.asarray(per_choice['attention_mask'], dtype='int32'))
        all_token_type_ids.append(
            np.asarray(per_choice['token_type_ids'], dtype='int32'))

    return all_input_ids, all_attention_mask, all_token_type_ids
@kusal1990
Copy link
Author

ok

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment