Last active
March 2, 2023 16:54
-
-
Save jamescalam/4c0939b436ed607a3a48b35999607b4c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.5" | |
}, | |
"orig_nbformat": 2, | |
"kernelspec": { | |
"name": "ml", | |
"display_name": "ML", | |
"language": "python" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2, | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# NOTE(review): `batch` is produced in an earlier cell (not shown here).\n", | |
"# The `.ids` / `.attention_mask` attribute access implies a list of\n", | |
"# tokenizers Encoding objects -- TODO confirm: newer tokenizer versions\n", | |
"# reportedly return a dict of lists (batch['input_ids']) instead.\n", | |
"import torch\n", | |
"\n", | |
"# token ids double as the MLM labels; mask marks real tokens vs padding\n", | |
"labels = torch.tensor([x.ids for x in batch])\n", | |
"mask = torch.tensor([x.attention_mask for x in batch])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# make copy of labels tensor, this will be input_ids\n", | |
"input_ids = labels.detach().clone()\n", | |
"# create random array of floats with equal dims to input_ids\n", | |
"rand = torch.rand(input_ids.shape)\n", | |
"# mask random 15% where token is not 0 [PAD], 1 [CLS], or 2 [SEP]\n", | |
"mask_arr = (rand < .15) * (input_ids != 0) * (input_ids != 1) * (input_ids != 2)\n", | |
"# loop through each row in input_ids tensor (cannot do in parallel)\n", | |
"for i in range(input_ids.shape[0]):\n", | |
" # get indices of mask positions from mask array\n", | |
" selection = torch.flatten(mask_arr[i].nonzero()).tolist()\n", | |
" # mask input_ids\n", | |
" input_ids[i, selection] = 3 # our custom [MASK] token == 3" | |
] | |
}, | |
{ | |
"source": [ | |
"We have 10000 tokenized sequences, each containing 512 tokens." | |
], | |
"cell_type": "markdown", | |
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"torch.Size([10000, 512])" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 8 | |
} | |
], | |
"source": [ | |
"input_ids.shape" | |
] | |
}, | |
{ | |
"source": [ | |
"We can see the special tokens here, `1` is our **\\[CLS\\]** token, `2` our **\\[SEP\\]** token, `3` our **\\[MASK\\]** token, and at the end we have two `0` - or **\\[PAD\\]** - tokens." | |
], | |
"cell_type": "markdown", | |
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"tensor([ 1, 693, 18623, 1358, 7752, 3, 1056, 280, 3, 6321,\n", | |
" 776, 3, 2145, 280, 11, 10205, 3778, 1266, 3, 1197,\n", | |
" 3, 1142, 10293, 30, 552, 3, 1340, 16, 385, 3,\n", | |
" 458, 9777, 5942, 376, 25475, 2870, 1201, 391, 2691, 421,\n", | |
" 17927, 16996, 739, 3, 3, 22814, 376, 7950, 17824, 980,\n", | |
" 435, 18388, 1475, 3, 3, 391, 37, 24909, 739, 2689,\n", | |
" 27869, 275, 5803, 625, 770, 13459, 483, 4779, 275, 12870,\n", | |
" 532, 18, 680, 3867, 24138, 376, 7752, 17630, 18623, 1134,\n", | |
" 8882, 269, 431, 287, 12450, 3, 8041, 6056, 275, 5286,\n", | |
" 18, 11755, 3, 275, 6161, 317, 10528, 3, 3, 13181,\n", | |
" 18, 458, 3, 372, 456, 2150, 12054, 16, 3, 317,\n", | |
" 6122, 5324, 3329, 570, 1594, 13181, 280, 14634, 18, 763,\n", | |
" 3, 6323, 2484, 6544, 5085, 469, 9106, 18, 680, 3,\n", | |
" 842, 1518, 25737, 3653, 303, 3300, 306, 3063, 292, 3,\n", | |
" 18, 381, 330, 2872, 343, 4722, 3, 16, 16848, 267,\n", | |
" 5216, 317, 1009, 842, 1518, 16, 3, 338, 330, 2757,\n", | |
" 435, 3653, 27081, 10965, 12, 39, 13, 3, 1865, 17,\n", | |
" 5580, 1056, 992, 363, 3, 360, 94, 1182, 589, 1729,\n", | |
" 3, 3, 351, 12863, 300, 3, 5240, 3, 3, 10799,\n", | |
" 480, 2261, 3, 421, 14591, 3, 18, 2, 0, 0])" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 9 | |
} | |
], | |
"source": [ | |
"input_ids[0][:200]" | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It seems that the API has changed and now RobertaTokenizer is returning a dict:
print(len(batch))
print(len(batch['input_ids']))
print(len(batch['attention_mask']))
2 10000 10000
I updated this:
labels = torch.tensor(batch['input_ids'])
mask = torch.tensor(batch['attention_mask'])
and it runs, but the predictions are garbage, it is even predicting a mask token. The special tokens seem to have changed values too:
i = 0
print(batch['input_ids'][0])
[0, 692, 18622, 1357, 7751, 292, 1055, 280, 7404, 6320, 775, 725, 2144, 280, 11, 10204, 3777, 1265, 1809, 1196, 603, 1141, 10292, 30, 551, 267, 1339, 16, 385, 3374, 458, 9776, 5941, 376, 25474, 2869, 1200, 391, 2690, 421, 17926, 16995, 738, 305, 306, 22813, 376, 7949, 17823, 979, 435, 18387, 1474, 275, 2596, 391, 37, 24908, 738, 2688, 27868, 275, 5802, 624, 769, 13458, 483, 4778, 275, 12869, 532, 18, 679, 3866, 24137, 376, 7751, 17629, 18622, 1133, 8881, 269, 431, 287, 12449, 483, 8040, 6055, 275, 5285, 18, 11754, 367, 275, 6160, 317, 10527, 569, 1593, 13180, 18, 458, 16, 372, 456, 2149, 12053, 16, 500, 317, 6121, 5323, 3328, 569, 1593, 13180, 280, 14633, 18, 762, 12655, 6322, 2483, 6543, 5084, 469, 9105, 18, 679, 1008, 841, 1517, 25736, 3652, 303, 3299, 306, 3062, 292, 15163, 18, 381, 330, 2871, 343, 4721, 316, 16, 16847, 267, 5215, 317, 1008, 841, 1517, 16, 4815, 338, 330, 2756, 435, 3652, 27080, 10964, 12, 39, 13, 18714, 1864, 17, 5579, 1055, 991, 363, 18, 360, 94, 1181, 588, 1728, 841, 343, 351, 12862, 300, 841, 5239, 16, 5617, 10798, 480, 2260, 3606, 421, 14590, 16995, 18, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
I wonder whether something else also changed that is affecting the tokenization — and, as a consequence, the learning...