@jamescalam
Last active March 2, 2023 16:54
```python
import torch

labels = torch.tensor([x.ids for x in batch])
mask = torch.tensor([x.attention_mask for x in batch])
```
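The `batch` object is built earlier in the notebook and is not defined in this excerpt. The attribute access (`x.ids`, `x.attention_mask`) matches the `Encoding` objects returned by the Hugging Face `tokenizers` library, so something along these lines was presumably used to produce it. A minimal sketch, where the tokenizer class, vocab path, and data file are assumptions:

```python
from tokenizers import BertWordPieceTokenizer

# hypothetical path to a WordPiece vocab trained earlier, saved with the special
# tokens ordered so that [PAD]=0, [CLS]=1, [SEP]=2, [MASK]=3
tokenizer = BertWordPieceTokenizer('./tokenizer/vocab.txt')
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding(length=512)  # defaults to pad_id=0, pad_token='[PAD]'

# hypothetical corpus file, one sample per line
with open('./data/text.txt', encoding='utf-8') as fp:
    lines = fp.read().split('\n')

# encode_batch returns a list of Encoding objects exposing .ids and .attention_mask
batch = tokenizer.encode_batch(lines)
```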
```python
# make copy of labels tensor, this will be input_ids
input_ids = labels.detach().clone()
# create random array of floats with equal dims to input_ids
rand = torch.rand(input_ids.shape)
# mask random 15% where token is not 0 [PAD], 1 [CLS], or 2 [SEP]
mask_arr = (rand < .15) * (input_ids != 0) * (input_ids != 1) * (input_ids != 2)
# loop through each row in input_ids tensor (cannot do in parallel)
for i in range(input_ids.shape[0]):
    # get indices of mask positions from mask array
    selection = torch.flatten(mask_arr[i].nonzero()).tolist()
    # mask input_ids
    input_ids[i, selection] = 3  # our custom [MASK] token == 3
```
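For reference, boolean indexing offers a vectorized alternative to the per-row loop, since `mask_arr` is a boolean tensor. A minimal sketch, continuing from the cell above:

```python
# boolean indexing assigns the [MASK] id (3) to every selected position at once
input_ids_vectorized = labels.detach().clone()
input_ids_vectorized[mask_arr] = 3

# both approaches select the same positions, so the results should match
assert torch.equal(input_ids, input_ids_vectorized)
```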
We have 10000 tokenized sequences, each containing 512 tokens.
```python
input_ids.shape
```

```
torch.Size([10000, 512])
```
We can see the special tokens here: `1` is our **\[CLS\]** token, `2` our **\[SEP\]** token, `3` our **\[MASK\]** token, and at the end we have two `0` - or **\[PAD\]** - tokens.
```python
input_ids[0][:200]
```

```
tensor([ 1, 693, 18623, 1358, 7752, 3, 1056, 280, 3, 6321,
        776, 3, 2145, 280, 11, 10205, 3778, 1266, 3, 1197,
        3, 1142, 10293, 30, 552, 3, 1340, 16, 385, 3,
        458, 9777, 5942, 376, 25475, 2870, 1201, 391, 2691, 421,
        17927, 16996, 739, 3, 3, 22814, 376, 7950, 17824, 980,
        435, 18388, 1475, 3, 3, 391, 37, 24909, 739, 2689,
        27869, 275, 5803, 625, 770, 13459, 483, 4779, 275, 12870,
        532, 18, 680, 3867, 24138, 376, 7752, 17630, 18623, 1134,
        8882, 269, 431, 287, 12450, 3, 8041, 6056, 275, 5286,
        18, 11755, 3, 275, 6161, 317, 10528, 3, 3, 13181,
        18, 458, 3, 372, 456, 2150, 12054, 16, 3, 317,
        6122, 5324, 3329, 570, 1594, 13181, 280, 14634, 18, 763,
        3, 6323, 2484, 6544, 5085, 469, 9106, 18, 680, 3,
        842, 1518, 25737, 3653, 303, 3300, 306, 3063, 292, 3,
        18, 381, 330, 2872, 343, 4722, 3, 16, 16848, 267,
        5216, 317, 1009, 842, 1518, 16, 3, 338, 330, 2757,
        435, 3653, 27081, 10965, 12, 39, 13, 3, 1865, 17,
        5580, 1056, 992, 363, 3, 360, 94, 1182, 589, 1729,
        3, 3, 351, 12863, 300, 3, 5240, 3, 3, 10799,
        480, 2261, 3, 421, 14591, 3, 18, 2, 0, 0])
```
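As a quick sanity check (not part of the original notebook), the proportion of non-special tokens replaced with the `[MASK]` id should come out close to the 15% target:

```python
# positions holding real tokens, i.e. not [PAD] (0), [CLS] (1) or [SEP] (2)
real_tokens = (labels != 0) & (labels != 1) & (labels != 2)
# id 3 is the [MASK] token, which should not occur in the unmasked labels,
# so counting 3s in input_ids counts the masked positions
masked = (input_ids == 3)
print(masked.sum().item() / real_tokens.sum().item())  # roughly 0.15
```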
noreun commented Mar 2, 2023

It seems that the API has changed and now `RobertaTokenizer` is returning a dict:

```python
print(len(batch))
print(len(batch['input_ids']))
print(len(batch['attention_mask']))
```

```
2
10000
10000
```
I updated this:

```python
labels = torch.tensor(batch['input_ids'])
mask = torch.tensor(batch['attention_mask'])
```
and it runs, but the predictions are garbage; it is even predicting a mask token. The special tokens also seem to have changed values:

```python
i = 0
print(batch['input_ids'][0])
```

[0, 692, 18622, 1357, 7751, 292, 1055, 280, 7404, 6320, 775, 725, 2144, 280, 11, 10204, 3777, 1265, 1809, 1196, 603, 1141, 10292, 30, 551, 267, 1339, 16, 385, 3374, 458, 9776, 5941, 376, 25474, 2869, 1200, 391, 2690, 421, 17926, 16995, 738, 305, 306, 22813, 376, 7949, 17823, 979, 435, 18387, 1474, 275, 2596, 391, 37, 24908, 738, 2688, 27868, 275, 5802, 624, 769, 13458, 483, 4778, 275, 12869, 532, 18, 679, 3866, 24137, 376, 7751, 17629, 18622, 1133, 8881, 269, 431, 287, 12449, 483, 8040, 6055, 275, 5285, 18, 11754, 367, 275, 6160, 317, 10527, 569, 1593, 13180, 18, 458, 16, 372, 456, 2149, 12053, 16, 500, 317, 6121, 5323, 3328, 569, 1593, 13180, 280, 14633, 18, 762, 12655, 6322, 2483, 6543, 5084, 469, 9105, 18, 679, 1008, 841, 1517, 25736, 3652, 303, 3299, 306, 3062, 292, 15163, 18, 381, 330, 2871, 343, 4721, 316, 16, 16847, 267, 5215, 317, 1008, 841, 1517, 16, 4815, 338, 330, 2756, 435, 3652, 27080, 10964, 12, 39, 13, 18714, 1864, 17, 5579, 1055, 991, 363, 18, 360, 94, 1181, 588, 1728, 841, 343, 351, 12862, 300, 841, 5239, 16, 5617, 10798, 480, 2260, 3606, 421, 14590, 16995, 18, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

I wonder if something else also changed and is impacting the tokenization, and consequently the learning...
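If the data is re-tokenized with `transformers`' `RobertaTokenizer` rather than the `tokenizers` object the notebook originally assumed, the special token IDs can be read off the tokenizer itself before hard-coding them into the masking logic. A sketch, where `./tokenizer` is a placeholder for wherever the tokenizer files were saved:

```python
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('./tokenizer')  # placeholder path

# RoBERTa-style vocabularies typically use <s>=0, <pad>=1, </s>=2 and a separate
# <mask> id, so the hard-coded 0/1/2/3 values above would need updating
print(tokenizer.cls_token_id, tokenizer.sep_token_id,
      tokenizer.pad_token_id, tokenizer.mask_token_id)
print(tokenizer.special_tokens_map)
```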
