Last active
March 2, 2023 16:54
-
-
Save jamescalam/4c0939b436ed607a3a48b35999607b4c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.5" | |
}, | |
"orig_nbformat": 2, | |
"kernelspec": { | |
"name": "ml", | |
"display_name": "ML", | |
"language": "python" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2, | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# NOTE(review): `batch` is produced in an earlier cell (not shown here).\n", | |
"# The `.ids` / `.attention_mask` attribute access implies a list of\n", | |
"# tokenizers Encoding objects -- TODO confirm: newer tokenizer versions\n", | |
"# reportedly return a dict of lists (batch['input_ids']) instead.\n", | |
"import torch\n", | |
"\n", | |
"# token ids double as the MLM labels; mask marks real tokens vs padding\n", | |
"labels = torch.tensor([x.ids for x in batch])\n", | |
"mask = torch.tensor([x.attention_mask for x in batch])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# make copy of labels tensor, this will be input_ids\n", | |
"input_ids = labels.detach().clone()\n", | |
"# create random array of floats with equal dims to input_ids\n", | |
"rand = torch.rand(input_ids.shape)\n", | |
"# mask random 15% where token is not 0 [PAD], 1 [CLS], or 2 [SEP]\n", | |
"mask_arr = (rand < .15) * (input_ids != 0) * (input_ids != 1) * (input_ids != 2)\n", | |
"# loop through each row in input_ids tensor (cannot do in parallel)\n", | |
"for i in range(input_ids.shape[0]):\n", | |
" # get indices of mask positions from mask array\n", | |
" selection = torch.flatten(mask_arr[i].nonzero()).tolist()\n", | |
" # mask input_ids\n", | |
" input_ids[i, selection] = 3 # our custom [MASK] token == 3" | |
] | |
}, | |
{ | |
"source": [ | |
"We have 10000 tokenized sequences, each containing 512 tokens." | |
], | |
"cell_type": "markdown", | |
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"torch.Size([10000, 512])" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 8 | |
} | |
], | |
"source": [ | |
"input_ids.shape" | |
] | |
}, | |
{ | |
"source": [ | |
"We can see the special tokens here, `1` is our **\\[CLS\\]** token, `2` our **\\[SEP\\]** token, `3` our **\\[MASK\\]** token, and at the end we have two `0` - or **\\[PAD\\]** - tokens." | |
], | |
"cell_type": "markdown", | |
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"tensor([ 1, 693, 18623, 1358, 7752, 3, 1056, 280, 3, 6321,\n", | |
" 776, 3, 2145, 280, 11, 10205, 3778, 1266, 3, 1197,\n", | |
" 3, 1142, 10293, 30, 552, 3, 1340, 16, 385, 3,\n", | |
" 458, 9777, 5942, 376, 25475, 2870, 1201, 391, 2691, 421,\n", | |
" 17927, 16996, 739, 3, 3, 22814, 376, 7950, 17824, 980,\n", | |
" 435, 18388, 1475, 3, 3, 391, 37, 24909, 739, 2689,\n", | |
" 27869, 275, 5803, 625, 770, 13459, 483, 4779, 275, 12870,\n", | |
" 532, 18, 680, 3867, 24138, 376, 7752, 17630, 18623, 1134,\n", | |
" 8882, 269, 431, 287, 12450, 3, 8041, 6056, 275, 5286,\n", | |
" 18, 11755, 3, 275, 6161, 317, 10528, 3, 3, 13181,\n", | |
" 18, 458, 3, 372, 456, 2150, 12054, 16, 3, 317,\n", | |
" 6122, 5324, 3329, 570, 1594, 13181, 280, 14634, 18, 763,\n", | |
" 3, 6323, 2484, 6544, 5085, 469, 9106, 18, 680, 3,\n", | |
" 842, 1518, 25737, 3653, 303, 3300, 306, 3063, 292, 3,\n", | |
" 18, 381, 330, 2872, 343, 4722, 3, 16, 16848, 267,\n", | |
" 5216, 317, 1009, 842, 1518, 16, 3, 338, 330, 2757,\n", | |
" 435, 3653, 27081, 10965, 12, 39, 13, 3, 1865, 17,\n", | |
" 5580, 1056, 992, 363, 3, 360, 94, 1182, 589, 1729,\n", | |
" 3, 3, 351, 12863, 300, 3, 5240, 3, 3, 10799,\n", | |
" 480, 2261, 3, 421, 14591, 3, 18, 2, 0, 0])" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 9 | |
} | |
], | |
"source": [ | |
"input_ids[0][:200]" | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It seems that the API has changed and now RobertaTokenizer is returning a dict:
print(len(batch))
print(len(batch['input_ids']))
print(len(batch['attention_mask']))
2 10000 10000
I updated this:
labels = torch.tensor(batch['input_ids'])
mask = torch.tensor(batch['attention_mask'])
and it runs, but the predictions are garbage, it is even predicting a mask token. The special tokens seem to have changed values too:
i = 0
print(batch['input_ids'][0])
[0, 692, 18622, 1357, 7751, 292, 1055, 280, 7404, 6320, 775, 725, 2144, 280, 11, 10204, 3777, 1265, 1809, 1196, 603, 1141, 10292, 30, 551, 267, 1339, 16, 385, 3374, 458, 9776, 5941, 376, 25474, 2869, 1200, 391, 2690, 421, 17926, 16995, 738, 305, 306, 22813, 376, 7949, 17823, 979, 435, 18387, 1474, 275, 2596, 391, 37, 24908, 738, 2688, 27868, 275, 5802, 624, 769, 13458, 483, 4778, 275, 12869, 532, 18, 679, 3866, 24137, 376, 7751, 17629, 18622, 1133, 8881, 269, 431, 287, 12449, 483, 8040, 6055, 275, 5285, 18, 11754, 367, 275, 6160, 317, 10527, 569, 1593, 13180, 18, 458, 16, 372, 456, 2149, 12053, 16, 500, 317, 6121, 5323, 3328, 569, 1593, 13180, 280, 14633, 18, 762, 12655, 6322, 2483, 6543, 5084, 469, 9105, 18, 679, 1008, 841, 1517, 25736, 3652, 303, 3299, 306, 3062, 292, 15163, 18, 381, 330, 2871, 343, 4721, 316, 16, 16847, 267, 5215, 317, 1008, 841, 1517, 16, 4815, 338, 330, 2756, 435, 3652, 27080, 10964, 12, 39, 13, 18714, 1864, 17, 5579, 1055, 991, 363, 18, 360, 94, 1181, 588, 1728, 841, 343, 351, 12862, 300, 841, 5239, 16, 5617, 10798, 480, 2260, 3606, 421, 14590, 16995, 18, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
I wonder whether something else also changed that is affecting the tokenization — and, as a consequence, the learning...