Skip to content

Instantly share code, notes, and snippets.

@kayush2O6
Created August 14, 2019 18:04
Show Gist options
  • Save kayush2O6/162d7015b6e44f31d7bd424e881d4be4 to your computer and use it in GitHub Desktop.
Save kayush2O6/162d7015b6e44f31d7bd424e881d4be4 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# map the execution to a specific GPU device\n",
"import os\n",
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 402
},
"colab_type": "code",
"id": "lhCbMQFVLqTl",
"outputId": "323d6c31-4a87-4559-bb91-81cb69db2108"
},
"outputs": [],
"source": [
"#@title Environment sanity check — run only on Google Colab; otherwise comment this cell out.\n",
"\n",
"\n",
"import pynvml\n",
"\n",
"pynvml.nvmlInit()\n",
"handle = pynvml.nvmlDeviceGetHandleByIndex(0)\n",
"device_name = pynvml.nvmlDeviceGetName(handle)\n",
"\n",
"if device_name != b'Tesla T4':\n",
" raise Exception(\"\"\"\n",
" Unfortunately Colab didn't give you a T4 GPU.\n",
" \n",
" Make sure you've configured Colab to request a GPU instance type.\n",
" \n",
" If you get a K80 GPU, try Runtime -> Reset all runtimes...\n",
" \"\"\")\n",
"else:\n",
" print('*********************************************')\n",
" print('Woo! Your instance has the right kind of GPU!')\n",
" print('*********************************************')\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "DYgrJRnBMIh5"
},
"outputs": [],
"source": [
"!wget -nc https://github.com/rapidsai/notebooks-extended/raw/master/utils/rapids-colab.sh\n",
"!bash rapids-colab.sh\n",
"\n",
"import sys, os\n",
"\n",
"sys.path.append('/usr/local/lib/python3.6/site-packages/')\n",
"os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n",
"os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "F5ToklT8LgoX",
"outputId": "ed57a23e-708a-457d-89f7-7a3259ee8350"
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nvstrings\n",
"import nvcategory\n",
"import cudf\n",
"import numpy as np\n",
"from numba import cuda, float32\n",
"import ctypes\n",
"import math\n",
"import cupy\n",
"import time \n",
"\n",
"from torch import nn, optim\n",
"from torch.autograd import Variable\n",
"import torch\n",
"from torch.utils.data import TensorDataset, DataLoader\n",
"torch.cuda.is_available()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 207
},
"colab_type": "code",
"collapsed": true,
"id": "Af2FjgEAMSj5",
"jupyter": {
"outputs_hidden": true
},
"outputId": "17b93035-6922-4906-89ff-dfce90729a9c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Archive: glove.6B.zip\n",
" inflating: glove.6B.50d.txt \n",
" inflating: glove.6B.100d.txt \n",
" inflating: glove.6B.200d.txt \n",
" inflating: glove.6B.300d.txt \n",
"the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581\n",
", 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 -0.42852 -0.55641 -0.364 -0.23938 0.13001 -0.063734 -0.39575 -0.48162 0.23291 0.090201 -0.13324 0.078639 -0.41634 -0.15428 0.10068 0.48891 0.31226 -0.1252 -0.037512 -1.5179 0.12612 -0.02442 -0.042961 -0.28351 3.5416 -0.11956 -0.014533 -0.1499 0.21864 -0.33412 -0.13872 0.31806 0.70358 0.44858 -0.080262 0.63003 0.32111 -0.46765 0.22786 0.36034 -0.37818 -0.56657 0.044691 0.30392\n",
". 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973 -0.43478 -0.31086 -0.44999 -0.29486 0.16608 0.11963 -0.41328 -0.42353 0.59868 0.28825 -0.11547 -0.041848 -0.67989 -0.25063 0.18472 0.086876 0.46582 0.015035 0.043474 -1.4671 -0.30384 -0.023441 0.30589 -0.21785 3.746 0.0042284 -0.18436 -0.46209 0.098329 -0.11907 0.23919 0.1161 0.41705 0.056763 -6.3681e-05 0.068987 0.087939 -0.10285 -0.13931 0.22314 -0.080803 -0.35652 0.016413 0.10216\n",
"of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603 0.18157 -0.52393 0.10381 -0.17566 0.078852 -0.36216 -0.11829 -0.83336 0.11917 -0.16605 0.061555 -0.012719 -0.56623 0.013616 0.22851 -0.14396 -0.067549 -0.38157 -0.23698 -1.7037 -0.86692 -0.26704 -0.2589 0.1767 3.8676 -0.1613 -0.13273 -0.68881 0.18444 0.0052464 -0.33874 -0.078956 0.24185 0.36576 -0.34727 0.28483 0.075693 -0.062178 -0.38988 0.22902 -0.21617 -0.22562 -0.093918 -0.80375\n",
"to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246 -0.41376 0.13228 -0.29847 -0.085253 0.17118 0.22419 -0.10046 -0.43653 0.33418 0.67846 0.057204 -0.34448 -0.42785 -0.43275 0.55963 0.10032 0.18677 -0.26854 0.037334 -2.0932 0.22171 -0.39868 0.20912 -0.55725 3.8826 0.47466 -0.95658 -0.37788 0.20869 -0.32752 0.12751 0.088359 0.16351 -0.21634 -0.094375 0.018324 0.21048 -0.03088 -0.19722 0.082279 -0.09434 -0.073297 -0.064699 -0.26044\n"
]
}
],
"source": [
"# Downloading the glove word vectors\n",
"!wget --quiet http://nlp.stanford.edu/data/glove.6B.zip\n",
"!unzip glove.6B.zip\n",
"!head -1 glove.6B.50d.txt"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "6i8WqBamUbhx"
},
"source": [
"\n",
"\n",
"---\n",
"\n",
"\n",
"\n",
"---\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "jkVx5bguUOxR"
},
"source": [
"Loading the pretrained word vectors in `pre_df` dataframe"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Imi9PV41Lgoc",
"outputId": "d031dac2-951d-445c-d36b-cb909f1c3d09"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(400000, 300)\n",
"float32\n",
"0 the\n",
"1 ,\n",
"2 .\n",
"3 of\n",
"4 to\n",
"5 and\n",
"6 in\n",
"7 a\n",
"8 \"\n",
"9 's\n",
"10 for\n",
"11 -\n",
"12 that\n",
"13 on\n",
"14 is\n",
"15 was\n",
"16 said\n",
"17 with\n",
"18 he\n",
"19 as\n",
"Name: 0, dtype: object\n"
]
}
],
"source": [
"pre_df = cudf.read_csv(\"glove.6B.300d.txt\", header=None, delim_whitespace=True, quoting=3) #ignore quoting\n",
"\n",
"mappings = pre_df['0']\n",
"\n",
"pre_df.drop_column('0')\n",
"for c in pre_df.columns:\n",
" pre_df[c] = pre_df[c].astype(np.float32)\n",
"mat = pre_df.as_gpu_matrix()\n",
"\n",
"print(mat.shape)\n",
"print(mat.dtype)\n",
"print(mappings.head(20))\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "OE8aX1-EUko-"
},
"source": [
"**GPU implementation of cosine similarity** to find the nearest word with respect to each word."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "sUkyYS0nLgoe"
},
"outputs": [],
"source": [
"@cuda.jit(device=True)\n",
"def dot(a, b, dim_size):\n",
" summ = 0\n",
" for i in range(dim_size):\n",
" summ += (a[i]*b[i])\n",
" return summ\n",
"\n",
"@cuda.jit(device=True)\n",
"def cosine_sim(a, b, dim_size):\n",
" return dot(a,b, dim_size) / ( math.sqrt(dot(a, a, dim_size)) * math.sqrt(dot(b, b, dim_size)) )\n",
"\n",
"@cuda.jit('void(float32[:,:], int32[:], int32, int32)')\n",
"def find_nearest(mat, out, dim_size, n):\n",
" idx = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x\n",
" if idx >= n:\n",
" return\n",
" c = -1.0 \n",
" c_i = idx\n",
" # here is room for improvement using shared memory \n",
" for i in range(n):\n",
" if i == idx:\n",
" continue\n",
" csim = cosine_sim(mat[idx], mat[i], dim_size)\n",
" if csim >= c:\n",
" c_i = i\n",
" c = csim\n",
" \n",
" out[idx] = c_i\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "j_uN3yJOVDfH"
},
"source": [
"Invoking the `find_nearest()` kernel with appropriate configurations (**bpg**: blocks_per_grid, **tpb**: threads_per_block)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "2cx8E8ybLgog",
"outputId": "6ef8ccd1-8c39-425f-e5d3-36898124eefc"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"kernel launch configuraion: 128 3125\n",
"time taken 13.944092090924581 mins\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>nearest</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>the</td>\n",
" <td>of</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>,</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>.</td>\n",
" <td>.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>of</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>to</td>\n",
" <td>,</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>and</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>in</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>a</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>\"</td>\n",
" <td>help</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>'s</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>for</td>\n",
" <td>both</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>-</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>that</td>\n",
" <td>where</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>on</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>is</td>\n",
" <td>another</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>was</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>said</td>\n",
" <td>“</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>with</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>he</td>\n",
" <td>’s</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>as</td>\n",
" <td>the</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" word nearest\n",
"0 the of\n",
"1 , the\n",
"2 . .\n",
"3 of the\n",
"4 to ,\n",
"5 and the\n",
"6 in the\n",
"7 a the\n",
"8 \" help\n",
"9 's the\n",
"10 for both\n",
"11 - the\n",
"12 that where\n",
"13 on the\n",
"14 is another\n",
"15 was the\n",
"16 said “\n",
"17 with the\n",
"18 he ’s\n",
"19 as the"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n = mat.shape[0]\n",
"dim_size = mat.shape[1]\n",
"device = cuda.get_current_device()\n",
"\n",
"tpb = 128 #device.WARP_SIZE #blocksize or thread per block\n",
"bpg = int(np.ceil((n)/tpb)) # block per grid\n",
"print( \"kernel launch configuraion: \", tpb, bpg)\n",
"\n",
"out = cuda.device_array(shape=n, dtype=np.int32)\n",
"\n",
"st = time.time()\n",
"find_nearest[bpg,tpb](mat, out, dim_size, n)\n",
"cuda.synchronize()\n",
"\n",
"print(\"time taken {} mins\".format((time.time()-st)/60))\n",
"\n",
"result_df = cudf.DataFrame({'word':mappings})\n",
"result_df['nearest']= mappings.iloc[out]\n",
"\n",
"result_df.head(20).to_pandas()\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "qal4dWoWLgoi"
},
"outputs": [],
"source": [
"del result_df, mat, out\n",
"pre_df['0'] = mappings\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "a103o4nNLgok"
},
"outputs": [],
"source": [
"# method containing various rules for cleaning the strings\n",
"def clean_sents(gstr):\n",
" gstr = gstr.replace(r\"[^A-Za-z0-9(),!?\\'\\`]\", \" \")\n",
" gstr = gstr.replace(r\"\\'s\", \" \\'s\")\n",
" gstr = gstr.replace(r\"\\'ve\", \" \\'ve\")\n",
" gstr = gstr.replace(r\"n\\'t\", \" n\\'t\")\n",
" gstr = gstr.replace(r\"\\'re\", \" \\'re\")\n",
" gstr = gstr.replace(r\"\\'d\", \" \\'d\")\n",
" gstr = gstr.replace(r\"\\'ll\", \" \\'ll\")\n",
" gstr = gstr.replace(r\",\", \" , \")\n",
" gstr = gstr.replace(r\"!\", \" ! \")\n",
" gstr = gstr.replace(r\"\\(\", \" \\( \")\n",
" gstr = gstr.replace(r\"\\)\", \" \\) \")\n",
" gstr = gstr.replace(r\"\\?\", \" \\? \")\n",
" gstr = gstr.replace(r\"\\s{2,}\", \" \")\n",
" return gstr.strip().lower()\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "06jInESOWS4J"
},
"source": [
"Loading the dataset `train.csv` directly into GPU memory. dataset contains two columns--\n",
"1. **review** textual data containing the review about a movie\n",
"2. **label** containing 0 or 1 indicating positive or negative sentiment\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"text, label\n",
"A very very very slow-moving aimless movie about a distressed drifting young man.,0\n",
"Not sure who was more lost - the flat characters or the audience nearly half of whom walked out.,0\n",
"Attempting artiness with black & white and clever camera angles the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.,0\n",
"Very little music or anything to speak of.,0\n",
"The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.,1\n",
"The rest of the movie lacks art charm meaning... If it's about emptiness it works I guess because it's empty.,0\n",
"Wasted two hours.,0\n",
"Saw the movie today and thought it was a good effort good messages for kids.,1\n",
"A bit predictable.,0\n"
]
}
],
"source": [
"!wget --quiet https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2019/01/train.csv\n",
"!head train.csv"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "MSEybU6cLgom"
},
"outputs": [],
"source": [
"sents = cudf.read_csv(\"train.csv\", quoting=3, skiprows=1, names=['review', 'label'])\n",
"y_train = sents['label'].astype('float32').to_gpu_array()\n",
"gstr = sents['review'].data\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "5PH4KEYrXLIf"
},
"source": [
"Setting **MAX_LEN**=20 and then trimming down the strings that are longer than MAX_LEN; if a string is shorter than MAX_LEN, `PAD` tokens are added at the end to make it equal to MAX_LEN.\n",
"\n",
"Then, using `nvcategory`, an integer id corresponding to each token is generated."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "sGhPz7m7Lgoo",
"outputId": "a4502d6c-cfdf-466b-eec8-7406348ec79b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20000\n",
"2707\n"
]
}
],
"source": [
"MAX_LEN = 20\n",
"num_sents = gstr.size()\n",
"gstr = clean_sents(gstr)\n",
"\n",
"#generate the tokens\n",
"seq = gstr.split_record(' ')\n",
"\n",
"for i in range(len(seq)):\n",
" l = seq[i].size()\n",
" seq[i] = seq[i].add_strings(nvstrings.to_device((MAX_LEN-l)*['PAD'])) if l <=MAX_LEN else seq[i].remove_strings(list(range(MAX_LEN,l)))\n",
"\n",
"#generating the indices corresponding each token \n",
"c = nvcategory.from_strings_list(seq)\n",
"print(len(c.values()))\n",
"print(len(c.keys()))\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "RTS8RzXjLgoq",
"outputId": "1770ae17-de54-42cb-e6d5-5e60b767a735"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" tokens\n",
"0 !\n",
"1 'cover\n",
"2 'd\n",
"3 'film'\n",
"4 'll\n",
"5 'must'\n",
"6 're\n",
"7 's\n",
"8 'so\n",
"9 'stagey'\n",
"10 'titta\n",
"11 've\n",
"12 0\n",
"13 1\n",
"14 10\n",
"15 13\n",
"16 15\n",
"17 15pm\n",
"18 18th\n",
"19 1947\n",
"20 1948\n",
"21 1949\n",
"22 1971\n",
"23 1973\n",
"24 1980\n",
"25 1986\n",
"26 1995\n",
"27 1998\n",
"28 2\n",
"29 2005\n",
"... ...\n",
"2677 wouldnt\n",
"2678 woven\n",
"2679 wow\n",
"2680 write\n",
"2681 writer\n",
"2682 writers\n",
"2683 writing\n",
"2684 written\n",
"2685 wrong\n",
"2686 wrote\n",
"2687 x\n",
"2688 yardley\n",
"2689 yawn\n",
"2690 yeah\n",
"2691 year\n",
"2692 years\n",
"2693 yelps\n",
"2694 yes\n",
"2695 yet\n",
"2696 you\n",
"2697 young\n",
"2698 younger\n",
"2699 your\n",
"2700 youthful\n",
"2701 youtube\n",
"2702 yun\n",
"2703 z\n",
"2704 zillion\n",
"2705 zombie\n",
"2706 zombiez\n",
"\n",
"[2707 rows x 1 columns]\n",
"(1000, 20)\n"
]
}
],
"source": [
"# generating unique tokens \n",
"# print(c.keys())\n",
"sent_df = cudf.DataFrame({'tokens':c.keys()})\n",
"print(sent_df)\n",
"\n",
"# preparing the X_train \n",
"X_train = cuda.device_array((num_sents, MAX_LEN), dtype=np.int32)\n",
"c.values(X_train.device_ctypes_pointer.value)\n",
"print(X_train.shape)\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "knUORYzDY0FK"
},
"source": [
"#### Vocab Creation:\n",
"**sent_df** : contains one column `tokens`, in this column all the unique tokens are stored\n",
"\n",
"**pre_df** : contains 301 columns (`'0'` to `'300'`); in each row, column `'0'` stores one word and columns `'1'` to `'300'` store the vector representing the corresponding word.\n",
"\n",
"* Now, to create vocab, `left-join` is performed between `sent_df` and `pre_df` on `tokens` and `'0'` columns of respective dataframes. \n",
"\n",
"* After that, random vectors are assigned to those words which are not present in pre_df.\n",
"\n",
"At the end of this cell, our `vocab` is ready!\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "CIdJBDcoLgot",
"outputId": "a9a82dbc-ae67-41b3-d6e2-ed05f3d26a3c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2707\n",
"50\n"
]
}
],
"source": [
"vocab_df = sent_df.merge(pre_df,left_on='tokens', right_on='0', how='left')\n",
"vocab_df.drop_column('0')\n",
"vocab_df.drop_column('tokens')\n",
"\n",
"all_token = vocab_df.shape[0]\n",
"print(all_token)\n",
"# calculating the number of tokens not found in GloVe\n",
"not_found = vocab_df['1'].null_count\n",
"print(not_found)\n",
"\n",
"# filling the not found tokens with random vector, [now with -1]\n",
"for c in vocab_df.columns:\n",
" vocab_df[c] = vocab_df[c].fillna(cupy.random.normal(size=all_token)).astype(np.float32)\n",
"vocab = vocab_df.as_gpu_matrix(order='C')\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "YnczW9Edb7gB"
},
"source": [
"Created a Toy lstm model, for this problem using `PyTorch`"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "WQ5iltGKLgox"
},
"outputs": [],
"source": [
"def create_emb_layer(weights_matrix, non_trainable=False):\n",
" num_embeddings, embedding_dim = weights_matrix.shape\n",
" emb_layer = nn.Embedding(num_embeddings, embedding_dim)\n",
" emb_layer.weight = nn.Parameter(weights_matrix)\n",
" if non_trainable:\n",
" emb_layer.weight.requires_grad = False\n",
"\n",
" return emb_layer, num_embeddings, embedding_dim\n",
"\n",
"class ToyLSTM(nn.Module):\n",
" def __init__(self, weights_matrix, hidden_size, output_size):\n",
" super(ToyLSTM, self).__init__()\n",
" self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)\n",
" \n",
" self.hidden_size = hidden_size\n",
" self.output_size = output_size\n",
"\n",
" self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)\n",
" self.linear = nn.Linear(hidden_size, hidden_size//2)\n",
" self.out = nn.Linear(hidden_size//2, output_size)\n",
" self.relu = nn.ReLU()\n",
" self.sigmoid = nn.Sigmoid()\n",
" \n",
" def forward(self, inp):\n",
" h_embedding = self.embedding(inp) \n",
" h_lstm, _ = self.lstm(h_embedding)\n",
" max_pool, _ = torch.max(h_lstm, 1)\n",
" linear = self.relu(self.linear(max_pool)) \n",
" out = self.sigmoid(self.out(linear))\n",
" return out"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "sd7FLch4cMqZ"
},
"source": [
"A utility method to convert a Numba CUDA array to a PyTorch CUDA tensor. Currently there is no API for this; it is an open issue [#23067](https://github.com/pytorch/pytorch/issues/23067)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "QeCT74BVLgo0"
},
"outputs": [],
"source": [
"def devndarray2tensor(dev_arr, dtyp='float32'):\n",
" dmap = {'float32':torch.float32, 'int32':torch.int32}\n",
" t = torch.empty(size=dev_arr.shape, dtype=dmap[dtyp]).cuda()\n",
" ctx = cuda.cudadrv.driver.driver.get_context()\n",
" \n",
" # constant value of #bytes in float32 = 4\n",
" mp = cuda.cudadrv.driver.MemoryPointer(ctx, ctypes.c_ulong(t.data_ptr()), t.numel()*4)\n",
" tmp_arr = cuda.cudadrv.devicearray.DeviceNDArray(t.size(), [i*4 for i in t.stride()], np.dtype(dtyp), \n",
" gpu_data=mp, stream=torch.cuda.current_stream().cuda_stream)\n",
" tmp_arr.copy_to_device(dev_arr)\n",
" return t"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "6Yqyj68pLgo2",
"outputId": "93afb75a-9928-4ddb-acbb-453eb6ad0b05"
},
"outputs": [
{
"data": {
"text/plain": [
"ToyLSTM(\n",
" (embedding): Embedding(2707, 300)\n",
" (lstm): LSTM(300, 50, batch_first=True)\n",
" (linear): Linear(in_features=50, out_features=25, bias=True)\n",
" (out): Linear(in_features=25, out_features=1, bias=True)\n",
" (relu): ReLU()\n",
" (sigmoid): Sigmoid()\n",
")"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# instantiate the toy_lstm model\n",
"toy_lstm = ToyLSTM(weights_matrix=devndarray2tensor(vocab), hidden_size=50, output_size=1).cuda()\n",
"toy_lstm"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "jFyUuCkDLgo5",
"outputId": "08c19e74-68ce-4119-82a2-30a7a440a130"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'F_CONTIGUOUS': False, 'C_CONTIGUOUS': True}\n",
"{'F_CONTIGUOUS': True, 'C_CONTIGUOUS': True}\n"
]
}
],
"source": [
"print(X_train.flags)\n",
"print(y_train.flags)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Ky3vKhnHeO8m"
},
"outputs": [],
"source": [
"## create training and validation split \n",
"split_size = int(0.8 * y_train.shape[0])"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "tZit0-UqLgo7"
},
"outputs": [],
"source": [
"train = TensorDataset(devndarray2tensor(X_train[:split_size], dtyp='int32').to(torch.int64), devndarray2tensor(y_train[:split_size]))\n",
"trainloader = DataLoader(train, batch_size=256)\n",
"\n",
"valid = TensorDataset(devndarray2tensor(X_train[split_size:], dtyp='int32').to(torch.int64), devndarray2tensor(y_train[split_size:]))\n",
"validloader = DataLoader(valid, batch_size=200)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "5K_LjI7sLgo9"
},
"outputs": [],
"source": [
"loss_function = nn.BCELoss(reduction='mean')  # forward() already applies sigmoid; BCEWithLogitsLoss would double-apply it\n",
"optimizer = optim.Adam(toy_lstm.parameters())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "zg9ByLGBLgo_"
},
"outputs": [],
"source": [
"for epoch in range(1, 400):\n",
" train_loss, valid_loss = [], []\n",
"\n",
" # training part\n",
" toy_lstm.train()\n",
" for data, target in trainloader:\n",
" optimizer.zero_grad()\n",
" output = toy_lstm(data)\n",
" loss = loss_function(output, target.view(-1,1))\n",
" loss.backward()\n",
" optimizer.step()\n",
" train_loss.append(loss.item())\n",
" \n",
" ## evaluation part \n",
" toy_lstm.eval()\n",
" for data, target in validloader:\n",
" output = toy_lstm(data)\n",
" pred = torch.sum(torch.round(output))\n",
" #print(pred)\n",
" loss = loss_function(output, target.view(-1,1))\n",
" valid_loss.append(loss.item())\n"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[0.5659271478652954,\n",
" 0.5332825183868408,\n",
" 0.5264097452163696,\n",
" 0.3963638246059418]"
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_loss"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "OASrs7gELgpB",
"outputId": "d6496cbb-4b9c-439e-ce8c-317ff5700fcd"
},
"outputs": [
{
"data": {
"text/plain": [
"[0.6292966604232788]"
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid_loss\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "HtGo7XcrgBK5"
},
"source": [
"Finally, we can obtain the predictions:"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "aKjA9ISIgBtR"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor(89., device='cuda:0', grad_fn=<SumBackward0>)\n",
"tensor(114., device='cuda:0')\n"
]
}
],
"source": [
"dataiter = iter(validloader)\n",
"data, labels = next(dataiter)\n",
"# print(data, labels)\n",
"output = toy_lstm(data)\n",
"# print(\"out\", output, output.shape)\n",
"\n",
"pred = torch.round(output)\n",
"\n",
"print(\"predicted\", torch.sum(pred))\n",
"print(\"actual\", torch.sum(labels))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "e2e_text_classification_gpu.ipynb",
"provenance": [],
"version": "0.3.2"
},
"file_extension": ".py",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"mimetype": "text/x-python",
"name": "python",
"npconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": 3
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment