Skip to content

Instantly share code, notes, and snippets.

@kayush2O6
Created August 14, 2019 18:04
Show Gist options
  • Save kayush2O6/162d7015b6e44f31d7bd424e881d4be4 to your computer and use it in GitHub Desktop.
Save kayush2O6/162d7015b6e44f31d7bd424e881d4be4 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# map the execution to a specific GPU device\n",
"import os\n",
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 402
},
"colab_type": "code",
"id": "lhCbMQFVLqTl",
"outputId": "323d6c31-4a87-4559-bb91-81cb69db2108"
},
"outputs": [],
"source": [
"#@title Environment sanity check — run only on Google Colab; otherwise comment this cell out.\n",
"\n",
"\n",
"import pynvml\n",
"\n",
"pynvml.nvmlInit()\n",
"handle = pynvml.nvmlDeviceGetHandleByIndex(0)\n",
"device_name = pynvml.nvmlDeviceGetName(handle)\n",
"\n",
"if device_name != b'Tesla T4':\n",
" raise Exception(\"\"\"\n",
" Unfortunately Colab didn't give you a T4 GPU.\n",
" \n",
" Make sure you've configured Colab to request a GPU instance type.\n",
" \n",
" If you get a K80 GPU, try Runtime -> Reset all runtimes...\n",
" \"\"\")\n",
"else:\n",
" print('*********************************************')\n",
" print('Woo! Your instance has the right kind of GPU!')\n",
" print('*********************************************')\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "DYgrJRnBMIh5"
},
"outputs": [],
"source": [
"!wget -nc https://github.com/rapidsai/notebooks-extended/raw/master/utils/rapids-colab.sh\n",
"!bash rapids-colab.sh\n",
"\n",
"import sys, os\n",
"\n",
"sys.path.append('/usr/local/lib/python3.6/site-packages/')\n",
"os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n",
"os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "F5ToklT8LgoX",
"outputId": "ed57a23e-708a-457d-89f7-7a3259ee8350"
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nvstrings\n",
"import nvcategory\n",
"import cudf\n",
"import numpy as np\n",
"from numba import cuda, float32\n",
"import ctypes\n",
"import math\n",
"import cupy\n",
"import time \n",
"\n",
"from torch import nn, optim\n",
"from torch.autograd import Variable\n",
"import torch\n",
"from torch.utils.data import TensorDataset, DataLoader\n",
"torch.cuda.is_available()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 207
},
"colab_type": "code",
"collapsed": true,
"id": "Af2FjgEAMSj5",
"jupyter": {
"outputs_hidden": true
},
"outputId": "17b93035-6922-4906-89ff-dfce90729a9c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Archive: glove.6B.zip\n",
" inflating: glove.6B.50d.txt \n",
" inflating: glove.6B.100d.txt \n",
" inflating: glove.6B.200d.txt \n",
" inflating: glove.6B.300d.txt \n",
"the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581\n",
", 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 -0.42852 -0.55641 -0.364 -0.23938 0.13001 -0.063734 -0.39575 -0.48162 0.23291 0.090201 -0.13324 0.078639 -0.41634 -0.15428 0.10068 0.48891 0.31226 -0.1252 -0.037512 -1.5179 0.12612 -0.02442 -0.042961 -0.28351 3.5416 -0.11956 -0.014533 -0.1499 0.21864 -0.33412 -0.13872 0.31806 0.70358 0.44858 -0.080262 0.63003 0.32111 -0.46765 0.22786 0.36034 -0.37818 -0.56657 0.044691 0.30392\n",
". 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973 -0.43478 -0.31086 -0.44999 -0.29486 0.16608 0.11963 -0.41328 -0.42353 0.59868 0.28825 -0.11547 -0.041848 -0.67989 -0.25063 0.18472 0.086876 0.46582 0.015035 0.043474 -1.4671 -0.30384 -0.023441 0.30589 -0.21785 3.746 0.0042284 -0.18436 -0.46209 0.098329 -0.11907 0.23919 0.1161 0.41705 0.056763 -6.3681e-05 0.068987 0.087939 -0.10285 -0.13931 0.22314 -0.080803 -0.35652 0.016413 0.10216\n",
"of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603 0.18157 -0.52393 0.10381 -0.17566 0.078852 -0.36216 -0.11829 -0.83336 0.11917 -0.16605 0.061555 -0.012719 -0.56623 0.013616 0.22851 -0.14396 -0.067549 -0.38157 -0.23698 -1.7037 -0.86692 -0.26704 -0.2589 0.1767 3.8676 -0.1613 -0.13273 -0.68881 0.18444 0.0052464 -0.33874 -0.078956 0.24185 0.36576 -0.34727 0.28483 0.075693 -0.062178 -0.38988 0.22902 -0.21617 -0.22562 -0.093918 -0.80375\n",
"to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246 -0.41376 0.13228 -0.29847 -0.085253 0.17118 0.22419 -0.10046 -0.43653 0.33418 0.67846 0.057204 -0.34448 -0.42785 -0.43275 0.55963 0.10032 0.18677 -0.26854 0.037334 -2.0932 0.22171 -0.39868 0.20912 -0.55725 3.8826 0.47466 -0.95658 -0.37788 0.20869 -0.32752 0.12751 0.088359 0.16351 -0.21634 -0.094375 0.018324 0.21048 -0.03088 -0.19722 0.082279 -0.09434 -0.073297 -0.064699 -0.26044\n"
]
}
],
"source": [
"# Downloading the glove word vectors\n",
"!wget --quiet http://nlp.stanford.edu/data/glove.6B.zip\n",
"!unzip glove.6B.zip\n",
"!head -1 glove.6B.50d.txt"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "6i8WqBamUbhx"
},
"source": [
"\n",
"\n",
"---\n",
"\n",
"\n",
"\n",
"---\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "jkVx5bguUOxR"
},
"source": [
"Loading the pretrained word vectors in `pre_df` dataframe"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Imi9PV41Lgoc",
"outputId": "d031dac2-951d-445c-d36b-cb909f1c3d09"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(400000, 300)\n",
"float32\n",
"0 the\n",
"1 ,\n",
"2 .\n",
"3 of\n",
"4 to\n",
"5 and\n",
"6 in\n",
"7 a\n",
"8 \"\n",
"9 's\n",
"10 for\n",
"11 -\n",
"12 that\n",
"13 on\n",
"14 is\n",
"15 was\n",
"16 said\n",
"17 with\n",
"18 he\n",
"19 as\n",
"Name: 0, dtype: object\n"
]
}
],
"source": [
"pre_df = cudf.read_csv(\"glove.6B.300d.txt\", header=None, delim_whitespace=True, quoting=3) #ignore quoting\n",
"\n",
"mappings = pre_df['0']\n",
"\n",
"pre_df.drop_column('0')\n",
"for c in pre_df.columns:\n",
" pre_df[c] = pre_df[c].astype(np.float32)\n",
"mat = pre_df.as_gpu_matrix()\n",
"\n",
"print(mat.shape)\n",
"print(mat.dtype)\n",
"print(mappings.head(20))\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "OE8aX1-EUko-"
},
"source": [
"**GPU implementation of cosine similarity** to find the nearest word with respect to each word."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "sUkyYS0nLgoe"
},
"outputs": [],
"source": [
"@cuda.jit(device=True)\n",
"def dot(a, b, dim_size):\n",
" summ = 0\n",
" for i in range(dim_size):\n",
" summ += (a[i]*b[i])\n",
" return summ\n",
"\n",
"@cuda.jit(device=True)\n",
"def cosine_sim(a, b, dim_size):\n",
" return dot(a,b, dim_size) / ( math.sqrt(dot(a, a, dim_size)) * math.sqrt(dot(b, b, dim_size)) )\n",
"\n",
"@cuda.jit('void(float32[:,:], int32[:], int32, int32)')\n",
"def find_nearest(mat, out, dim_size, n):\n",
" idx = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x\n",
" if idx >= n:\n",
" return\n",
" c = -1.0 \n",
" c_i = idx\n",
" # here is room for improvement using shared memory \n",
" for i in range(n):\n",
" if i == idx:\n",
" continue\n",
" csim = cosine_sim(mat[idx], mat[i], dim_size)\n",
" if csim >= c:\n",
" c_i = i\n",
" c = csim\n",
" \n",
" out[idx] = c_i\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "j_uN3yJOVDfH"
},
"source": [
"Invoking the `find_nearest()` kernel with appropriate configurations (**bpg**: blocks_per_grid, **tpb**: threads_per_block)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "2cx8E8ybLgog",
"outputId": "6ef8ccd1-8c39-425f-e5d3-36898124eefc"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"kernel launch configuraion: 128 3125\n",
"time taken 13.944092090924581 mins\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>nearest</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>the</td>\n",
" <td>of</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>,</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>.</td>\n",
" <td>.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>of</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>to</td>\n",
" <td>,</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>and</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>in</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>a</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>\"</td>\n",
" <td>help</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>'s</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>for</td>\n",
" <td>both</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>-</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>that</td>\n",
" <td>where</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>on</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>is</td>\n",
" <td>another</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>was</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>said</td>\n",
" <td>“</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>with</td>\n",
" <td>the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>he</td>\n",
" <td>’s</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>as</td>\n",
" <td>the</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" word nearest\n",
"0 the of\n",
"1 , the\n",
"2 . .\n",
"3 of the\n",
"4 to ,\n",
"5 and the\n",
"6 in the\n",
"7 a the\n",
"8 \" help\n",
"9 's the\n",
"10 for both\n",
"11 - the\n",
"12 that where\n",
"13 on the\n",
"14 is another\n",
"15 was the\n",
"16 said “\n",
"17 with the\n",
"18 he ’s\n",
"19 as the"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n = mat.shape[0]\n",
"dim_size = mat.shape[1]\n",
"device = cuda.get_current_device()\n",
"\n",
"tpb = 128 #device.WARP_SIZE #blocksize or thread per block\n",
"bpg = int(np.ceil((n)/tpb)) # block per grid\n",
"print( \"kernel launch configuraion: \", tpb, bpg)\n",
"\n",
"out = cuda.device_array(shape=n, dtype=np.int32)\n",
"\n",
"st = time.time()\n",
"find_nearest[bpg,tpb](mat, out, dim_size, n)\n",
"cuda.synchronize()\n",
"\n",
"print(\"time taken {} mins\".format((time.time()-st)/60))\n",
"\n",
"result_df = cudf.DataFrame({'word':mappings})\n",
"result_df['nearest']= mappings.iloc[out]\n",
"\n",
"result_df.head(20).to_pandas()\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "qal4dWoWLgoi"
},
"outputs": [],
"source": [
"del result_df, mat, out\n",
"pre_df['0'] = mappings\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "a103o4nNLgok"
},
"outputs": [],
"source": [
"# method containing various rules for cleaning the strings\n",
"def clean_sents(gstr):\n",
" gstr = gstr.replace(r\"[^A-Za-z0-9(),!?\\'\\`]\", \" \")\n",
" gstr = gstr.replace(r\"\\'s\", \" \\'s\")\n",
" gstr = gstr.replace(r\"\\'ve\", \" \\'ve\")\n",
" gstr = gstr.replace(r\"n\\'t\", \" n\\'t\")\n",
" gstr = gstr.replace(r\"\\'re\", \" \\'re\")\n",
" gstr = gstr.replace(r\"\\'d\", \" \\'d\")\n",
" gstr = gstr.replace(r\"\\'ll\", \" \\'ll\")\n",
" gstr = gstr.replace(r\",\", \" , \")\n",
" gstr = gstr.replace(r\"!\", \" ! \")\n",
" gstr = gstr.replace(r\"\\(\", \" \\( \")\n",
" gstr = gstr.replace(r\"\\)\", \" \\) \")\n",
" gstr = gstr.replace(r\"\\?\", \" \\? \")\n",
" gstr = gstr.replace(r\"\\s{2,}\", \" \")\n",
" return gstr.strip().lower()\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "06jInESOWS4J"
},
"source": [
"Loading the dataset `train.csv` directly into GPU memory. dataset contains two columns--\n",
"1. **review** textual data containing the review about a movie\n",
"2. **label** containing 0 or 1 indicating positive or negative sentiment\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"text, label\n",
"A very very very slow-moving aimless movie about a distressed drifting young man.,0\n",
"Not sure who was more lost - the flat characters or the audience nearly half of whom walked out.,0\n",
"Attempting artiness with black & white and clever camera angles the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.,0\n",
"Very little music or anything to speak of.,0\n",
"The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.,1\n",
"The rest of the movie lacks art charm meaning... If it's about emptiness it works I guess because it's empty.,0\n",
"Wasted two hours.,0\n",
"Saw the movie today and thought it was a good effort good messages for kids.,1\n",
"A bit predictable.,0\n"
]
}
],
"source": [
"!wget --quiet https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2019/01/train.csv\n",
"!head train.csv"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "MSEybU6cLgom"
},
"outputs": [],
"source": [
"sents = cudf.read_csv(\"train.csv\", quoting=3, skiprows=1, names=['review', 'label'])\n",
"y_train = sents['label'].astype('float32').to_gpu_array()\n",
"gstr = sents['review'].data\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "5PH4KEYrXLIf"
},
"source": [
"Setting **MAX_LEN**=20 and then trimming down the strings that are longer than MAX_LEN; if a string is shorter than MAX_LEN, `PAD` tokens are added at the end to make it equal to MAX_LEN.\n",
"\n",
"Then, using `nvcategory`, an integer id corresponding to each token is generated."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "sGhPz7m7Lgoo",
"outputId": "a4502d6c-cfdf-466b-eec8-7406348ec79b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20000\n",
"2707\n"
]
}
],
"source": [
"MAX_LEN = 20\n",
"num_sents = gstr.size()\n",
"gstr = clean_sents(gstr)\n",
"\n",
"#generate the tokens\n",
"seq = gstr.split_record(' ')\n",
"\n",
"for i in range(len(seq)):\n",
" l = seq[i].size()\n",
" seq[i] = seq[i].add_strings(nvstrings.to_device((MAX_LEN-l)*['PAD'])) if l <=MAX_LEN else seq[i].remove_strings(list(range(MAX_LEN,l)))\n",
"\n",
"#generating the indices corresponding each token \n",
"c = nvcategory.from_strings_list(seq)\n",
"print(len(c.values()))\n",
"print(len(c.keys()))\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "RTS8RzXjLgoq",
"outputId": "1770ae17-de54-42cb-e6d5-5e60b767a735"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" tokens\n",
"0 !\n",
"1 'cover\n",
"2 'd\n",
"3 'film'\n",
"4 'll\n",
"5 'must'\n",
"6 're\n",
"7 's\n",
"8 'so\n",
"9 'stagey'\n",
"10 'titta\n",
"11 've\n",
"12 0\n",
"13 1\n",
"14 10\n",
"15 13\n",
"16 15\n",
"17 15pm\n",
"18 18th\n",
"19 1947\n",
"20 1948\n",
"21 1949\n",
"22 1971\n",
"23 1973\n",
"24 1980\n",
"25 1986\n",
"26 1995\n",
"27 1998\n",
"28 2\n",
"29 2005\n",
"... ...\n",
"2677 wouldnt\n",
"2678 woven\n",
"2679 wow\n",
"2680 write\n",
"2681 writer\n",
"2682 writers\n",
"2683 writing\n",
"2684 written\n",
"2685 wrong\n",
"2686 wrote\n",
"2687 x\n",
"2688 yardley\n",
"2689 yawn\n",
"2690 yeah\n",
"2691 year\n",
"2692 years\n",
"2693 yelps\n",
"2694 yes\n",
"2695 yet\n",
"2696 you\n",
"2697 young\n",
"2698 younger\n",
"2699 your\n",
"2700 youthful\n",
"2701 youtube\n",
"2702 yun\n",
"2703 z\n",
"2704 zillion\n",
"2705 zombie\n",
"2706 zombiez\n",
"\n",
"[2707 rows x 1 columns]\n",
"(1000, 20)\n"
]
}
],
"source": [
"# generating unique tokens \n",
"# print(c.keys())\n",
"sent_df = cudf.DataFrame({'tokens':c.keys()})\n",
"print(sent_df)\n",
"\n",
"# preparing the X_train \n",
"X_train = cuda.device_array((num_sents, MAX_LEN), dtype=np.int32)\n",
"c.values(X_train.device_ctypes_pointer.value)\n",
"print(X_train.shape)\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "knUORYzDY0FK"
},
"source": [
"#### Vocab Creation:\n",
"**sent_df** : contains one column `tokens`, in this column all the unique tokens are stored\n",
"\n",
"**pre_df** : contains 301 columns (`'0'` to `'300'`); in each row, column `'0'` stores one word and columns `'1'` to `'300'` store the vector representing the corresponding word.\n",
"\n",
"* Now, to create vocab, `left-join` is performed between `sent_df` and `pre_df` on `tokens` and `'0'` columns of respective dataframes. \n",
"\n",
"* After that, random vectors are assigned to those words which are not present in pre_df.\n",
"\n",
"At the end of this cell, our `vocab` is ready!\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "CIdJBDcoLgot",
"outputId": "a9a82dbc-ae67-41b3-d6e2-ed05f3d26a3c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2707\n",
"50\n"
]
}
],
"source": [
"vocab_df = sent_df.merge(pre_df,left_on='tokens', right_on='0', how='left')\n",
"vocab_df.drop_column('0')\n",
"vocab_df.drop_column('tokens')\n",
"\n",
"all_token = vocab_df.shape[0]\n",
"print(all_token)\n",
"# calculating the number of tokens not found in GloVe\n",
"not_found = vocab_df['1'].null_count\n",
"print(not_found)\n",
"\n",
"# filling the not found tokens with random vector, [now with -1]\n",
"for c in vocab_df.columns:\n",
" vocab_df[c] = vocab_df[c].fillna(cupy.random.normal(size=all_token)).astype(np.float32)\n",
"vocab = vocab_df.as_gpu_matrix(order='C')\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "YnczW9Edb7gB"
},
"source": [
"Created a Toy lstm model, for this problem using `PyTorch`"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "WQ5iltGKLgox"
},
"outputs": [],
"source": [
"def create_emb_layer(weights_matrix, non_trainable=False):\n",
" num_embeddings, embedding_dim = weights_matrix.shape\n",
" emb_layer = nn.Embedding(num_embeddings, embedding_dim)\n",
" emb_layer.weight = nn.Parameter(weights_matrix)\n",
" if non_trainable:\n",
" emb_layer.weight.requires_grad = False\n",
"\n",
" return emb_layer, num_embeddings, embedding_dim\n",
"\n",
"class ToyLSTM(nn.Module):\n",
" def __init__(self, weights_matrix, hidden_size, output_size):\n",
" super(ToyLSTM, self).__init__()\n",
" self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)\n",
" \n",
" self.hidden_size = hidden_size\n",
" self.output_size = output_size\n",
"\n",
" self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)\n",
" self.linear = nn.Linear(hidden_size, hidden_size//2)\n",
" self.out = nn.Linear(hidden_size//2, output_size)\n",
" self.relu = nn.ReLU()\n",
" self.sigmoid = nn.Sigmoid()\n",
" \n",
" def forward(self, inp):\n",
" h_embedding = self.embedding(inp) \n",
" h_lstm, _ = self.lstm(h_embedding)\n",
" max_pool, _ = torch.max(h_lstm, 1)\n",
" linear = self.relu(self.linear(max_pool)) \n",
" out = self.sigmoid(self.out(linear))\n",
" return out"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "sd7FLch4cMqZ"
},
"source": [
"A utility method to convert a Numba CUDA array to a PyTorch CUDA tensor. Currently there is no API for this; it is an open issue [#23067](https://github.com/pytorch/pytorch/issues/23067)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "QeCT74BVLgo0"
},
"outputs": [],
"source": [
"def devndarray2tensor(dev_arr, dtyp='float32'):\n",
" dmap = {'float32':torch.float32, 'int32':torch.int32}\n",
" t = torch.empty(size=dev_arr.shape, dtype=dmap[dtyp]).cuda()\n",
" ctx = cuda.cudadrv.driver.driver.get_context()\n",
" \n",
" # constant value of #bytes in float32 = 4\n",
" mp = cuda.cudadrv.driver.MemoryPointer(ctx, ctypes.c_ulong(t.data_ptr()), t.numel()*4)\n",
" tmp_arr = cuda.cudadrv.devicearray.DeviceNDArray(t.size(), [i*4 for i in t.stride()], np.dtype(dtyp), \n",
" gpu_data=mp, stream=torch.cuda.current_stream().cuda_stream)\n",
" tmp_arr.copy_to_device(dev_arr)\n",
" return t"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "6Yqyj68pLgo2",
"outputId": "93afb75a-9928-4ddb-acbb-453eb6ad0b05"
},
"outputs": [
{
"data": {
"text/plain": [
"ToyLSTM(\n",
" (embedding): Embedding(2707, 300)\n",
" (lstm): LSTM(300, 50, batch_first=True)\n",
" (linear): Linear(in_features=50, out_features=25, bias=True)\n",
" (out): Linear(in_features=25, out_features=1, bias=True)\n",
" (relu): ReLU()\n",
" (sigmoid): Sigmoid()\n",
")"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# instantiate the toy_lstm model\n",
"toy_lstm = ToyLSTM(weights_matrix=devndarray2tensor(vocab), hidden_size=50, output_size=1).cuda()\n",
"toy_lstm"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "jFyUuCkDLgo5",
"outputId": "08c19e74-68ce-4119-82a2-30a7a440a130"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'F_CONTIGUOUS': False, 'C_CONTIGUOUS': True}\n",
"{'F_CONTIGUOUS': True, 'C_CONTIGUOUS': True}\n"
]
}
],
"source": [
"print(X_train.flags)\n",
"print(y_train.flags)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Ky3vKhnHeO8m"
},
"outputs": [],
"source": [
"## create training and validation split \n",
"split_size = int(0.8 * y_train.shape[0])"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "tZit0-UqLgo7"
},
"outputs": [],
"source": [
"train = TensorDataset(devndarray2tensor(X_train[:split_size], dtyp='int32').to(torch.int64), devndarray2tensor(y_train[:split_size]))\n",
"trainloader = DataLoader(train, batch_size=256)\n",
"\n",
"valid = TensorDataset(devndarray2tensor(X_train[split_size:], dtyp='int32').to(torch.int64), devndarray2tensor(y_train[split_size:]))\n",
"validloader = DataLoader(valid, batch_size=200)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "5K_LjI7sLgo9"
},
"outputs": [],
"source": [
"loss_function = nn.BCELoss(reduction='mean')  # forward() already applies sigmoid; BCEWithLogitsLoss would double-apply it\n",
"optimizer = optim.Adam(toy_lstm.parameters())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "zg9ByLGBLgo_"
},
"outputs": [],
"source": [
"for epoch in range(1, 400):\n",
" train_loss, valid_loss = [], []\n",
"\n",
" # training part\n",
" toy_lstm.train()\n",
" for data, target in trainloader:\n",
" optimizer.zero_grad()\n",
" output = toy_lstm(data)\n",
" loss = loss_function(output, target.view(-1,1))\n",
" loss.backward()\n",
" optimizer.step()\n",
" train_loss.append(loss.item())\n",
" \n",
" ## evaluation part \n",
" toy_lstm.eval()\n",
" for data, target in validloader:\n",
" output = toy_lstm(data)\n",
" pred = torch.sum(torch.round(output))\n",
" #print(pred)\n",
" loss = loss_function(output, target.view(-1,1))\n",
" valid_loss.append(loss.item())\n"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[0.5659271478652954,\n",
" 0.5332825183868408,\n",
" 0.5264097452163696,\n",
" 0.3963638246059418]"
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_loss"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "OASrs7gELgpB",
"outputId": "d6496cbb-4b9c-439e-ce8c-317ff5700fcd"
},
"outputs": [
{
"data": {
"text/plain": [
"[0.6292966604232788]"
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid_loss\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "HtGo7XcrgBK5"
},
"source": [
"Finally, we can obtain the predictions:"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "aKjA9ISIgBtR"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor(89., device='cuda:0', grad_fn=<SumBackward0>)\n",
"tensor(114., device='cuda:0')\n"
]
}
],
"source": [
"dataiter = iter(validloader)\n",
"data, labels = next(dataiter)\n",
"# print(data, labels)\n",
"output = toy_lstm(data)\n",
"# print(\"out\", output, output.shape)\n",
"\n",
"pred = torch.round(output)\n",
"\n",
"print(\"predicted\", torch.sum(pred))\n",
"print(\"actual\", torch.sum(labels))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "e2e_text_classification_gpu.ipynb",
"provenance": [],
"version": "0.3.2"
},
"file_extension": ".py",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"mimetype": "text/x-python",
"name": "python",
"npconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": 3
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment