myselfHimanshu/BOW.ipynb

## BOW.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Lec1-BOW.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "odL47DwX36L-",
        "colab_type": "text"
      },
      "source": [
        "## Intro to Neural Networks for NLP\n",
        "\n",
        "Class labels used in the dataset:\n",
        "\n",
        "```python\n",
        "Labels = {\n",
        "          \"very_bad\":0,\n",
        "          \"bad\":1,\n",
        "          \"neutral\":2,\n",
        "          \"good\":3,\n",
        "          \"very_good\":4\n",
        "}\n",
        "```"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "53ztcfps3t-N",
        "colab_type": "code",
        "outputId": "c3ce8900-7352-4998-941e-9abecea760b5",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "\"\"\"\n",
        "mount google drive, change directory and download the dataset\n",
        "\"\"\"\n",
        "\n",
        "from google.colab import drive\n",
        "drive.mount(\"/content/drive\")"
      ],
      "execution_count": 27,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "LT8yhQUe4oqU",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import os\n",
        "\n",
        "# Absolute path under the mount point passed to drive.mount(\"/content/drive\").\n",
        "# The relative form (\"./drive/...\") only resolves on the first run; once the\n",
        "# working directory has moved into Drive, re-running the cell would raise.\n",
        "os.chdir(\"/content/drive/My Drive/CMUNN4NLP\")"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "bs0N3kru5dXk",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "\"\"\"\n",
        "download the files\n",
        "\"\"\"\n",
        "# !wget -c https://github.com/neubig/nn4nlp-code/raw/master/data/classes/dev.txt\n",
        "# !wget -c https://github.com/neubig/nn4nlp-code/raw/master/data/classes/test.txt\n",
        "# !wget -c https://github.com/neubig/nn4nlp-code/raw/master/data/classes/train.txt"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "VVeUmUUw55Vs",
        "colab_type": "text"
      },
      "source": [
        "## Implementation"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "auLTNJX55ypW",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.nn.functional as F\n",
        "import torch.optim as optim"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "_QyEUk6P6D_g",
        "colab_type": "code",
        "outputId": "d4f24e52-01b0-409f-ac0f-29de3424a185",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "use_cuda = True\n",
        "print(\"CUDA Available : \", torch.cuda.is_available())\n",
        "device = torch.device(\"cuda\" if (use_cuda and torch.cuda.is_available()) else \"cpu\")"
      ],
      "execution_count": 30,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "CUDA Available :  True\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zqKy9rz26Zic",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Load the train/test splits. Every line is split on \"|\" into four fields;\n",
        "# the two middle fields are placeholders and are dropped, leaving label and text.\n",
        "import pandas as pd\n",
        "\n",
        "train_data = (\n",
        "    pd.read_csv(\"./train.txt\", delimiter=\"|\", names=['label','remove','removel','text'])\n",
        "    .drop(columns=[\"remove\",\"removel\"])\n",
        ")\n",
        "test_data = (\n",
        "    pd.read_csv(\"./test.txt\", delimiter=\"|\", names=['label','remove','removel','text'])\n",
        "    .drop(columns=[\"remove\",\"removel\"])\n",
        ")"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "q6eNvg_Q6yKz",
        "colab_type": "code",
        "outputId": "cced2e27-c47a-40fa-8c8e-a4e1531a7a65",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 204
        }
      },
      "source": [
        "train_data.head()"
      ],
      "execution_count": 32,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>label</th>\n",
              "      <th>text</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>3</td>\n",
              "      <td>The Rock is destined to be the 21st Century '...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>4</td>\n",
              "      <td>The gorgeously elaborate continuation of `` T...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>3</td>\n",
              "      <td>Singer\\/composer Bryan Adams contributes a sl...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>2</td>\n",
              "      <td>You 'd think by now America would have had en...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>3</td>\n",
              "      <td>Yet the act is still charming here .</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "   label                                               text\n",
              "0      3   The Rock is destined to be the 21st Century '...\n",
              "1      4   The gorgeously elaborate continuation of `` T...\n",
              "2      3   Singer\\/composer Bryan Adams contributes a sl...\n",
              "3      2   You 'd think by now America would have had en...\n",
              "4      3               Yet the act is still charming here ."
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 32
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "G_1m48hB665J",
        "colab_type": "code",
        "outputId": "91361f5f-bc4e-461c-d9d2-6493b21fcd95",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "# Create the word -> index vocabulary dictionary from the training text.\n",
        "# The unique tokens are sorted before being numbered so that every word gets\n",
        "# the same index on every run: the iteration order of a set of strings is not\n",
        "# stable across kernel restarts because of Python's string hash randomization.\n",
        "sentences = \" \".join(train_data.text.values)\n",
        "\n",
        "word_2_indx = {word: indx for indx, word in enumerate(sorted(set(sentences.split())))}\n",
        "print(\"Vocab Size : {}\".format(len(word_2_indx)))"
      ],
      "execution_count": 33,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Vocab Size : 18278\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oZduu1FL7zii",
        "colab_type": "code",
        "outputId": "7865f5d0-ee23-4644-ef5a-4e1a7254095a",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 53
        }
      },
      "source": [
        "vocab_size = len(word_2_indx)\n",
        "num_labels = len(set(train_data[\"label\"].values))\n",
        "print(\"Vocab Size : \", vocab_size)\n",
        "print(\"No. Labels : \", num_labels)"
      ],
      "execution_count": 34,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Vocab Size :  18278\n",
            "No. Labels :  5\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Clw37c5E8Ihs",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "\"\"\"\n",
        "Create a NN model\n",
        "\"\"\"\n",
        "\n",
        "class BOW(nn.Module):\n",
        "  \"\"\"Bag-of-words classifier: one linear layer followed by log-softmax.\"\"\"\n",
        "\n",
        "  def __init__(self, num_labels, vocab_size):\n",
        "    \"\"\"Create the linear layer mapping vocab_size inputs to num_labels outputs.\"\"\"\n",
        "    super().__init__()\n",
        "    self.linear = nn.Linear(in_features=vocab_size, out_features=num_labels)\n",
        "\n",
        "  def forward(self, bow_vec):\n",
        "    \"\"\"Return log-probabilities over the labels, normalized along dim 1.\"\"\"\n",
        "    logits = self.linear(bow_vec)\n",
        "    return F.log_softmax(logits, dim=1)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "dvEc4zQv-vnS",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def make_bow_vec(sentence, word_2_indx):\n",
        "  \"\"\"Return a (1, len(word_2_indx)) tensor holding the count of each vocabulary word.\n",
        "\n",
        "  The sentence is split on whitespace; words missing from word_2_indx are ignored.\n",
        "  \"\"\"\n",
        "  counts = torch.zeros(len(word_2_indx))\n",
        "  for token in sentence.split():\n",
        "    position = word_2_indx.get(token)\n",
        "    if position is None:\n",
        "      continue\n",
        "    counts[position] += 1\n",
        "\n",
        "  return counts.unsqueeze(0)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1Jka_JGN_O0P",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def make_target(label, label_2_indx):\n",
        "  \"\"\"Look up the string form of label in label_2_indx and wrap the index in a 1-element LongTensor.\"\"\"\n",
        "  key = \"%s\" % label\n",
        "  class_indx = label_2_indx[key]\n",
        "  return torch.LongTensor([class_indx])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "bUbgIStG_hAN",
        "colab_type": "code",
        "outputId": "87eba8a8-e675-40fa-d7c2-cdda1360aa8e",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 71
        }
      },
      "source": [
        "model = BOW(num_labels=num_labels, vocab_size=vocab_size)\n",
        "model.to(device)"
      ],
      "execution_count": 38,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "BOW(\n",
              "  (linear): Linear(in_features=18278, out_features=5, bias=True)\n",
              ")"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 38
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "fLyacxxL_q5l",
        "colab_type": "code",
        "outputId": "b66f57da-d4c0-4ba6-8240-c1bc5df00f75",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 197
        }
      },
      "source": [
        "for param in model.parameters():\n",
        "  print(param)"
      ],
      "execution_count": 39,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Parameter containing:\n",
            "tensor([[ 0.0070,  0.0040,  0.0071,  ..., -0.0038, -0.0010, -0.0032],\n",
            "        [ 0.0048, -0.0052, -0.0025,  ..., -0.0071, -0.0069,  0.0045],\n",
            "        [-0.0047,  0.0037, -0.0050,  ..., -0.0037,  0.0021, -0.0046],\n",
            "        [-0.0008,  0.0028,  0.0054,  ...,  0.0034, -0.0069, -0.0021],\n",
            "        [ 0.0045, -0.0035,  0.0029,  ...,  0.0064, -0.0046, -0.0049]],\n",
            "       device='cuda:0', requires_grad=True)\n",
            "Parameter containing:\n",
            "tensor([-0.0054, -0.0062, -0.0034,  0.0044,  0.0022], device='cuda:0',\n",
            "       requires_grad=True)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ynemL_Tw_xvT",
        "colab_type": "code",
        "outputId": "9104553d-1138-4aad-d0bc-67b0d2e5df5c",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "with torch.no_grad():\n",
        "  sample = train_data[\"text\"].iloc[0]\n",
        "  bow_vector = make_bow_vec(sample, word_2_indx).to(device)\n",
        "  log_probs = model(bow_vector)\n",
        "  print(log_probs)"
      ],
      "execution_count": 40,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "tensor([[-1.6320, -1.6246, -1.6060, -1.5794, -1.6061]], device='cuda:0')\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "XXxIOLYRAqZ4",
        "colab_type": "code",
        "outputId": "073bfa83-bfef-4de2-a7b3-62b7e14ba1ed",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "set(train_data[\"label\"].values)"
      ],
      "execution_count": 41,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{0, 1, 2, 3, 4}"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 41
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "iuMVzCFtAwFU",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Labels are the integers 0..4; make_target looks them up by their string form.\n",
        "label_2_indx = {str(indx): indx for indx in range(5)}"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Hc2Yk-_cA5Ru",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "loss_function = nn.NLLLoss()\n",
        "optimizer = optim.Adam(model.parameters(), lr=0.1)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "K5vxYzVDBG8-",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "data = train_data[[\"text\",\"label\"]].values\n",
        "t_data = test_data[[\"text\",\"label\"]].values"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "q63y2MJ4BONd",
        "colab_type": "code",
        "outputId": "c495274a-d81a-48a3-b412-b8b7b7c2555b",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 73
        }
      },
      "source": [
        "data[0]"
      ],
      "execution_count": 50,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([\" The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .\",\n",
              "       3], dtype=object)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 50
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "9eJf2Rm-BPJ2",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import time\n",
        "import numpy as np"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "my0ln4sVBTA7",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 755
        },
        "outputId": "9b30d740-e407-43e5-ae32-5046e13a00cf"
      },
      "source": [
        "# Train for 10 epochs, taking one optimizer step per training sentence, and\n",
        "# report the mean training loss and the test-set accuracy after every epoch.\n",
        "# The clock is started once, before the first epoch, so the final line reports\n",
        "# the duration of the whole run rather than of the last epoch only.\n",
        "start = time.time()\n",
        "\n",
        "for epoch in range(10):\n",
        "  total_loss = 0\n",
        "\n",
        "  for text, label in data:\n",
        "    # gradients accumulate across backward() calls, so clear them first\n",
        "    model.zero_grad()\n",
        "\n",
        "    # make bow vector and target\n",
        "    bow_vec = make_bow_vec(text, word_2_indx).to(device)\n",
        "    target = make_target(label, label_2_indx).to(device)\n",
        "\n",
        "    # run the forward pass\n",
        "    prediction = model(bow_vec)\n",
        "\n",
        "    # compute the loss, gradients, and update the parameters\n",
        "    loss = loss_function(prediction, target)\n",
        "    total_loss += loss.item()\n",
        "    loss.backward()\n",
        "    optimizer.step()\n",
        "\n",
        "  # testing accuracy: switch to eval mode once and skip gradient tracking\n",
        "  model.eval()\n",
        "  test_correct = 0.0\n",
        "  with torch.no_grad():\n",
        "    for text, label in t_data:\n",
        "      bow_vec = make_bow_vec(text, word_2_indx).to(device)\n",
        "      predict = model(bow_vec).argmax(dim=1).item()\n",
        "\n",
        "      if predict == label:\n",
        "        test_correct += 1\n",
        "  model.train()  # back to training mode for the next epoch\n",
        "\n",
        "  print(\"Epoch {}\".format(epoch))\n",
        "  print(\"Loss {}\".format(total_loss/len(data)))\n",
        "  print(\"Testing Accuracy {}\".format(test_correct/len(t_data)))\n",
        "  print(\"------\")\n",
        "\n",
        "\n",
        "# the placeholder is now actually filled in (it used to be printed literally)\n",
        "print(\"Time Take : {}\".format(time.time()-start))"
      ],
      "execution_count": 52,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Epoch 0\n",
            "Loss 8.314135334722788\n",
            "Testing Accuracy 0.3180995475113122\n",
            "------\n",
            "Epoch 1\n",
            "Loss 3.1771007349212352\n",
            "Testing Accuracy 0.33167420814479637\n",
            "------\n",
            "Epoch 2\n",
            "Loss 1.8018128684188208\n",
            "Testing Accuracy 0.3257918552036199\n",
            "------\n",
            "Epoch 3\n",
            "Loss 1.2535895682724842\n",
            "Testing Accuracy 0.3176470588235294\n",
            "------\n",
            "Epoch 4\n",
            "Loss 0.9204622364780876\n",
            "Testing Accuracy 0.32760180995475113\n",
            "------\n",
            "Epoch 5\n",
            "Loss 0.6450639376591207\n",
            "Testing Accuracy 0.3239819004524887\n",
            "------\n",
            "Epoch 6\n",
            "Loss 0.5178571962908413\n",
            "Testing Accuracy 0.3095022624434389\n",
            "------\n",
            "Epoch 7\n",
            "Loss 0.3698097968592626\n",
            "Testing Accuracy 0.31040723981900453\n",
            "------\n",
            "Epoch 8\n",
            "Loss 0.32668769638636586\n",
            "Testing Accuracy 0.304524886877828\n",
            "------\n",
            "Epoch 9\n",
            "Loss 0.29391382461630006\n",
            "Testing Accuracy 0.31085972850678734\n",
            "------\n",
            "Time Take : {} 11.080454587936401\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zgNN54tnDUcw",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "Lec1-BOW.ipynb",
	"version": "0.3.2",
	"provenance": [],
	"collapsed_sections": []
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"accelerator": "GPU"
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "odL47DwX36L-",
	"colab_type": "text"
	},
	"source": [
	"## Intro to Neural Networks for NLP\n",
	"\n",
	"Labels = {\n",
	" \"very_bad\":0,\n",
	" \"bad\":1,\n",
	" \"neutral\":2,\n",
	" \"good\":3,\n",
	" \"very_good\":4\n",
	"}"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "53ztcfps3t-N",
	"colab_type": "code",
	"outputId": "c3ce8900-7352-4998-941e-9abecea760b5",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 35
	}
	},
	"source": [
	"\"\"\"\n",
	"mount google drive, change directory and download the dataset\n",
	"\"\"\"\n",
	"\n",
	"from google.colab import drive\n",
	"drive.mount(\"/content/drive\")"
	],
	"execution_count": 27,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "LT8yhQUe4oqU",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"import os\n",
	"\n",
	"# Absolute path under the mount point passed to drive.mount(\"/content/drive\").\n",
	"# The relative form (\"./drive/...\") only resolves on the first run; once the\n",
	"# working directory has moved into Drive, re-running the cell would raise.\n",
	"os.chdir(\"/content/drive/My Drive/CMUNN4NLP\")"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "bs0N3kru5dXk",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"\"\"\"\n",
	"download the files\n",
	"\"\"\"\n",
	"# !wget -c https://github.com/neubig/nn4nlp-code/raw/master/data/classes/dev.txt\n",
	"# !wget -c https://github.com/neubig/nn4nlp-code/raw/master/data/classes/test.txt\n",
	"# !wget -c https://github.com/neubig/nn4nlp-code/raw/master/data/classes/train.txt"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "VVeUmUUw55Vs",
	"colab_type": "text"
	},
	"source": [
	"## Implementation"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "auLTNJX55ypW",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"import torch\n",
	"import torch.nn as nn\n",
	"import torch.nn.functional as F\n",
	"import torch.optim as optim"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "_QyEUk6P6D_g",
	"colab_type": "code",
	"outputId": "d4f24e52-01b0-409f-ac0f-29de3424a185",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 35
	}
	},
	"source": [
	"use_cuda = True\n",
	"print(\"CUDA Available : \", torch.cuda.is_available())\n",
	"device = torch.device(\"cuda\" if (use_cuda and torch.cuda.is_available()) else \"cpu\")"
	],
	"execution_count": 30,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"CUDA Available : True\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "zqKy9rz26Zic",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"# Read the data files. Every line is split on \"|\" into four fields; the two\n",
	"# middle fields are placeholders and are dropped, leaving label and text.\n",
	"# (The delimiter is a plain \"|\": a backslash before it is not a valid JSON escape.)\n",
	"import pandas as pd\n",
	"\n",
	"train_data = pd.read_csv(\"./train.txt\", delimiter=\"|\", names=['label','remove','removel','text']).drop([\"remove\",\"removel\"], axis=1)\n",
	"test_data = pd.read_csv(\"./test.txt\", delimiter=\"|\", names=['label','remove','removel','text']).drop([\"remove\",\"removel\"], axis=1)"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "q6eNvg_Q6yKz",
	"colab_type": "code",
	"outputId": "cced2e27-c47a-40fa-8c8e-a4e1531a7a65",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 204
	}
	},
	"source": [
	"train_data.head()"
	],
	"execution_count": 32,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>label</th>\n",
	" <th>text</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>3</td>\n",
	" <td>The Rock is destined to be the 21st Century '...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>4</td>\n",
	" <td>The gorgeously elaborate continuation of `` T...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>3</td>\n",
	" <td>Singer\\/composer Bryan Adams contributes a sl...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>2</td>\n",
	" <td>You 'd think by now America would have had en...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>3</td>\n",
	" <td>Yet the act is still charming here .</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" label text\n",
	"0 3 The Rock is destined to be the 21st Century '...\n",
	"1 4 The gorgeously elaborate continuation of `` T...\n",
	"2 3 Singer\\/composer Bryan Adams contributes a sl...\n",
	"3 2 You 'd think by now America would have had en...\n",
	"4 3 Yet the act is still charming here ."
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 32
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "G_1m48hB665J",
	"colab_type": "code",
	"outputId": "91361f5f-bc4e-461c-d9d2-6493b21fcd95",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 35
	}
	},
	"source": [
	"# Create the word -> index vocabulary dictionary from the training text.\n",
	"# The unique tokens are sorted before being numbered so that every word gets\n",
	"# the same index on every run: the iteration order of a set of strings is not\n",
	"# stable across kernel restarts because of Python's string hash randomization.\n",
	"sentences = \" \".join(train_data.text.values)\n",
	"\n",
	"word_2_indx = {word: indx for indx, word in enumerate(sorted(set(sentences.split())))}\n",
	"print(\"Vocab Size : {}\".format(len(word_2_indx)))"
	],
	"execution_count": 33,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Vocab Size : 18278\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "oZduu1FL7zii",
	"colab_type": "code",
	"outputId": "7865f5d0-ee23-4644-ef5a-4e1a7254095a",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 53
	}
	},
	"source": [
	"vocab_size = len(word_2_indx)\n",
	"num_labels = len(set(train_data[\"label\"].values))\n",
	"print(\"Vocab Size : \", vocab_size)\n",
	"print(\"No. Labels : \", num_labels)"
	],
	"execution_count": 34,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Vocab Size : 18278\n",
	"No. Labels : 5\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "Clw37c5E8Ihs",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"\"\"\"\n",
	"Create a NN model\n",
	"\"\"\"\n",
	"\n",
	"class BOW(nn.Module):\n",
	"  \"\"\"Bag-of-words classifier: one linear layer followed by log-softmax.\"\"\"\n",
	"\n",
	"  def __init__(self, num_labels, vocab_size):\n",
	"    \"\"\"Create the linear layer mapping vocab_size inputs to num_labels outputs.\"\"\"\n",
	"    super(BOW, self).__init__()\n",
	"\n",
	"    self.linear = nn.Linear(vocab_size, num_labels)\n",
	"\n",
	"  def forward(self, bow_vec):\n",
	"    \"\"\"Return log-probabilities over the labels, normalized along dim 1.\"\"\"\n",
	"    return F.log_softmax(self.linear(bow_vec), dim=1)"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "dvEc4zQv-vnS",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"def make_bow_vec(sentence, word_2_indx):\n",
	"  \"\"\"Return a (1, len(word_2_indx)) tensor holding the count of each vocabulary word.\n",
	"\n",
	"  The sentence is split on whitespace; words missing from word_2_indx are ignored.\n",
	"  \"\"\"\n",
	"  vec = torch.zeros(len(word_2_indx))\n",
	"  for word in sentence.split():\n",
	"    if word in word_2_indx:\n",
	"      vec[word_2_indx[word]] += 1\n",
	"\n",
	"  return vec.view(1,-1)"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "1Jka_JGN_O0P",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"def make_target(label, label_2_indx):\n",
	"  \"\"\"Look up the string form of label in label_2_indx and wrap the index in a 1-element LongTensor.\"\"\"\n",
	"  return torch.LongTensor([label_2_indx[\"%s\"%label]])"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "bUbgIStG_hAN",
	"colab_type": "code",
	"outputId": "87eba8a8-e675-40fa-d7c2-cdda1360aa8e",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 71
	}
	},
	"source": [
	"model = BOW(num_labels=num_labels, vocab_size=vocab_size)\n",
	"model.to(device)"
	],
	"execution_count": 38,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"BOW(\n",
	" (linear): Linear(in_features=18278, out_features=5, bias=True)\n",
	")"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 38
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "fLyacxxL_q5l",
	"colab_type": "code",
	"outputId": "b66f57da-d4c0-4ba6-8240-c1bc5df00f75",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 197
	}
	},
	"source": [
	"# print every parameter tensor of the model\n",
	"for param in model.parameters():\n",
	"  print(param)"
	],
	"execution_count": 39,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Parameter containing:\n",
	"tensor([[ 0.0070, 0.0040, 0.0071, ..., -0.0038, -0.0010, -0.0032],\n",
	" [ 0.0048, -0.0052, -0.0025, ..., -0.0071, -0.0069, 0.0045],\n",
	" [-0.0047, 0.0037, -0.0050, ..., -0.0037, 0.0021, -0.0046],\n",
	" [-0.0008, 0.0028, 0.0054, ..., 0.0034, -0.0069, -0.0021],\n",
	" [ 0.0045, -0.0035, 0.0029, ..., 0.0064, -0.0046, -0.0049]],\n",
	" device='cuda:0', requires_grad=True)\n",
	"Parameter containing:\n",
	"tensor([-0.0054, -0.0062, -0.0034, 0.0044, 0.0022], device='cuda:0',\n",
	" requires_grad=True)\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "ynemL_Tw_xvT",
	"colab_type": "code",
	"outputId": "9104553d-1138-4aad-d0bc-67b0d2e5df5c",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 35
	}
	},
	"source": [
	"# run the first training sentence through the model without tracking gradients\n",
	"# and print the resulting log-probabilities\n",
	"with torch.no_grad():\n",
	"  sample = train_data[\"text\"].iloc[0]\n",
	"  bow_vector = make_bow_vec(sample, word_2_indx).to(device)\n",
	"  log_probs = model(bow_vector)\n",
	"  print(log_probs)"
	],
	"execution_count": 40,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"tensor([[-1.6320, -1.6246, -1.6060, -1.5794, -1.6061]], device='cuda:0')\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "XXxIOLYRAqZ4",
	"colab_type": "code",
	"outputId": "073bfa83-bfef-4de2-a7b3-62b7e14ba1ed",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 35
	}
	},
	"source": [
	"set(train_data[\"label\"].values)"
	],
	"execution_count": 41,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"{0, 1, 2, 3, 4}"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 41
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "iuMVzCFtAwFU",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"label_2_indx = {\"0\":0,\"1\":1,\"2\":2,\"3\":3,\"4\":4}"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "Hc2Yk-_cA5Ru",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"loss_function = nn.NLLLoss()\n",
	"optimizer = optim.Adam(model.parameters(), lr=0.1)"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "K5vxYzVDBG8-",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"data = train_data[[\"text\",\"label\"]].values\n",
	"t_data = test_data[[\"text\",\"label\"]].values"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "q63y2MJ4BONd",
	"colab_type": "code",
	"outputId": "c495274a-d81a-48a3-b412-b8b7b7c2555b",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 73
	}
	},
	"source": [
	"data[0]"
	],
	"execution_count": 50,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"array([\" The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .\",\n",
	" 3], dtype=object)"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 50
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "9eJf2Rm-BPJ2",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"import time\n",
	"import numpy as np"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "my0ln4sVBTA7",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 755
	},
	"outputId": "9b30d740-e407-43e5-ae32-5046e13a00cf"
	},
	"source": [
	"# Train for 10 epochs, taking one optimizer step per training sentence, and\n",
	"# report the mean training loss and the test-set accuracy after every epoch.\n",
	"# The clock is started once, before the first epoch, so the final line reports\n",
	"# the duration of the whole run rather than of the last epoch only.\n",
	"start = time.time()\n",
	"\n",
	"for epoch in range(10):\n",
	"  total_loss = 0\n",
	"\n",
	"  for text, label in data:\n",
	"    # gradients accumulate across backward() calls, so clear them first\n",
	"    model.zero_grad()\n",
	"\n",
	"    # make bow vector and target\n",
	"    bow_vec = make_bow_vec(text, word_2_indx).to(device)\n",
	"    target = make_target(label, label_2_indx).to(device)\n",
	"\n",
	"    # run the forward pass\n",
	"    prediction = model(bow_vec)\n",
	"\n",
	"    # compute the loss, gradients, and update the parameters\n",
	"    loss = loss_function(prediction, target)\n",
	"    total_loss += loss.item()\n",
	"    loss.backward()\n",
	"    optimizer.step()\n",
	"\n",
	"  # testing accuracy: switch to eval mode once and skip gradient tracking\n",
	"  model.eval()\n",
	"  test_correct = 0.0\n",
	"  with torch.no_grad():\n",
	"    for text, label in t_data:\n",
	"      bow_vec = make_bow_vec(text, word_2_indx).to(device)\n",
	"      predict = model(bow_vec).argmax(dim=1).item()\n",
	"\n",
	"      if predict == label:\n",
	"        test_correct += 1\n",
	"  model.train()  # back to training mode for the next epoch\n",
	"\n",
	"  print(\"Epoch {}\".format(epoch))\n",
	"  print(\"Loss {}\".format(total_loss/len(data)))\n",
	"  print(\"Testing Accuracy {}\".format(test_correct/len(t_data)))\n",
	"  print(\"------\")\n",
	"\n",
	"\n",
	"# the placeholder is now actually filled in (it used to be printed literally)\n",
	"print(\"Time Take : {}\".format(time.time()-start))"
	],
	"execution_count": 52,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Epoch 0\n",
	"Loss 8.314135334722788\n",
	"Testing Accuracy 0.3180995475113122\n",
	"------\n",
	"Epoch 1\n",
	"Loss 3.1771007349212352\n",
	"Testing Accuracy 0.33167420814479637\n",
	"------\n",
	"Epoch 2\n",
	"Loss 1.8018128684188208\n",
	"Testing Accuracy 0.3257918552036199\n",
	"------\n",
	"Epoch 3\n",
	"Loss 1.2535895682724842\n",
	"Testing Accuracy 0.3176470588235294\n",
	"------\n",
	"Epoch 4\n",
	"Loss 0.9204622364780876\n",
	"Testing Accuracy 0.32760180995475113\n",
	"------\n",
	"Epoch 5\n",
	"Loss 0.6450639376591207\n",
	"Testing Accuracy 0.3239819004524887\n",
	"------\n",
	"Epoch 6\n",
	"Loss 0.5178571962908413\n",
	"Testing Accuracy 0.3095022624434389\n",
	"------\n",
	"Epoch 7\n",
	"Loss 0.3698097968592626\n",
	"Testing Accuracy 0.31040723981900453\n",
	"------\n",
	"Epoch 8\n",
	"Loss 0.32668769638636586\n",
	"Testing Accuracy 0.304524886877828\n",
	"------\n",
	"Epoch 9\n",
	"Loss 0.29391382461630006\n",
	"Testing Accuracy 0.31085972850678734\n",
	"------\n",
	"Time Take : {} 11.080454587936401\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "zgNN54tnDUcw",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	""
	],
	"execution_count": 0,
	"outputs": []
	}
	]
	}