Skip to content

Instantly share code, notes, and snippets.

@myselfHimanshu
Created August 14, 2019 04:25
Show Gist options
  • Save myselfHimanshu/92c7a5d0352364accf3a1959338fbfe9 to your computer and use it in GitHub Desktop.
Save myselfHimanshu/92c7a5d0352364accf3a1959338fbfe9 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Lec1-BOW.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "odL47DwX36L-",
"colab_type": "text"
},
"source": [
"## Intro to Neural Networks for NLP\n",
"\n",
"Labels = {\n",
" \"very_bad\":0,\n",
" \"bad\":1,\n",
" \"neutral\":2,\n",
" \"good\":3,\n",
" \"very_good\":4\n",
"}"
]
},
{
"cell_type": "code",
"metadata": {
"id": "53ztcfps3t-N",
"colab_type": "code",
"outputId": "c3ce8900-7352-4998-941e-9abecea760b5",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"source": [
"\"\"\"\n",
"mount google drive, change directory and download the dataset\n",
"\"\"\"\n",
"\n",
"from google.colab import drive\n",
"drive.mount(\"/content/drive\")"
],
"execution_count": 27,
"outputs": [
{
"output_type": "stream",
"text": [
"Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "LT8yhQUe4oqU",
"colab_type": "code",
"colab": {}
},
"source": [
"import os\n",
"os.chdir(\"./drive/My Drive/CMUNN4NLP\")"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "bs0N3kru5dXk",
"colab_type": "code",
"colab": {}
},
"source": [
"\"\"\"\n",
"download the files\n",
"\"\"\"\n",
"# !wget -c https://github.com/neubig/nn4nlp-code/raw/master/data/classes/dev.txt\n",
"# !wget -c https://github.com/neubig/nn4nlp-code/raw/master/data/classes/test.txt\n",
"# !wget -c https://github.com/neubig/nn4nlp-code/raw/master/data/classes/train.txt"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "VVeUmUUw55Vs",
"colab_type": "text"
},
"source": [
"## Implementation"
]
},
{
"cell_type": "code",
"metadata": {
"id": "auLTNJX55ypW",
"colab_type": "code",
"colab": {}
},
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "_QyEUk6P6D_g",
"colab_type": "code",
"outputId": "d4f24e52-01b0-409f-ac0f-29de3424a185",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"source": [
"use_cuda = True\n",
"print(\"CUDA Available : \", torch.cuda.is_available())\n",
"device = torch.device(\"cuda\" if (use_cuda and torch.cuda.is_available()) else \"cpu\")"
],
"execution_count": 30,
"outputs": [
{
"output_type": "stream",
"text": [
"CUDA Available : True\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "zqKy9rz26Zic",
"colab_type": "code",
"colab": {}
},
"source": [
"#read the datafiles\n",
"import pandas as pd\n",
"\n",
"train_data = pd.read_csv(\"./train.txt\", delimiter=\"|\", names=['label','remove','removel','text']).drop([\"remove\",\"removel\"], axis=1)\n",
"test_data = pd.read_csv(\"./test.txt\", delimiter=\"|\", names=['label','remove','removel','text']).drop([\"remove\",\"removel\"], axis=1)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "q6eNvg_Q6yKz",
"colab_type": "code",
"outputId": "cced2e27-c47a-40fa-8c8e-a4e1531a7a65",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
}
},
"source": [
"train_data.head()"
],
"execution_count": 32,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>The Rock is destined to be the 21st Century '...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4</td>\n",
" <td>The gorgeously elaborate continuation of `` T...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Singer\\/composer Bryan Adams contributes a sl...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>You 'd think by now America would have had en...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3</td>\n",
" <td>Yet the act is still charming here .</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" label text\n",
"0 3 The Rock is destined to be the 21st Century '...\n",
"1 4 The gorgeously elaborate continuation of `` T...\n",
"2 3 Singer\\/composer Bryan Adams contributes a sl...\n",
"3 2 You 'd think by now America would have had en...\n",
"4 3 Yet the act is still charming here ."
]
},
"metadata": {
"tags": []
},
"execution_count": 32
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "G_1m48hB665J",
"colab_type": "code",
"outputId": "91361f5f-bc4e-461c-d9d2-6493b21fcd95",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"source": [
"\"\"\"\n",
"create word to index vocabulary dictionary\n",
"\"\"\"\n",
"\n",
"word_2_indx = {}\n",
"sentences = \" \".join(train_data.text.values)\n",
"\n",
"word_2_indx = dict([(y,x) for x,y in enumerate(set(sentences.split()))])\n",
"print(\"Vocab Size : {}\".format(len(word_2_indx)))"
],
"execution_count": 33,
"outputs": [
{
"output_type": "stream",
"text": [
"Vocab Size : 18278\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "oZduu1FL7zii",
"colab_type": "code",
"outputId": "7865f5d0-ee23-4644-ef5a-4e1a7254095a",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 53
}
},
"source": [
"vocab_size = len(word_2_indx)\n",
"num_labels = len(set(train_data[\"label\"].values))\n",
"print(\"Vocab Size : \", vocab_size)\n",
"print(\"No. Labels : \", num_labels)"
],
"execution_count": 34,
"outputs": [
{
"output_type": "stream",
"text": [
"Vocab Size : 18278\n",
"No. Labels : 5\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Clw37c5E8Ihs",
"colab_type": "code",
"colab": {}
},
"source": [
"\"\"\"\n",
"Create a NN model\n",
"\"\"\"\n",
"\n",
"class BOW(nn.Module):\n",
"\n",
" def __init__(self, num_labels, vocab_size):\n",
" super(BOW, self).__init__()\n",
"\n",
" self.linear = nn.Linear(vocab_size, num_labels)\n",
"\n",
" def forward(self, bow_vec):\n",
" return F.log_softmax(self.linear(bow_vec), dim=1)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "dvEc4zQv-vnS",
"colab_type": "code",
"colab": {}
},
"source": [
"def make_bow_vec(sentence, word_2_indx):\n",
" vec = torch.zeros(len(word_2_indx))\n",
" for word in sentence.split():\n",
" if word in word_2_indx:\n",
" vec[word_2_indx[word]] += 1\n",
"\n",
" return vec.view(1,-1)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "1Jka_JGN_O0P",
"colab_type": "code",
"colab": {}
},
"source": [
"def make_target(label, label_2_indx):\n",
" return torch.LongTensor([label_2_indx[\"%s\"%label]])"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "bUbgIStG_hAN",
"colab_type": "code",
"outputId": "87eba8a8-e675-40fa-d7c2-cdda1360aa8e",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 71
}
},
"source": [
"model = BOW(num_labels=num_labels, vocab_size=vocab_size)\n",
"model.to(device)"
],
"execution_count": 38,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"BOW(\n",
" (linear): Linear(in_features=18278, out_features=5, bias=True)\n",
")"
]
},
"metadata": {
"tags": []
},
"execution_count": 38
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "fLyacxxL_q5l",
"colab_type": "code",
"outputId": "b66f57da-d4c0-4ba6-8240-c1bc5df00f75",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 197
}
},
"source": [
"for param in model.parameters():\n",
" print(param)"
],
"execution_count": 39,
"outputs": [
{
"output_type": "stream",
"text": [
"Parameter containing:\n",
"tensor([[ 0.0070, 0.0040, 0.0071, ..., -0.0038, -0.0010, -0.0032],\n",
" [ 0.0048, -0.0052, -0.0025, ..., -0.0071, -0.0069, 0.0045],\n",
" [-0.0047, 0.0037, -0.0050, ..., -0.0037, 0.0021, -0.0046],\n",
" [-0.0008, 0.0028, 0.0054, ..., 0.0034, -0.0069, -0.0021],\n",
" [ 0.0045, -0.0035, 0.0029, ..., 0.0064, -0.0046, -0.0049]],\n",
" device='cuda:0', requires_grad=True)\n",
"Parameter containing:\n",
"tensor([-0.0054, -0.0062, -0.0034, 0.0044, 0.0022], device='cuda:0',\n",
" requires_grad=True)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ynemL_Tw_xvT",
"colab_type": "code",
"outputId": "9104553d-1138-4aad-d0bc-67b0d2e5df5c",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"source": [
"with torch.no_grad():\n",
" sample = train_data[\"text\"].iloc[0]\n",
" bow_vector = make_bow_vec(sample, word_2_indx).to(device)\n",
" log_probs = model(bow_vector)\n",
" print(log_probs)"
],
"execution_count": 40,
"outputs": [
{
"output_type": "stream",
"text": [
"tensor([[-1.6320, -1.6246, -1.6060, -1.5794, -1.6061]], device='cuda:0')\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "XXxIOLYRAqZ4",
"colab_type": "code",
"outputId": "073bfa83-bfef-4de2-a7b3-62b7e14ba1ed",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"source": [
"set(train_data[\"label\"].values)"
],
"execution_count": 41,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{0, 1, 2, 3, 4}"
]
},
"metadata": {
"tags": []
},
"execution_count": 41
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "iuMVzCFtAwFU",
"colab_type": "code",
"colab": {}
},
"source": [
"label_2_indx = {\"0\":0,\"1\":1,\"2\":2,\"3\":3,\"4\":4}"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Hc2Yk-_cA5Ru",
"colab_type": "code",
"colab": {}
},
"source": [
"loss_function = nn.NLLLoss()\n",
"optimizer = optim.Adam(model.parameters(), lr=0.1)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "K5vxYzVDBG8-",
"colab_type": "code",
"colab": {}
},
"source": [
"data = train_data[[\"text\",\"label\"]].values\n",
"t_data = test_data[[\"text\",\"label\"]].values"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "q63y2MJ4BONd",
"colab_type": "code",
"outputId": "c495274a-d81a-48a3-b412-b8b7b7c2555b",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 73
}
},
"source": [
"data[0]"
],
"execution_count": 50,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([\" The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .\",\n",
" 3], dtype=object)"
]
},
"metadata": {
"tags": []
},
"execution_count": 50
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "9eJf2Rm-BPJ2",
"colab_type": "code",
"colab": {}
},
"source": [
"import time\n",
"import numpy as np"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "my0ln4sVBTA7",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 755
},
"outputId": "9b30d740-e407-43e5-ae32-5046e13a00cf"
},
"source": [
"for epoch in range(10):\n",
" start = time.time()\n",
"\n",
" total_loss = 0\n",
"\n",
" for instance in data:\n",
" text = instance[0]\n",
" label = instance[1]\n",
"\n",
" #pytorch accumulates data clear it out\n",
" model.zero_grad()\n",
"\n",
" #make bow vector\n",
" bow_vec = make_bow_vec(text, word_2_indx).to(device)\n",
" target = make_target(label, label_2_indx).to(device)\n",
"\n",
" #run the forward pass\n",
" prediction = model(bow_vec)\n",
"\n",
" #compute the loss, gradients, and update the parameters\n",
" loss = loss_function(prediction, target)\n",
" total_loss += loss.item()\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
"\n",
" #testing accuracy\n",
" test_correct = 0.0\n",
" for instance in t_data:\n",
" model.eval()\n",
" text = instance[0]\n",
" label = instance[1]\n",
"\n",
" #Make BOW vector\n",
" bow_vec = make_bow_vec(text, word_2_indx).to(device)\n",
" target = make_target(label, label_2_indx).to(device)\n",
"\n",
" scores = model(bow_vec).detach().cpu().numpy()\n",
"\n",
" predict = np.argmax(scores)\n",
"\n",
" if predict==label:\n",
" test_correct += 1\n",
"\n",
" print(\"Epoch {}\".format(epoch))\n",
" print(\"Loss {}\".format(total_loss/len(data)))\n",
" print(\"Testing Accuracy {}\".format(test_correct/len(t_data)))\n",
" print(\"------\")\n",
" model.train()\n",
"\n",
"\n",
"print(\"Time Take : {}\", time.time()-start)"
],
"execution_count": 52,
"outputs": [
{
"output_type": "stream",
"text": [
"Epoch 0\n",
"Loss 8.314135334722788\n",
"Testing Accuracy 0.3180995475113122\n",
"------\n",
"Epoch 1\n",
"Loss 3.1771007349212352\n",
"Testing Accuracy 0.33167420814479637\n",
"------\n",
"Epoch 2\n",
"Loss 1.8018128684188208\n",
"Testing Accuracy 0.3257918552036199\n",
"------\n",
"Epoch 3\n",
"Loss 1.2535895682724842\n",
"Testing Accuracy 0.3176470588235294\n",
"------\n",
"Epoch 4\n",
"Loss 0.9204622364780876\n",
"Testing Accuracy 0.32760180995475113\n",
"------\n",
"Epoch 5\n",
"Loss 0.6450639376591207\n",
"Testing Accuracy 0.3239819004524887\n",
"------\n",
"Epoch 6\n",
"Loss 0.5178571962908413\n",
"Testing Accuracy 0.3095022624434389\n",
"------\n",
"Epoch 7\n",
"Loss 0.3698097968592626\n",
"Testing Accuracy 0.31040723981900453\n",
"------\n",
"Epoch 8\n",
"Loss 0.32668769638636586\n",
"Testing Accuracy 0.304524886877828\n",
"------\n",
"Epoch 9\n",
"Loss 0.29391382461630006\n",
"Testing Accuracy 0.31085972850678734\n",
"------\n",
"Time Take : {} 11.080454587936401\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "zgNN54tnDUcw",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 0,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment