Created
August 14, 2019 04:25
-
-
Save myselfHimanshu/92c7a5d0352364accf3a1959338fbfe9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Lec1-BOW.ipynb", | |
"version": "0.3.2", | |
"provenance": [], | |
"collapsed_sections": [] | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"accelerator": "GPU" | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "odL47DwX36L-", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## Intro to Neural Networks for NLP\n", | |
"\n", | |
"Labels = {\n", | |
" \"very_bad\":0,\n", | |
" \"bad\":1,\n", | |
" \"neutral\":2,\n", | |
" \"good\":3,\n", | |
" \"very_good\":4\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "53ztcfps3t-N", | |
"colab_type": "code", | |
"outputId": "c3ce8900-7352-4998-941e-9abecea760b5", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
} | |
}, | |
"source": [ | |
"\"\"\"\n", | |
"mount google drive, change directory and download the dataset\n", | |
"\"\"\"\n", | |
"\n", | |
"from google.colab import drive\n", | |
"drive.mount(\"/content/drive\")" | |
], | |
"execution_count": 27, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "LT8yhQUe4oqU", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import os\n", | |
"\n", | |
"# Google Drive is mounted at /content/drive, so address the project folder by its\n", | |
"# absolute path. A path relative to the working directory stops resolving as soon\n", | |
"# as the directory has changed, which makes re-running this cell raise\n", | |
"# FileNotFoundError.\n", | |
"os.chdir(\"/content/drive/My Drive/CMUNN4NLP\")" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "bs0N3kru5dXk", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"\"\"\"\n", | |
"download the files\n", | |
"\"\"\"\n", | |
"# !wget -c https://github.com/neubig/nn4nlp-code/raw/master/data/classes/dev.txt\n", | |
"# !wget -c https://github.com/neubig/nn4nlp-code/raw/master/data/classes/test.txt\n", | |
"# !wget -c https://github.com/neubig/nn4nlp-code/raw/master/data/classes/train.txt" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "VVeUmUUw55Vs", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## Implementation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "auLTNJX55ypW", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import torch\n", | |
"import torch.nn as nn\n", | |
"import torch.nn.functional as F\n", | |
"import torch.optim as optim" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "_QyEUk6P6D_g", | |
"colab_type": "code", | |
"outputId": "d4f24e52-01b0-409f-ac0f-29de3424a185", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
} | |
}, | |
"source": [ | |
"use_cuda = True\n", | |
"print(\"CUDA Available : \", torch.cuda.is_available())\n", | |
"device = torch.device(\"cuda\" if (use_cuda and torch.cuda.is_available()) else \"cpu\")" | |
], | |
"execution_count": 30, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"CUDA Available : True\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "zqKy9rz26Zic", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Read the data files.\n", | |
"import pandas as pd\n", | |
"\n", | |
"\n", | |
"def _read_split(path):\n", | |
"    \"\"\"Load one '|'-delimited split and keep only the label and text columns.\"\"\"\n", | |
"    columns = ['label','remove','removel','text']\n", | |
"    frame = pd.read_csv(path, delimiter=\"|\", names=columns)\n", | |
"    # The two middle columns are discarded; presumably they are the empty fields\n", | |
"    # left by a multi-character separator -- confirm against the data files.\n", | |
"    return frame.drop([\"remove\",\"removel\"], axis=1)\n", | |
"\n", | |
"\n", | |
"train_data = _read_split(\"./train.txt\")\n", | |
"test_data = _read_split(\"./test.txt\")" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "q6eNvg_Q6yKz", | |
"colab_type": "code", | |
"outputId": "cced2e27-c47a-40fa-8c8e-a4e1531a7a65", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 204 | |
} | |
}, | |
"source": [ | |
"train_data.head()" | |
], | |
"execution_count": 32, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>label</th>\n", | |
" <th>text</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>The Rock is destined to be the 21st Century '...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>4</td>\n", | |
" <td>The gorgeously elaborate continuation of `` T...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>Singer\\/composer Bryan Adams contributes a sl...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>You 'd think by now America would have had en...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>3</td>\n", | |
" <td>Yet the act is still charming here .</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" label text\n", | |
"0 3 The Rock is destined to be the 21st Century '...\n", | |
"1 4 The gorgeously elaborate continuation of `` T...\n", | |
"2 3 Singer\\/composer Bryan Adams contributes a sl...\n", | |
"3 2 You 'd think by now America would have had en...\n", | |
"4 3 Yet the act is still charming here ." | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 32 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "G_1m48hB665J", | |
"colab_type": "code", | |
"outputId": "91361f5f-bc4e-461c-d9d2-6493b21fcd95", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
} | |
}, | |
"source": [ | |
"\"\"\"\n", | |
"create word to index vocabulary dictionary\n", | |
"\"\"\"\n", | |
"\n", | |
"# Join all training sentences so the vocabulary can be collected in one split().\n", | |
"sentences = \" \".join(train_data.text.values)\n", | |
"\n", | |
"# Sort the unique tokens before numbering them: iterating a set of strings\n", | |
"# directly can yield a different order on every interpreter run, which would\n", | |
"# give each word a different index from run to run.\n", | |
"word_2_indx = {word: indx for indx, word in enumerate(sorted(set(sentences.split())))}\n", | |
"print(\"Vocab Size : {}\".format(len(word_2_indx)))" | |
], | |
"execution_count": 33, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Vocab Size : 18278\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "oZduu1FL7zii", | |
"colab_type": "code", | |
"outputId": "7865f5d0-ee23-4644-ef5a-4e1a7254095a", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 53 | |
} | |
}, | |
"source": [ | |
"vocab_size = len(word_2_indx)\n", | |
"num_labels = len(set(train_data[\"label\"].values))\n", | |
"print(\"Vocab Size : \", vocab_size)\n", | |
"print(\"No. Labels : \", num_labels)" | |
], | |
"execution_count": 34, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Vocab Size : 18278\n", | |
"No. Labels : 5\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Clw37c5E8Ihs", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"\"\"\"\n", | |
"Create a NN model\n", | |
"\"\"\"\n", | |
"\n", | |
"class BOW(nn.Module):\n", | |
"    \"\"\"A single linear layer over a bag-of-words vector, followed by log-softmax.\"\"\"\n", | |
"\n", | |
"    def __init__(self, num_labels, vocab_size):\n", | |
"        \"\"\"Create the linear projection from vocab_size inputs to num_labels outputs.\"\"\"\n", | |
"        super().__init__()\n", | |
"        self.linear = nn.Linear(in_features=vocab_size, out_features=num_labels)\n", | |
"\n", | |
"    def forward(self, bow_vec):\n", | |
"        \"\"\"Return log-probabilities for bow_vec, normalised along dimension 1.\"\"\"\n", | |
"        logits = self.linear(bow_vec)\n", | |
"        return F.log_softmax(logits, dim=1)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "dvEc4zQv-vnS", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"def make_bow_vec(sentence, word_2_indx):\n", | |
"    \"\"\"Count the known words of `sentence` into a (1, len(word_2_indx)) tensor.\n", | |
"\n", | |
"    The sentence is split on whitespace; tokens missing from `word_2_indx`\n", | |
"    are ignored.\n", | |
"    \"\"\"\n", | |
"    counts = torch.zeros(len(word_2_indx))\n", | |
"    known_positions = (word_2_indx[token] for token in sentence.split() if token in word_2_indx)\n", | |
"    for position in known_positions:\n", | |
"        counts[position] += 1\n", | |
"\n", | |
"    # Add the leading batch dimension.\n", | |
"    return counts.view(1, -1)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1Jka_JGN_O0P", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"def make_target(label, label_2_indx):\n", | |
"    \"\"\"Return a one-element int64 tensor holding the index mapped to `label`.\n", | |
"\n", | |
"    `label` is converted to its string form before the lookup in `label_2_indx`.\n", | |
"    \"\"\"\n", | |
"    key = \"%s\"%label\n", | |
"    return torch.tensor([label_2_indx[key]], dtype=torch.long)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "bUbgIStG_hAN", | |
"colab_type": "code", | |
"outputId": "87eba8a8-e675-40fa-d7c2-cdda1360aa8e", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 71 | |
} | |
}, | |
"source": [ | |
"model = BOW(num_labels=num_labels, vocab_size=vocab_size)\n", | |
"model.to(device)" | |
], | |
"execution_count": 38, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"BOW(\n", | |
" (linear): Linear(in_features=18278, out_features=5, bias=True)\n", | |
")" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 38 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "fLyacxxL_q5l", | |
"colab_type": "code", | |
"outputId": "b66f57da-d4c0-4ba6-8240-c1bc5df00f75", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 197 | |
} | |
}, | |
"source": [ | |
"for param in model.parameters():\n", | |
" print(param)" | |
], | |
"execution_count": 39, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Parameter containing:\n", | |
"tensor([[ 0.0070, 0.0040, 0.0071, ..., -0.0038, -0.0010, -0.0032],\n", | |
" [ 0.0048, -0.0052, -0.0025, ..., -0.0071, -0.0069, 0.0045],\n", | |
" [-0.0047, 0.0037, -0.0050, ..., -0.0037, 0.0021, -0.0046],\n", | |
" [-0.0008, 0.0028, 0.0054, ..., 0.0034, -0.0069, -0.0021],\n", | |
" [ 0.0045, -0.0035, 0.0029, ..., 0.0064, -0.0046, -0.0049]],\n", | |
" device='cuda:0', requires_grad=True)\n", | |
"Parameter containing:\n", | |
"tensor([-0.0054, -0.0062, -0.0034, 0.0044, 0.0022], device='cuda:0',\n", | |
" requires_grad=True)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ynemL_Tw_xvT", | |
"colab_type": "code", | |
"outputId": "9104553d-1138-4aad-d0bc-67b0d2e5df5c", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
} | |
}, | |
"source": [ | |
"with torch.no_grad():\n", | |
" sample = train_data[\"text\"].iloc[0]\n", | |
" bow_vector = make_bow_vec(sample, word_2_indx).to(device)\n", | |
" log_probs = model(bow_vector)\n", | |
" print(log_probs)" | |
], | |
"execution_count": 40, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"tensor([[-1.6320, -1.6246, -1.6060, -1.5794, -1.6061]], device='cuda:0')\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "XXxIOLYRAqZ4", | |
"colab_type": "code", | |
"outputId": "073bfa83-bfef-4de2-a7b3-62b7e14ba1ed", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
} | |
}, | |
"source": [ | |
"set(train_data[\"label\"].values)" | |
], | |
"execution_count": 41, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"{0, 1, 2, 3, 4}" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 41 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "iuMVzCFtAwFU", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"label_2_indx = {\"0\":0,\"1\":1,\"2\":2,\"3\":3,\"4\":4}" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Hc2Yk-_cA5Ru", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"loss_function = nn.NLLLoss()\n", | |
"optimizer = optim.Adam(model.parameters(), lr=0.1)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "K5vxYzVDBG8-", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"data = train_data[[\"text\",\"label\"]].values\n", | |
"t_data = test_data[[\"text\",\"label\"]].values" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "q63y2MJ4BONd", | |
"colab_type": "code", | |
"outputId": "c495274a-d81a-48a3-b412-b8b7b7c2555b", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 73 | |
} | |
}, | |
"source": [ | |
"data[0]" | |
], | |
"execution_count": 50, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([\" The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .\",\n", | |
" 3], dtype=object)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 50 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "9eJf2Rm-BPJ2", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import time\n", | |
"import numpy as np" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "my0ln4sVBTA7", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 755 | |
}, | |
"outputId": "9b30d740-e407-43e5-ae32-5046e13a00cf" | |
}, | |
"source": [ | |
"# Start the timer once, before the epoch loop, so the final figure covers the\n", | |
"# whole run rather than only the last epoch.\n", | |
"start = time.time()\n", | |
"\n", | |
"for epoch in range(10):\n", | |
"    total_loss = 0\n", | |
"\n", | |
"    # Training pass: one optimizer step per sentence.\n", | |
"    model.train()\n", | |
"    for instance in data:\n", | |
"        text = instance[0]\n", | |
"        label = instance[1]\n", | |
"\n", | |
"        # PyTorch accumulates gradients across backward() calls, so clear them first.\n", | |
"        model.zero_grad()\n", | |
"\n", | |
"        # Build the BOW input vector and the target index tensor.\n", | |
"        bow_vec = make_bow_vec(text, word_2_indx).to(device)\n", | |
"        target = make_target(label, label_2_indx).to(device)\n", | |
"\n", | |
"        # Run the forward pass.\n", | |
"        prediction = model(bow_vec)\n", | |
"\n", | |
"        # Compute the loss and gradients, then update the parameters.\n", | |
"        loss = loss_function(prediction, target)\n", | |
"        total_loss += loss.item()\n", | |
"        loss.backward()\n", | |
"        optimizer.step()\n", | |
"\n", | |
"    # Evaluation pass on the test split. eval() is set once per epoch, and\n", | |
"    # no_grad() skips building an autograd graph that scoring never uses.\n", | |
"    model.eval()\n", | |
"    test_correct = 0.0\n", | |
"    with torch.no_grad():\n", | |
"        for instance in t_data:\n", | |
"            text = instance[0]\n", | |
"            label = instance[1]\n", | |
"\n", | |
"            bow_vec = make_bow_vec(text, word_2_indx).to(device)\n", | |
"            scores = model(bow_vec).cpu().numpy()\n", | |
"\n", | |
"            # Index of the highest log-probability is the predicted label.\n", | |
"            predict = np.argmax(scores)\n", | |
"\n", | |
"            if predict == label:\n", | |
"                test_correct += 1\n", | |
"\n", | |
"    print(\"Epoch {}\".format(epoch))\n", | |
"    print(\"Loss {}\".format(total_loss/len(data)))\n", | |
"    print(\"Testing Accuracy {}\".format(test_correct/len(t_data)))\n", | |
"    print(\"------\")\n", | |
"\n", | |
"\n", | |
"# Fill the placeholder with .format(); passing the value as a second print()\n", | |
"# argument would leave the literal braces in the output.\n", | |
"print(\"Time Take : {}\".format(time.time() - start))" | |
], | |
"execution_count": 52, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Epoch 0\n", | |
"Loss 8.314135334722788\n", | |
"Testing Accuracy 0.3180995475113122\n", | |
"------\n", | |
"Epoch 1\n", | |
"Loss 3.1771007349212352\n", | |
"Testing Accuracy 0.33167420814479637\n", | |
"------\n", | |
"Epoch 2\n", | |
"Loss 1.8018128684188208\n", | |
"Testing Accuracy 0.3257918552036199\n", | |
"------\n", | |
"Epoch 3\n", | |
"Loss 1.2535895682724842\n", | |
"Testing Accuracy 0.3176470588235294\n", | |
"------\n", | |
"Epoch 4\n", | |
"Loss 0.9204622364780876\n", | |
"Testing Accuracy 0.32760180995475113\n", | |
"------\n", | |
"Epoch 5\n", | |
"Loss 0.6450639376591207\n", | |
"Testing Accuracy 0.3239819004524887\n", | |
"------\n", | |
"Epoch 6\n", | |
"Loss 0.5178571962908413\n", | |
"Testing Accuracy 0.3095022624434389\n", | |
"------\n", | |
"Epoch 7\n", | |
"Loss 0.3698097968592626\n", | |
"Testing Accuracy 0.31040723981900453\n", | |
"------\n", | |
"Epoch 8\n", | |
"Loss 0.32668769638636586\n", | |
"Testing Accuracy 0.304524886877828\n", | |
"------\n", | |
"Epoch 9\n", | |
"Loss 0.29391382461630006\n", | |
"Testing Accuracy 0.31085972850678734\n", | |
"------\n", | |
"Time Take : {} 11.080454587936401\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "zgNN54tnDUcw", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment