Last active
March 15, 2020 14:33
-
-
Save patternproject/1dfa8a1b301053c2a03e974cbb5d1193 to your computer and use it in GitHub Desktop.
Wk2_Submission.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Wk2_Submission.ipynb", | |
"provenance": [], | |
"toc_visible": true, | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/patternproject/1dfa8a1b301053c2a03e974cbb5d1193/wk2_submission.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "QazQ6ZyQR03W", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Manning LP \n", | |
"\"Classifying Customer Feedback with Imbalanced Text Data\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "l8E0u3Ptl1gF", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Week 2 - Over Sampling" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "TnLbS4_ubvC0", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"\n", | |
"\n", | |
"# 1.Import Libraries" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "IEhr9gjObxBN", | |
"colab_type": "code", | |
"outputId": "31980733-8c41-4ffa-8940-4416b85ec2d9", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 81 | |
} | |
}, | |
"source": [ | |
"# Deep Learning Stuff\n", | |
"from __future__ import absolute_import, division, print_function, unicode_literals\n", | |
"from tensorflow.keras import layers\n", | |
"\n", | |
"from keras.models import Sequential\n", | |
"from keras.layers import Dense\n", | |
"from keras.layers import Flatten\n", | |
"from keras.layers.embeddings import Embedding\n", | |
"from keras.preprocessing import sequence\n", | |
"\n", | |
"# TF\n", | |
"import tensorflow as tf\n", | |
"\n", | |
"# basics\n", | |
"import os\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"plt.style.use('ggplot')\n", | |
"\n", | |
"# for confusion matrix\n", | |
"from sklearn.metrics import classification_report,confusion_matrix\n", | |
"from sklearn import metrics\n", | |
"\n", | |
"from sklearn.metrics import accuracy_score\n", | |
"from sklearn.metrics import precision_score\n", | |
"from sklearn.metrics import recall_score\n", | |
"from sklearn.metrics import f1_score\n", | |
"from sklearn.metrics import confusion_matrix\n" | |
], | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"<p style=\"color: red;\">\n", | |
"The default version of TensorFlow in Colab will soon switch to TensorFlow 2.x.<br>\n", | |
"We recommend you <a href=\"https://www.tensorflow.org/guide/migrate\" target=\"_blank\">upgrade</a> now \n", | |
"or ensure your notebook will continue to use TensorFlow 1.x via the <code>%tensorflow_version 1.x</code> magic:\n", | |
"<a href=\"https://colab.research.google.com/notebooks/tensorflow_version.ipynb\" target=\"_blank\">more info</a>.</p>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
}, | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Using TensorFlow backend.\n" | |
], | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "fEzDs82C_dHu", | |
"colab_type": "code", | |
"outputId": "ce2023a0-c04a-40d1-868f-6ed9906412ee", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 640 | |
} | |
}, | |
"source": [ | |
"pip install tensorflow-gpu==2.0.0-rc0" | |
], | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Collecting tensorflow-gpu==2.0.0-rc0\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/6a/12/8c64cc62149cc21c70c55018502831bbf4d42bd62bed196df7de6830d21b/tensorflow_gpu-2.0.0rc0-cp36-cp36m-manylinux2010_x86_64.whl (380.5MB)\n", | |
"\u001b[K |████████████████████████████████| 380.5MB 41kB/s \n", | |
"\u001b[?25hRequirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow-gpu==2.0.0-rc0) (1.27.1)\n", | |
"Collecting tb-nightly<1.15.0a20190807,>=1.15.0a20190806\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/bc/88/24b5fb7280e74c7cf65bde47c171547fd02afb3840cff41bcbe9270650f5/tb_nightly-1.15.0a20190806-py3-none-any.whl (4.3MB)\n", | |
"\u001b[K |████████████████████████████████| 4.3MB 41.2MB/s \n", | |
"\u001b[?25hRequirement already satisfied: astor>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow-gpu==2.0.0-rc0) (0.8.1)\n", | |
"Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from tensorflow-gpu==2.0.0-rc0) (1.1.0)\n", | |
"Collecting tf-estimator-nightly<1.14.0.dev2019080602,>=1.14.0.dev2019080601\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/21/28/f2a27a62943d5f041e4a6fd404b2d21cb7c59b2242a4e73b03d9ba166552/tf_estimator_nightly-1.14.0.dev2019080601-py2.py3-none-any.whl (501kB)\n", | |
"\u001b[K |████████████████████████████████| 501kB 41.3MB/s \n", | |
"\u001b[?25hRequirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.6/dist-packages (from tensorflow-gpu==2.0.0-rc0) (0.34.2)\n", | |
"Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow-gpu==2.0.0-rc0) (1.12.0)\n", | |
"Requirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow-gpu==2.0.0-rc0) (1.11.2)\n", | |
"Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow-gpu==2.0.0-rc0) (1.1.0)\n", | |
"Requirement already satisfied: gast>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow-gpu==2.0.0-rc0) (0.2.2)\n", | |
"Requirement already satisfied: numpy<2.0,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow-gpu==2.0.0-rc0) (1.17.5)\n", | |
"Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow-gpu==2.0.0-rc0) (3.1.0)\n", | |
"Requirement already satisfied: keras-applications>=1.0.8 in /usr/local/lib/python3.6/dist-packages (from tensorflow-gpu==2.0.0-rc0) (1.0.8)\n", | |
"Requirement already satisfied: google-pasta>=0.1.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow-gpu==2.0.0-rc0) (0.1.8)\n", | |
"Requirement already satisfied: protobuf>=3.6.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow-gpu==2.0.0-rc0) (3.10.0)\n", | |
"Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow-gpu==2.0.0-rc0) (0.9.0)\n", | |
"Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.15.0a20190807,>=1.15.0a20190806->tensorflow-gpu==2.0.0-rc0) (1.0.0)\n", | |
"Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.15.0a20190807,>=1.15.0a20190806->tensorflow-gpu==2.0.0-rc0) (3.2.1)\n", | |
"Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.15.0a20190807,>=1.15.0a20190806->tensorflow-gpu==2.0.0-rc0) (45.2.0)\n", | |
"Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from keras-applications>=1.0.8->tensorflow-gpu==2.0.0-rc0) (2.8.0)\n", | |
"Installing collected packages: tb-nightly, tf-estimator-nightly, tensorflow-gpu\n", | |
"Successfully installed tb-nightly-1.15.0a20190806 tensorflow-gpu-2.0.0rc0 tf-estimator-nightly-1.14.0.dev2019080601\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/vnd.colab-display-data+json": { | |
"pip_warning": { | |
"packages": [ | |
"tensorboard", | |
"tensorflow", | |
"tensorflow_core", | |
"tensorflow_estimator" | |
] | |
} | |
} | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "1ZdS5rXK_hi6", | |
"colab_type": "text" | |
}, | |
"source": [ | |
        "Check the TensorFlow version installed." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "DQzffr3F_j4P", | |
"colab_type": "code", | |
"outputId": "64090d87-0a0b-44fc-fe3e-88910c333e45", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"import tensorflow as tf\n", | |
"print(tf.__version__)" | |
], | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"1.15.0\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Wh5bzmvFeQiy", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# 2.Load Data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "b7wst2p-_zA1", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Load the IMDB review data as numpy array. The dataset is nicely split into training and test, and then into data (`x`) and label (`y`)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "QEyfIT25_5tm", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 52 | |
}, | |
"outputId": "4ef1fc7e-ea08-44ec-faab-94e95ddd8ec3" | |
}, | |
"source": [ | |
"(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(\n", | |
" path='imdb.npz',\n", | |
" num_words=None,\n", | |
" skip_top=0,\n", | |
" maxlen=None,\n", | |
" seed=113,\n", | |
" start_char=1,\n", | |
" oov_char=2,\n", | |
" index_from=3\n", | |
")" | |
], | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz\n", | |
"17465344/17464789 [==============================] - 0s 0us/step\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "vBKlOT_-pQcD", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# 3.Explore Data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "i3b08buIATM5", | |
"colab_type": "text" | |
}, | |
"source": [ | |
        "Examine the data type with the `type` command." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "RjOG2xVfsjIO", | |
"colab_type": "code", | |
"outputId": "cf5b2678-44e6-4e82-d13b-f628bd1eb957", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"type(x_train)" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"numpy.ndarray" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 5 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "A0sVgExDAdTe", | |
"colab_type": "text" | |
}, | |
"source": [ | |
        "Examine the data structure with numpy's `shape` command." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "7zPPILlYAekD", | |
"colab_type": "code", | |
"outputId": "6ab29bc5-569e-44c7-8972-949847b3f587", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"x_train.shape" | |
], | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(25000,)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 6 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "ctjvEtgeAiwY", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Let us take a look at the content." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "aubigeBqAmwG", | |
"colab_type": "code", | |
"outputId": "d11451b8-f170-427b-9bac-53e04333f6d6", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 178 | |
} | |
}, | |
"source": [ | |
"x_train" | |
], | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),\n", | |
" list([1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463, 4369, 5012, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 3103, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 8163, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 4901, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 6853, 5, 163, 11, 3215, 10156, 4, 1153, 9, 194, 775, 7, 8255, 11596, 349, 2637, 148, 605, 15358, 8003, 15, 123, 125, 68, 23141, 6853, 15, 349, 165, 4362, 98, 5, 4, 228, 9, 43, 36893, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 4373, 228, 8255, 5, 25249, 656, 245, 2350, 5, 4, 9837, 131, 152, 491, 18, 46151, 32, 7464, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95]),\n", | |
" list([1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5974, 54, 61, 369, 13, 71, 149, 14, 22, 112, 4, 2401, 311, 12, 16, 3711, 33, 75, 43, 1829, 296, 4, 86, 320, 35, 534, 19, 263, 4821, 1301, 4, 1873, 33, 89, 78, 12, 66, 16, 4, 360, 7, 4, 58, 316, 334, 11, 4, 1716, 43, 645, 662, 8, 257, 85, 1200, 42, 1228, 2578, 83, 68, 3912, 15, 36, 165, 1539, 278, 36, 69, 44076, 780, 8, 106, 14, 6905, 1338, 18, 6, 22, 12, 215, 28, 610, 40, 6, 87, 326, 23, 2300, 21, 23, 22, 12, 272, 40, 57, 31, 11, 4, 22, 47, 6, 2307, 51, 9, 170, 23, 595, 116, 595, 1352, 13, 191, 79, 638, 89, 51428, 14, 9, 8, 106, 607, 624, 35, 534, 6, 227, 7, 129, 113]),\n", | |
" ...,\n", | |
" list([1, 11, 6, 230, 245, 6401, 9, 6, 1225, 446, 86527, 45, 2174, 84, 8322, 4007, 21, 4, 912, 84, 14532, 325, 725, 134, 15271, 1715, 84, 5, 36, 28, 57, 1099, 21, 8, 140, 8, 703, 5, 11656, 84, 56, 18, 1644, 14, 9, 31, 7, 4, 9406, 1209, 2295, 26094, 1008, 18, 6, 20, 207, 110, 563, 12, 8, 2901, 17793, 8, 97, 6, 20, 53, 4767, 74, 4, 460, 364, 1273, 29, 270, 11, 960, 108, 45, 40, 29, 2961, 395, 11, 6, 4065, 500, 7, 14492, 89, 364, 70, 29, 140, 4, 64, 4780, 11, 4, 2678, 26, 178, 4, 529, 443, 17793, 5, 27, 710, 117, 74936, 8123, 165, 47, 84, 37, 131, 818, 14, 595, 10, 10, 61, 1242, 1209, 10, 10, 288, 2260, 1702, 34, 2901, 17793, 4, 65, 496, 4, 231, 7, 790, 5, 6, 320, 234, 2766, 234, 1119, 1574, 7, 496, 4, 139, 929, 2901, 17793, 7750, 5, 4241, 18, 4, 8497, 13164, 250, 11, 1818, 7561, 4, 4217, 5408, 747, 1115, 372, 1890, 1006, 541, 9303, 7, 4, 59, 11027, 4, 3586, 22459]),\n", | |
" list([1, 1446, 7079, 69, 72, 3305, 13, 610, 930, 8, 12, 582, 23, 5, 16, 484, 685, 54, 349, 11, 4120, 2959, 45, 58, 1466, 13, 197, 12, 16, 43, 23, 21469, 5, 62, 30, 145, 402, 11, 4131, 51, 575, 32, 61, 369, 71, 66, 770, 12, 1054, 75, 100, 2198, 8, 4, 105, 37, 69, 147, 712, 75, 3543, 44, 257, 390, 5, 69, 263, 514, 105, 50, 286, 1814, 23, 4, 123, 13, 161, 40, 5, 421, 4, 116, 16, 897, 13, 40691, 40, 319, 5872, 112, 6700, 11, 4803, 121, 25, 70, 3468, 4, 719, 3798, 13, 18, 31, 62, 40, 8, 7200, 4, 29455, 7, 14, 123, 5, 942, 25, 8, 721, 12, 145, 5, 202, 12, 160, 580, 202, 12, 6, 52, 58, 11418, 92, 401, 728, 12, 39, 14, 251, 8, 15, 251, 5, 21213, 12, 38, 84, 80, 124, 12, 9, 23]),\n", | |
" list([1, 17, 6, 194, 337, 7, 4, 204, 22, 45, 254, 8, 106, 14, 123, 4, 12815, 270, 14437, 5, 16923, 12255, 732, 2098, 101, 405, 39, 14, 1034, 4, 1310, 9, 115, 50, 305, 12, 47, 4, 168, 5, 235, 7, 38, 111, 699, 102, 7, 4, 4039, 9245, 9, 24, 6, 78, 1099, 17, 2345, 16553, 21, 27, 9685, 6139, 5, 29043, 1603, 92, 1183, 4, 1310, 7, 4, 204, 42, 97, 90, 35, 221, 109, 29, 127, 27, 118, 8, 97, 12, 157, 21, 6789, 85010, 9, 6, 66, 78, 1099, 4, 631, 1191, 5, 2642, 272, 191, 1070, 6, 7585, 8, 2197, 70907, 10755, 544, 5, 383, 1271, 848, 1468, 12183, 497, 16876, 8, 1597, 8778, 19280, 21, 60, 27, 239, 9, 43, 8368, 209, 405, 10, 10, 12, 764, 40, 4, 248, 20, 12, 16, 5, 174, 1791, 72, 7, 51, 6, 1739, 22, 4, 204, 131, 9])],\n", | |
" dtype=object)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 7 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "xfMEeyleiDTc", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# 4.Munge Data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "p4WtRR-GAvrB", | |
"colab_type": "text" | |
}, | |
"source": [ | |
        "It appears each element in the numpy array is a list of integers. This suggests that each integer encodes a word, which requires a dictionary in order to map it back to the actual word." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "28eZ3r1ZAuqQ", | |
"colab_type": "code", | |
"outputId": "798280fa-9862-45ec-8c83-a59cbaf6daee", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"x_test.shape" | |
], | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(25000,)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 8 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "lYWdySV2A9b3", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Let us load the word index provided by the dataset." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ja-ioFlEA-Ck", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 52 | |
}, | |
"outputId": "e3733789-410e-4fb5-a925-88129af5cbf9" | |
}, | |
"source": [ | |
"word_index = tf.keras.datasets.imdb.get_word_index(path='imdb_word_index.json')" | |
], | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json\n", | |
"1646592/1641221 [==============================] - 0s 0us/step\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "VGIlNfb_BCE4", | |
"colab_type": "text" | |
}, | |
"source": [ | |
        "The word index is a type of data structure known as a dictionary, which stores key-value pairs. Later we will use this as a basis to map integers back to words." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "p8gXp9iQBAe7", | |
"colab_type": "code", | |
"outputId": "a8af14c1-86d9-4cac-ff6d-435374875d37", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"type(word_index)" | |
], | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"dict" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 10 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "xD5F6zZsBD_k", | |
"colab_type": "code", | |
"outputId": "ecb395a1-3bb3-4b23-bb23-d246fb977fd3", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 1000 | |
} | |
}, | |
"source": [ | |
"word_index" | |
], | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"{'fawn': 34701,\n", | |
" 'tsukino': 52006,\n", | |
" 'nunnery': 52007,\n", | |
" 'sonja': 16816,\n", | |
" 'vani': 63951,\n", | |
" 'woods': 1408,\n", | |
" 'spiders': 16115,\n", | |
" 'hanging': 2345,\n", | |
" 'woody': 2289,\n", | |
" 'trawling': 52008,\n", | |
" \"hold's\": 52009,\n", | |
" 'comically': 11307,\n", | |
" 'localized': 40830,\n", | |
" 'disobeying': 30568,\n", | |
" \"'royale\": 52010,\n", | |
" \"harpo's\": 40831,\n", | |
" 'canet': 52011,\n", | |
" 'aileen': 19313,\n", | |
" 'acurately': 52012,\n", | |
" \"diplomat's\": 52013,\n", | |
" 'rickman': 25242,\n", | |
" 'arranged': 6746,\n", | |
" 'rumbustious': 52014,\n", | |
" 'familiarness': 52015,\n", | |
" \"spider'\": 52016,\n", | |
" 'hahahah': 68804,\n", | |
" \"wood'\": 52017,\n", | |
" 'transvestism': 40833,\n", | |
" \"hangin'\": 34702,\n", | |
" 'bringing': 2338,\n", | |
" 'seamier': 40834,\n", | |
" 'wooded': 34703,\n", | |
" 'bravora': 52018,\n", | |
" 'grueling': 16817,\n", | |
" 'wooden': 1636,\n", | |
" 'wednesday': 16818,\n", | |
" \"'prix\": 52019,\n", | |
" 'altagracia': 34704,\n", | |
" 'circuitry': 52020,\n", | |
" 'crotch': 11585,\n", | |
" 'busybody': 57766,\n", | |
" \"tart'n'tangy\": 52021,\n", | |
" 'burgade': 14129,\n", | |
" 'thrace': 52023,\n", | |
" \"tom's\": 11038,\n", | |
" 'snuggles': 52025,\n", | |
" 'francesco': 29114,\n", | |
" 'complainers': 52027,\n", | |
" 'templarios': 52125,\n", | |
" '272': 40835,\n", | |
" '273': 52028,\n", | |
" 'zaniacs': 52130,\n", | |
" '275': 34706,\n", | |
" 'consenting': 27631,\n", | |
" 'snuggled': 40836,\n", | |
" 'inanimate': 15492,\n", | |
" 'uality': 52030,\n", | |
" 'bronte': 11926,\n", | |
" 'errors': 4010,\n", | |
" 'dialogs': 3230,\n", | |
" \"yomada's\": 52031,\n", | |
" \"madman's\": 34707,\n", | |
" 'dialoge': 30585,\n", | |
" 'usenet': 52033,\n", | |
" 'videodrome': 40837,\n", | |
" \"kid'\": 26338,\n", | |
" 'pawed': 52034,\n", | |
" \"'girlfriend'\": 30569,\n", | |
" \"'pleasure\": 52035,\n", | |
" \"'reloaded'\": 52036,\n", | |
" \"kazakos'\": 40839,\n", | |
" 'rocque': 52037,\n", | |
" 'mailings': 52038,\n", | |
" 'brainwashed': 11927,\n", | |
" 'mcanally': 16819,\n", | |
" \"tom''\": 52039,\n", | |
" 'kurupt': 25243,\n", | |
" 'affiliated': 21905,\n", | |
" 'babaganoosh': 52040,\n", | |
" \"noe's\": 40840,\n", | |
" 'quart': 40841,\n", | |
" 'kids': 359,\n", | |
" 'uplifting': 5034,\n", | |
" 'controversy': 7093,\n", | |
" 'kida': 21906,\n", | |
" 'kidd': 23379,\n", | |
" \"error'\": 52041,\n", | |
" 'neurologist': 52042,\n", | |
" 'spotty': 18510,\n", | |
" 'cobblers': 30570,\n", | |
" 'projection': 9878,\n", | |
" 'fastforwarding': 40842,\n", | |
" 'sters': 52043,\n", | |
" \"eggar's\": 52044,\n", | |
" 'etherything': 52045,\n", | |
" 'gateshead': 40843,\n", | |
" 'airball': 34708,\n", | |
" 'unsinkable': 25244,\n", | |
" 'stern': 7180,\n", | |
" \"cervi's\": 52046,\n", | |
" 'dnd': 40844,\n", | |
" 'dna': 11586,\n", | |
" 'insecurity': 20598,\n", | |
" \"'reboot'\": 52047,\n", | |
" 'trelkovsky': 11037,\n", | |
" 'jaekel': 52048,\n", | |
" 'sidebars': 52049,\n", | |
" \"sforza's\": 52050,\n", | |
" 'distortions': 17633,\n", | |
" 'mutinies': 52051,\n", | |
" 'sermons': 30602,\n", | |
" '7ft': 40846,\n", | |
" 'boobage': 52052,\n", | |
" \"o'bannon's\": 52053,\n", | |
" 'populations': 23380,\n", | |
" 'chulak': 52054,\n", | |
" 'mesmerize': 27633,\n", | |
" 'quinnell': 52055,\n", | |
" 'yahoo': 10307,\n", | |
" 'meteorologist': 52057,\n", | |
" 'beswick': 42577,\n", | |
" 'boorman': 15493,\n", | |
" 'voicework': 40847,\n", | |
" \"ster'\": 52058,\n", | |
" 'blustering': 22922,\n", | |
" 'hj': 52059,\n", | |
" 'intake': 27634,\n", | |
" 'morally': 5621,\n", | |
" 'jumbling': 40849,\n", | |
" 'bowersock': 52060,\n", | |
" \"'porky's'\": 52061,\n", | |
" 'gershon': 16821,\n", | |
" 'ludicrosity': 40850,\n", | |
" 'coprophilia': 52062,\n", | |
" 'expressively': 40851,\n", | |
" \"india's\": 19500,\n", | |
" \"post's\": 34710,\n", | |
" 'wana': 52063,\n", | |
" 'wang': 5283,\n", | |
" 'wand': 30571,\n", | |
" 'wane': 25245,\n", | |
" 'edgeways': 52321,\n", | |
" 'titanium': 34711,\n", | |
" 'pinta': 40852,\n", | |
" 'want': 178,\n", | |
" 'pinto': 30572,\n", | |
" 'whoopdedoodles': 52065,\n", | |
" 'tchaikovsky': 21908,\n", | |
" 'travel': 2103,\n", | |
" \"'victory'\": 52066,\n", | |
" 'copious': 11928,\n", | |
" 'gouge': 22433,\n", | |
" \"chapters'\": 52067,\n", | |
" 'barbra': 6702,\n", | |
" 'uselessness': 30573,\n", | |
" \"wan'\": 52068,\n", | |
" 'assimilated': 27635,\n", | |
" 'petiot': 16116,\n", | |
" 'most\\x85and': 52069,\n", | |
" 'dinosaurs': 3930,\n", | |
" 'wrong': 352,\n", | |
" 'seda': 52070,\n", | |
" 'stollen': 52071,\n", | |
" 'sentencing': 34712,\n", | |
" 'ouroboros': 40853,\n", | |
" 'assimilates': 40854,\n", | |
" 'colorfully': 40855,\n", | |
" 'glenne': 27636,\n", | |
" 'dongen': 52072,\n", | |
" 'subplots': 4760,\n", | |
" 'kiloton': 52073,\n", | |
" 'chandon': 23381,\n", | |
" \"effect'\": 34713,\n", | |
" 'snugly': 27637,\n", | |
" 'kuei': 40856,\n", | |
" 'welcomed': 9092,\n", | |
" 'dishonor': 30071,\n", | |
" 'concurrence': 52075,\n", | |
" 'stoicism': 23382,\n", | |
" \"guys'\": 14896,\n", | |
" \"beroemd'\": 52077,\n", | |
" 'butcher': 6703,\n", | |
" \"melfi's\": 40857,\n", | |
" 'aargh': 30623,\n", | |
" 'playhouse': 20599,\n", | |
" 'wickedly': 11308,\n", | |
" 'fit': 1180,\n", | |
" 'labratory': 52078,\n", | |
" 'lifeline': 40859,\n", | |
" 'screaming': 1927,\n", | |
" 'fix': 4287,\n", | |
" 'cineliterate': 52079,\n", | |
" 'fic': 52080,\n", | |
" 'fia': 52081,\n", | |
" 'fig': 34714,\n", | |
" 'fmvs': 52082,\n", | |
" 'fie': 52083,\n", | |
" 'reentered': 52084,\n", | |
" 'fin': 30574,\n", | |
" 'doctresses': 52085,\n", | |
" 'fil': 52086,\n", | |
" 'zucker': 12606,\n", | |
" 'ached': 31931,\n", | |
" 'counsil': 52088,\n", | |
" 'paterfamilias': 52089,\n", | |
" 'songwriter': 13885,\n", | |
" 'shivam': 34715,\n", | |
" 'hurting': 9654,\n", | |
" 'effects': 299,\n", | |
" 'slauther': 52090,\n", | |
" \"'flame'\": 52091,\n", | |
" 'sommerset': 52092,\n", | |
" 'interwhined': 52093,\n", | |
" 'whacking': 27638,\n", | |
" 'bartok': 52094,\n", | |
" 'barton': 8775,\n", | |
" 'frewer': 21909,\n", | |
" \"fi'\": 52095,\n", | |
" 'ingrid': 6192,\n", | |
" 'stribor': 30575,\n", | |
" 'approporiately': 52096,\n", | |
" 'wobblyhand': 52097,\n", | |
" 'tantalisingly': 52098,\n", | |
" 'ankylosaurus': 52099,\n", | |
" 'parasites': 17634,\n", | |
" 'childen': 52100,\n", | |
" \"jenkins'\": 52101,\n", | |
" 'metafiction': 52102,\n", | |
" 'golem': 17635,\n", | |
" 'indiscretion': 40860,\n", | |
" \"reeves'\": 23383,\n", | |
" \"inamorata's\": 57781,\n", | |
" 'brittannica': 52104,\n", | |
" 'adapt': 7916,\n", | |
" \"russo's\": 30576,\n", | |
" 'guitarists': 48246,\n", | |
" 'abbott': 10553,\n", | |
" 'abbots': 40861,\n", | |
" 'lanisha': 17649,\n", | |
" 'magickal': 40863,\n", | |
" 'mattter': 52105,\n", | |
" \"'willy\": 52106,\n", | |
" 'pumpkins': 34716,\n", | |
" 'stuntpeople': 52107,\n", | |
" 'estimate': 30577,\n", | |
" 'ugghhh': 40864,\n", | |
" 'gameplay': 11309,\n", | |
" \"wern't\": 52108,\n", | |
" \"n'sync\": 40865,\n", | |
" 'sickeningly': 16117,\n", | |
" 'chiara': 40866,\n", | |
" 'disturbed': 4011,\n", | |
" 'portmanteau': 40867,\n", | |
" 'ineffectively': 52109,\n", | |
" \"duchonvey's\": 82143,\n", | |
" \"nasty'\": 37519,\n", | |
" 'purpose': 1285,\n", | |
" 'lazers': 52112,\n", | |
" 'lightened': 28105,\n", | |
" 'kaliganj': 52113,\n", | |
" 'popularism': 52114,\n", | |
" \"damme's\": 18511,\n", | |
" 'stylistics': 30578,\n", | |
" 'mindgaming': 52115,\n", | |
" 'spoilerish': 46449,\n", | |
" \"'corny'\": 52117,\n", | |
" 'boerner': 34718,\n", | |
" 'olds': 6792,\n", | |
" 'bakelite': 52118,\n", | |
" 'renovated': 27639,\n", | |
" 'forrester': 27640,\n", | |
" \"lumiere's\": 52119,\n", | |
" 'gaskets': 52024,\n", | |
" 'needed': 884,\n", | |
" 'smight': 34719,\n", | |
" 'master': 1297,\n", | |
" \"edie's\": 25905,\n", | |
" 'seeber': 40868,\n", | |
" 'hiya': 52120,\n", | |
" 'fuzziness': 52121,\n", | |
" 'genesis': 14897,\n", | |
" 'rewards': 12607,\n", | |
" 'enthrall': 30579,\n", | |
" \"'about\": 40869,\n", | |
" \"recollection's\": 52122,\n", | |
" 'mutilated': 11039,\n", | |
" 'fatherlands': 52123,\n", | |
" \"fischer's\": 52124,\n", | |
" 'positively': 5399,\n", | |
" '270': 34705,\n", | |
" 'ahmed': 34720,\n", | |
" 'zatoichi': 9836,\n", | |
" 'bannister': 13886,\n", | |
" 'anniversaries': 52127,\n", | |
" \"helm's\": 30580,\n", | |
" \"'work'\": 52128,\n", | |
" 'exclaimed': 34721,\n", | |
" \"'unfunny'\": 52129,\n", | |
" '274': 52029,\n", | |
" 'feeling': 544,\n", | |
" \"wanda's\": 52131,\n", | |
" 'dolan': 33266,\n", | |
" '278': 52133,\n", | |
" 'peacoat': 52134,\n", | |
" 'brawny': 40870,\n", | |
" 'mishra': 40871,\n", | |
" 'worlders': 40872,\n", | |
" 'protags': 52135,\n", | |
" 'skullcap': 52136,\n", | |
" 'dastagir': 57596,\n", | |
" 'affairs': 5622,\n", | |
" 'wholesome': 7799,\n", | |
" 'hymen': 52137,\n", | |
" 'paramedics': 25246,\n", | |
" 'unpersons': 52138,\n", | |
" 'heavyarms': 52139,\n", | |
" 'affaire': 52140,\n", | |
" 'coulisses': 52141,\n", | |
" 'hymer': 40873,\n", | |
" 'kremlin': 52142,\n", | |
" 'shipments': 30581,\n", | |
" 'pixilated': 52143,\n", | |
" \"'00s\": 30582,\n", | |
" 'diminishing': 18512,\n", | |
" 'cinematic': 1357,\n", | |
" 'resonates': 14898,\n", | |
" 'simplify': 40874,\n", | |
" \"nature'\": 40875,\n", | |
" 'temptresses': 40876,\n", | |
" 'reverence': 16822,\n", | |
" 'resonated': 19502,\n", | |
" 'dailey': 34722,\n", | |
" '2\\x85': 52144,\n", | |
" 'treize': 27641,\n", | |
" 'majo': 52145,\n", | |
" 'kiya': 21910,\n", | |
" 'woolnough': 52146,\n", | |
" 'thanatos': 39797,\n", | |
" 'sandoval': 35731,\n", | |
" 'dorama': 40879,\n", | |
" \"o'shaughnessy\": 52147,\n", | |
" 'tech': 4988,\n", | |
" 'fugitives': 32018,\n", | |
" 'teck': 30583,\n", | |
" \"'e'\": 76125,\n", | |
" 'doesn’t': 40881,\n", | |
" 'purged': 52149,\n", | |
" 'saying': 657,\n", | |
" \"martians'\": 41095,\n", | |
" 'norliss': 23418,\n", | |
" 'dickey': 27642,\n", | |
" 'dicker': 52152,\n", | |
" \"'sependipity\": 52153,\n", | |
" 'padded': 8422,\n", | |
" 'ordell': 57792,\n", | |
" \"sturges'\": 40882,\n", | |
" 'independentcritics': 52154,\n", | |
" 'tempted': 5745,\n", | |
" \"atkinson's\": 34724,\n", | |
" 'hounded': 25247,\n", | |
" 'apace': 52155,\n", | |
" 'clicked': 15494,\n", | |
" \"'humor'\": 30584,\n", | |
" \"martino's\": 17177,\n", | |
" \"'supporting\": 52156,\n", | |
" 'warmongering': 52032,\n", | |
" \"zemeckis's\": 34725,\n", | |
" 'lube': 21911,\n", | |
" 'shocky': 52157,\n", | |
" 'plate': 7476,\n", | |
" 'plata': 40883,\n", | |
" 'sturgess': 40884,\n", | |
" \"nerds'\": 40885,\n", | |
" 'plato': 20600,\n", | |
" 'plath': 34726,\n", | |
" 'platt': 40886,\n", | |
" 'mcnab': 52159,\n", | |
" 'clumsiness': 27643,\n", | |
" 'altogether': 3899,\n", | |
" 'massacring': 42584,\n", | |
" 'bicenntinial': 52160,\n", | |
" 'skaal': 40887,\n", | |
" 'droning': 14360,\n", | |
" 'lds': 8776,\n", | |
" 'jaguar': 21912,\n", | |
" \"cale's\": 34727,\n", | |
" 'nicely': 1777,\n", | |
" 'mummy': 4588,\n", | |
" \"lot's\": 18513,\n", | |
" 'patch': 10086,\n", | |
" 'kerkhof': 50202,\n", | |
" \"leader's\": 52161,\n", | |
" \"'movie\": 27644,\n", | |
" 'uncomfirmed': 52162,\n", | |
" 'heirloom': 40888,\n", | |
" 'wrangle': 47360,\n", | |
" 'emotion\\x85': 52163,\n", | |
" \"'stargate'\": 52164,\n", | |
" 'pinoy': 40889,\n", | |
" 'conchatta': 40890,\n", | |
" 'broeke': 41128,\n", | |
" 'advisedly': 40891,\n", | |
" \"barker's\": 17636,\n", | |
" 'descours': 52166,\n", | |
" 'lots': 772,\n", | |
" 'lotr': 9259,\n", | |
" 'irs': 9879,\n", | |
" 'lott': 52167,\n", | |
" 'xvi': 40892,\n", | |
" 'irk': 34728,\n", | |
" 'irl': 52168,\n", | |
" 'ira': 6887,\n", | |
" 'belzer': 21913,\n", | |
" 'irc': 52169,\n", | |
" 'ire': 27645,\n", | |
" 'requisites': 40893,\n", | |
" 'discipline': 7693,\n", | |
" 'lyoko': 52961,\n", | |
" 'extend': 11310,\n", | |
" 'nature': 873,\n", | |
" \"'dickie'\": 52170,\n", | |
" 'optimist': 40894,\n", | |
" 'lapping': 30586,\n", | |
" 'superficial': 3900,\n", | |
" 'vestment': 52171,\n", | |
" 'extent': 2823,\n", | |
" 'tendons': 52172,\n", | |
" \"heller's\": 52173,\n", | |
" 'quagmires': 52174,\n", | |
" 'miyako': 52175,\n", | |
" 'moocow': 20601,\n", | |
" \"coles'\": 52176,\n", | |
" 'lookit': 40895,\n", | |
" 'ravenously': 52177,\n", | |
" 'levitating': 40896,\n", | |
" 'perfunctorily': 52178,\n", | |
" 'lookin': 30587,\n", | |
" \"lot'\": 40898,\n", | |
" 'lookie': 52179,\n", | |
" 'fearlessly': 34870,\n", | |
" 'libyan': 52181,\n", | |
" 'fondles': 40899,\n", | |
" 'gopher': 35714,\n", | |
" 'wearying': 40901,\n", | |
" \"nz's\": 52182,\n", | |
" 'minuses': 27646,\n", | |
" 'puposelessly': 52183,\n", | |
" 'shandling': 52184,\n", | |
" 'decapitates': 31268,\n", | |
" 'humming': 11929,\n", | |
" \"'nother\": 40902,\n", | |
" 'smackdown': 21914,\n", | |
" 'underdone': 30588,\n", | |
" 'frf': 40903,\n", | |
" 'triviality': 52185,\n", | |
" 'fro': 25248,\n", | |
" 'bothers': 8777,\n", | |
" \"'kensington\": 52186,\n", | |
" 'much': 73,\n", | |
" 'muco': 34730,\n", | |
" 'wiseguy': 22615,\n", | |
" \"richie's\": 27648,\n", | |
" 'tonino': 40904,\n", | |
" 'unleavened': 52187,\n", | |
" 'fry': 11587,\n", | |
" \"'tv'\": 40905,\n", | |
" 'toning': 40906,\n", | |
" 'obese': 14361,\n", | |
" 'sensationalized': 30589,\n", | |
" 'spiv': 40907,\n", | |
" 'spit': 6259,\n", | |
" 'arkin': 7364,\n", | |
" 'charleton': 21915,\n", | |
" 'jeon': 16823,\n", | |
" 'boardroom': 21916,\n", | |
" 'doubts': 4989,\n", | |
" 'spin': 3084,\n", | |
" 'hepo': 53083,\n", | |
" 'wildcat': 27649,\n", | |
" 'venoms': 10584,\n", | |
" 'misconstrues': 52191,\n", | |
" 'mesmerising': 18514,\n", | |
" 'misconstrued': 40908,\n", | |
" 'rescinds': 52192,\n", | |
" 'prostrate': 52193,\n", | |
" 'majid': 40909,\n", | |
" 'climbed': 16479,\n", | |
" 'canoeing': 34731,\n", | |
" 'majin': 52195,\n", | |
" 'animie': 57804,\n", | |
" 'sylke': 40910,\n", | |
" 'conditioned': 14899,\n", | |
" 'waddell': 40911,\n", | |
" '3\\x85': 52196,\n", | |
" 'hyperdrive': 41188,\n", | |
" 'conditioner': 34732,\n", | |
" 'bricklayer': 53153,\n", | |
" 'hong': 2576,\n", | |
" 'memoriam': 52198,\n", | |
" 'inventively': 30592,\n", | |
" \"levant's\": 25249,\n", | |
" 'portobello': 20638,\n", | |
" 'remand': 52200,\n", | |
" 'mummified': 19504,\n", | |
" 'honk': 27650,\n", | |
" 'spews': 19505,\n", | |
" 'visitations': 40912,\n", | |
" 'mummifies': 52201,\n", | |
" 'cavanaugh': 25250,\n", | |
" 'zeon': 23385,\n", | |
" \"jungle's\": 40913,\n", | |
" 'viertel': 34733,\n", | |
" 'frenchmen': 27651,\n", | |
" 'torpedoes': 52202,\n", | |
" 'schlessinger': 52203,\n", | |
" 'torpedoed': 34734,\n", | |
" 'blister': 69876,\n", | |
" 'cinefest': 52204,\n", | |
" 'furlough': 34735,\n", | |
" 'mainsequence': 52205,\n", | |
" 'mentors': 40914,\n", | |
" 'academic': 9094,\n", | |
" 'stillness': 20602,\n", | |
" 'academia': 40915,\n", | |
" 'lonelier': 52206,\n", | |
" 'nibby': 52207,\n", | |
" \"losers'\": 52208,\n", | |
" 'cineastes': 40916,\n", | |
" 'corporate': 4449,\n", | |
" 'massaging': 40917,\n", | |
" 'bellow': 30593,\n", | |
" 'absurdities': 19506,\n", | |
" 'expetations': 53241,\n", | |
" 'nyfiken': 40918,\n", | |
" 'mehras': 75638,\n", | |
" 'lasse': 52209,\n", | |
" 'visability': 52210,\n", | |
" 'militarily': 33946,\n", | |
" \"elder'\": 52211,\n", | |
" 'gainsbourg': 19023,\n", | |
" 'hah': 20603,\n", | |
" 'hai': 13420,\n", | |
" 'haj': 34736,\n", | |
" 'hak': 25251,\n", | |
" 'hal': 4311,\n", | |
" 'ham': 4892,\n", | |
" 'duffer': 53259,\n", | |
" 'haa': 52213,\n", | |
" 'had': 66,\n", | |
" 'advancement': 11930,\n", | |
" 'hag': 16825,\n", | |
" \"hand'\": 25252,\n", | |
" 'hay': 13421,\n", | |
" 'mcnamara': 20604,\n", | |
" \"mozart's\": 52214,\n", | |
" 'duffel': 30731,\n", | |
" 'haq': 30594,\n", | |
" 'har': 13887,\n", | |
" 'has': 44,\n", | |
" 'hat': 2401,\n", | |
" 'hav': 40919,\n", | |
" 'haw': 30595,\n", | |
" 'figtings': 52215,\n", | |
" 'elders': 15495,\n", | |
" 'underpanted': 52216,\n", | |
" 'pninson': 52217,\n", | |
" 'unequivocally': 27652,\n", | |
" \"barbara's\": 23673,\n", | |
" \"bello'\": 52219,\n", | |
" 'indicative': 12997,\n", | |
" 'yawnfest': 40920,\n", | |
" 'hexploitation': 52220,\n", | |
" \"loder's\": 52221,\n", | |
" 'sleuthing': 27653,\n", | |
" \"justin's\": 32622,\n", | |
" \"'ball\": 52222,\n", | |
" \"'summer\": 52223,\n", | |
" \"'demons'\": 34935,\n", | |
" \"mormon's\": 52225,\n", | |
" \"laughton's\": 34737,\n", | |
" 'debell': 52226,\n", | |
" 'shipyard': 39724,\n", | |
" 'unabashedly': 30597,\n", | |
" 'disks': 40401,\n", | |
" 'crowd': 2290,\n", | |
" 'crowe': 10087,\n", | |
" \"vancouver's\": 56434,\n", | |
" 'mosques': 34738,\n", | |
" 'crown': 6627,\n", | |
" 'culpas': 52227,\n", | |
" 'crows': 27654,\n", | |
" 'surrell': 53344,\n", | |
" 'flowless': 52229,\n", | |
" 'sheirk': 52230,\n", | |
" \"'three\": 40923,\n", | |
" \"peterson'\": 52231,\n", | |
" 'ooverall': 52232,\n", | |
" 'perchance': 40924,\n", | |
" 'bottom': 1321,\n", | |
" 'chabert': 53363,\n", | |
" 'sneha': 52233,\n", | |
" 'inhuman': 13888,\n", | |
" 'ichii': 52234,\n", | |
" 'ursla': 52235,\n", | |
" 'completly': 30598,\n", | |
" 'moviedom': 40925,\n", | |
" 'raddick': 52236,\n", | |
" 'brundage': 51995,\n", | |
" 'brigades': 40926,\n", | |
" 'starring': 1181,\n", | |
" \"'goal'\": 52237,\n", | |
" 'caskets': 52238,\n", | |
" 'willcock': 52239,\n", | |
" \"threesome's\": 52240,\n", | |
" \"mosque'\": 52241,\n", | |
" \"cover's\": 52242,\n", | |
" 'spaceships': 17637,\n", | |
" 'anomalous': 40927,\n", | |
" 'ptsd': 27655,\n", | |
" 'shirdan': 52243,\n", | |
" 'obscenity': 21962,\n", | |
" 'lemmings': 30599,\n", | |
" 'duccio': 30600,\n", | |
" \"levene's\": 52244,\n", | |
" \"'gorby'\": 52245,\n", | |
" \"teenager's\": 25255,\n", | |
" 'marshall': 5340,\n", | |
" 'honeymoon': 9095,\n", | |
" 'shoots': 3231,\n", | |
" 'despised': 12258,\n", | |
" 'okabasho': 52246,\n", | |
" 'fabric': 8289,\n", | |
" 'cannavale': 18515,\n", | |
" 'raped': 3537,\n", | |
" \"tutt's\": 52247,\n", | |
" 'grasping': 17638,\n", | |
" 'despises': 18516,\n", | |
" \"thief's\": 40928,\n", | |
" 'rapes': 8926,\n", | |
" 'raper': 52248,\n", | |
" \"eyre'\": 27656,\n", | |
" 'walchek': 52249,\n", | |
" \"elmo's\": 23386,\n", | |
" 'perfumes': 40929,\n", | |
" 'spurting': 21918,\n", | |
" \"exposition'\\x85\": 52250,\n", | |
" 'denoting': 52251,\n", | |
" 'thesaurus': 34740,\n", | |
" \"shoot'\": 40930,\n", | |
" 'bonejack': 49759,\n", | |
" 'simpsonian': 52253,\n", | |
" 'hebetude': 30601,\n", | |
" \"hallow's\": 34741,\n", | |
" 'desperation\\x85': 52254,\n", | |
" 'incinerator': 34742,\n", | |
" 'congratulations': 10308,\n", | |
" 'humbled': 52255,\n", | |
" \"else's\": 5924,\n", | |
" 'trelkovski': 40845,\n", | |
" \"rape'\": 52256,\n", | |
" \"'chapters'\": 59386,\n", | |
" '1600s': 52257,\n", | |
" 'martian': 7253,\n", | |
" 'nicest': 25256,\n", | |
" 'eyred': 52259,\n", | |
" 'passenger': 9457,\n", | |
" 'disgrace': 6041,\n", | |
" 'moderne': 52260,\n", | |
" 'barrymore': 5120,\n", | |
" 'yankovich': 52261,\n", | |
" 'moderns': 40931,\n", | |
" 'studliest': 52262,\n", | |
" 'bedsheet': 52263,\n", | |
" 'decapitation': 14900,\n", | |
" 'slurring': 52264,\n", | |
" \"'nunsploitation'\": 52265,\n", | |
" \"'character'\": 34743,\n", | |
" 'cambodia': 9880,\n", | |
" 'rebelious': 52266,\n", | |
" 'pasadena': 27657,\n", | |
" 'crowne': 40932,\n", | |
" \"'bedchamber\": 52267,\n", | |
" 'conjectural': 52268,\n", | |
" 'appologize': 52269,\n", | |
" 'halfassing': 52270,\n", | |
" 'paycheque': 57816,\n", | |
" 'palms': 20606,\n", | |
" \"'islands\": 52271,\n", | |
" 'hawked': 40933,\n", | |
" 'palme': 21919,\n", | |
" 'conservatively': 40934,\n", | |
" 'larp': 64007,\n", | |
" 'palma': 5558,\n", | |
" 'smelling': 21920,\n", | |
" 'aragorn': 12998,\n", | |
" 'hawker': 52272,\n", | |
" 'hawkes': 52273,\n", | |
" 'explosions': 3975,\n", | |
" 'loren': 8059,\n", | |
" \"pyle's\": 52274,\n", | |
" 'shootout': 6704,\n", | |
" \"mike's\": 18517,\n", | |
" \"driscoll's\": 52275,\n", | |
" 'cogsworth': 40935,\n", | |
" \"britian's\": 52276,\n", | |
" 'childs': 34744,\n", | |
" \"portrait's\": 52277,\n", | |
" 'chain': 3626,\n", | |
" 'whoever': 2497,\n", | |
" 'puttered': 52278,\n", | |
" 'childe': 52279,\n", | |
" 'maywether': 52280,\n", | |
" 'chair': 3036,\n", | |
" \"rance's\": 52281,\n", | |
" 'machu': 34745,\n", | |
" 'ballet': 4517,\n", | |
" 'grapples': 34746,\n", | |
" 'summerize': 76152,\n", | |
" 'freelance': 30603,\n", | |
" \"andrea's\": 52283,\n", | |
" '\\x91very': 52284,\n", | |
" 'coolidge': 45879,\n", | |
" 'mache': 18518,\n", | |
" 'balled': 52285,\n", | |
" 'grappled': 40937,\n", | |
" 'macha': 18519,\n", | |
" 'underlining': 21921,\n", | |
" 'macho': 5623,\n", | |
" 'oversight': 19507,\n", | |
" 'machi': 25257,\n", | |
" 'verbally': 11311,\n", | |
" 'tenacious': 21922,\n", | |
" 'windshields': 40938,\n", | |
" 'paychecks': 18557,\n", | |
" 'jerk': 3396,\n", | |
" \"good'\": 11931,\n", | |
" 'prancer': 34748,\n", | |
" 'prances': 21923,\n", | |
" 'olympus': 52286,\n", | |
" 'lark': 21924,\n", | |
" 'embark': 10785,\n", | |
" 'gloomy': 7365,\n", | |
" 'jehaan': 52287,\n", | |
" 'turaqui': 52288,\n", | |
" \"child'\": 20607,\n", | |
" 'locked': 2894,\n", | |
" 'pranced': 52289,\n", | |
" 'exact': 2588,\n", | |
" 'unattuned': 52290,\n", | |
" 'minute': 783,\n", | |
" 'skewed': 16118,\n", | |
" 'hodgins': 40940,\n", | |
" 'skewer': 34749,\n", | |
" 'think\\x85': 52291,\n", | |
" 'rosenstein': 38765,\n", | |
" 'helmit': 52292,\n", | |
" 'wrestlemanias': 34750,\n", | |
" 'hindered': 16826,\n", | |
" \"martha's\": 30604,\n", | |
" 'cheree': 52293,\n", | |
" \"pluckin'\": 52294,\n", | |
" 'ogles': 40941,\n", | |
" 'heavyweight': 11932,\n", | |
" 'aada': 82190,\n", | |
" 'chopping': 11312,\n", | |
" 'strongboy': 61534,\n", | |
" 'hegemonic': 41342,\n", | |
" 'adorns': 40942,\n", | |
" 'xxth': 41346,\n", | |
" 'nobuhiro': 34751,\n", | |
" 'capitães': 52298,\n", | |
" 'kavogianni': 52299,\n", | |
" 'antwerp': 13422,\n", | |
" 'celebrated': 6538,\n", | |
" 'roarke': 52300,\n", | |
" 'baggins': 40943,\n", | |
" 'cheeseburgers': 31270,\n", | |
" 'matras': 52301,\n", | |
" \"nineties'\": 52302,\n", | |
" \"'craig'\": 52303,\n", | |
" 'celebrates': 12999,\n", | |
" 'unintentionally': 3383,\n", | |
" 'drafted': 14362,\n", | |
" 'climby': 52304,\n", | |
" '303': 52305,\n", | |
" 'oldies': 18520,\n", | |
" 'climbs': 9096,\n", | |
" 'honour': 9655,\n", | |
" 'plucking': 34752,\n", | |
" '305': 30074,\n", | |
" 'address': 5514,\n", | |
" 'menjou': 40944,\n", | |
" \"'freak'\": 42592,\n", | |
" 'dwindling': 19508,\n", | |
" 'benson': 9458,\n", | |
" 'white’s': 52307,\n", | |
" 'shamelessness': 40945,\n", | |
" 'impacted': 21925,\n", | |
" 'upatz': 52308,\n", | |
" 'cusack': 3840,\n", | |
" \"flavia's\": 37567,\n", | |
" 'effette': 52309,\n", | |
" 'influx': 34753,\n", | |
" 'boooooooo': 52310,\n", | |
" 'dimitrova': 52311,\n", | |
" 'houseman': 13423,\n", | |
" 'bigas': 25259,\n", | |
" 'boylen': 52312,\n", | |
" 'phillipenes': 52313,\n", | |
" 'fakery': 40946,\n", | |
" \"grandpa's\": 27658,\n", | |
" 'darnell': 27659,\n", | |
" 'undergone': 19509,\n", | |
" 'handbags': 52315,\n", | |
" 'perished': 21926,\n", | |
" 'pooped': 37778,\n", | |
" 'vigour': 27660,\n", | |
" 'opposed': 3627,\n", | |
" 'etude': 52316,\n", | |
" \"caine's\": 11799,\n", | |
" 'doozers': 52317,\n", | |
" 'photojournals': 34754,\n", | |
" 'perishes': 52318,\n", | |
" 'constrains': 34755,\n", | |
" 'migenes': 40948,\n", | |
" 'consoled': 30605,\n", | |
" 'alastair': 16827,\n", | |
" 'wvs': 52319,\n", | |
" 'ooooooh': 52320,\n", | |
" 'approving': 34756,\n", | |
" 'consoles': 40949,\n", | |
" 'disparagement': 52064,\n", | |
" 'futureistic': 52322,\n", | |
" 'rebounding': 52323,\n", | |
" \"'date\": 52324,\n", | |
" 'gregoire': 52325,\n", | |
" 'rutherford': 21927,\n", | |
" 'americanised': 34757,\n", | |
" 'novikov': 82196,\n", | |
" 'following': 1042,\n", | |
" 'munroe': 34758,\n", | |
" \"morita'\": 52326,\n", | |
" 'christenssen': 52327,\n", | |
" 'oatmeal': 23106,\n", | |
" 'fossey': 25260,\n", | |
" 'livered': 40950,\n", | |
" 'listens': 13000,\n", | |
" \"'marci\": 76164,\n", | |
" \"otis's\": 52330,\n", | |
" 'thanking': 23387,\n", | |
" 'maude': 16019,\n", | |
" 'extensions': 34759,\n", | |
" 'ameteurish': 52332,\n", | |
" \"commender's\": 52333,\n", | |
" 'agricultural': 27661,\n", | |
" 'convincingly': 4518,\n", | |
" 'fueled': 17639,\n", | |
" 'mahattan': 54014,\n", | |
" \"paris's\": 40952,\n", | |
" 'vulkan': 52336,\n", | |
" 'stapes': 52337,\n", | |
" 'odysessy': 52338,\n", | |
" 'harmon': 12259,\n", | |
" 'surfing': 4252,\n", | |
" 'halloran': 23494,\n", | |
" 'unbelieveably': 49580,\n", | |
" \"'offed'\": 52339,\n", | |
" 'quadrant': 30607,\n", | |
" 'inhabiting': 19510,\n", | |
" 'nebbish': 34760,\n", | |
" 'forebears': 40953,\n", | |
" 'skirmish': 34761,\n", | |
" 'ocassionally': 52340,\n", | |
" \"'resist\": 52341,\n", | |
" 'impactful': 21928,\n", | |
" 'spicier': 52342,\n", | |
" 'touristy': 40954,\n", | |
" \"'football'\": 52343,\n", | |
" 'webpage': 40955,\n", | |
" 'exurbia': 52345,\n", | |
" 'jucier': 52346,\n", | |
" 'professors': 14901,\n", | |
" 'structuring': 34762,\n", | |
" 'jig': 30608,\n", | |
" 'overlord': 40956,\n", | |
" 'disconnect': 25261,\n", | |
" 'sniffle': 82201,\n", | |
" 'slimeball': 40957,\n", | |
" 'jia': 40958,\n", | |
" 'milked': 16828,\n", | |
" 'banjoes': 40959,\n", | |
" 'jim': 1237,\n", | |
" 'workforces': 52348,\n", | |
" 'jip': 52349,\n", | |
" 'rotweiller': 52350,\n", | |
" 'mundaneness': 34763,\n", | |
" \"'ninja'\": 52351,\n", | |
" \"dead'\": 11040,\n", | |
" \"cipriani's\": 40960,\n", | |
" 'modestly': 20608,\n", | |
" \"professor'\": 52352,\n", | |
" 'shacked': 40961,\n", | |
" 'bashful': 34764,\n", | |
" 'sorter': 23388,\n", | |
" 'overpowering': 16120,\n", | |
" 'workmanlike': 18521,\n", | |
" 'henpecked': 27662,\n", | |
" 'sorted': 18522,\n", | |
" \"jōb's\": 52354,\n", | |
" \"'always\": 52355,\n", | |
" \"'baptists\": 34765,\n", | |
" 'dreamcatchers': 52356,\n", | |
" \"'silence'\": 52357,\n", | |
" 'hickory': 21929,\n", | |
" 'fun\\x97yet': 52358,\n", | |
" 'breakumentary': 52359,\n", | |
" 'didn': 15496,\n", | |
" 'didi': 52360,\n", | |
" 'pealing': 52361,\n", | |
" 'dispite': 40962,\n", | |
" \"italy's\": 25262,\n", | |
" 'instability': 21930,\n", | |
" 'quarter': 6539,\n", | |
" 'quartet': 12608,\n", | |
" 'padmé': 52362,\n", | |
" \"'bleedmedry\": 52363,\n", | |
" 'pahalniuk': 52364,\n", | |
" 'honduras': 52365,\n", | |
" 'bursting': 10786,\n", | |
" \"pablo's\": 41465,\n", | |
" 'irremediably': 52367,\n", | |
" 'presages': 40963,\n", | |
" 'bowlegged': 57832,\n", | |
" 'dalip': 65183,\n", | |
" 'entering': 6260,\n", | |
" 'newsradio': 76172,\n", | |
" 'presaged': 54150,\n", | |
" \"giallo's\": 27663,\n", | |
" 'bouyant': 40964,\n", | |
" 'amerterish': 52368,\n", | |
" 'rajni': 18523,\n", | |
" 'leeves': 30610,\n", | |
" 'macauley': 34767,\n", | |
" 'seriously': 612,\n", | |
" 'sugercoma': 52369,\n", | |
" 'grimstead': 52370,\n", | |
" \"'fairy'\": 52371,\n", | |
" 'zenda': 30611,\n", | |
" \"'twins'\": 52372,\n", | |
" 'realisation': 17640,\n", | |
" 'highsmith': 27664,\n", | |
" 'raunchy': 7817,\n", | |
" 'incentives': 40965,\n", | |
" 'flatson': 52374,\n", | |
" 'snooker': 35097,\n", | |
" 'crazies': 16829,\n", | |
" 'crazier': 14902,\n", | |
" 'grandma': 7094,\n", | |
" 'napunsaktha': 52375,\n", | |
" 'workmanship': 30612,\n", | |
" 'reisner': 52376,\n", | |
" \"sanford's\": 61306,\n", | |
" '\\x91doña': 52377,\n", | |
" 'modest': 6108,\n", | |
" \"everything's\": 19153,\n", | |
" 'hamer': 40966,\n", | |
" \"couldn't'\": 52379,\n", | |
" 'quibble': 13001,\n", | |
" 'socking': 52380,\n", | |
" 'tingler': 21931,\n", | |
" 'gutman': 52381,\n", | |
" 'lachlan': 40967,\n", | |
" 'tableaus': 52382,\n", | |
" 'headbanger': 52383,\n", | |
" 'spoken': 2847,\n", | |
" 'cerebrally': 34768,\n", | |
" \"'road\": 23490,\n", | |
" 'tableaux': 21932,\n", | |
" \"proust's\": 40968,\n", | |
" 'periodical': 40969,\n", | |
" \"shoveller's\": 52385,\n", | |
" 'tamara': 25263,\n", | |
" 'affords': 17641,\n", | |
" 'concert': 3249,\n", | |
" \"yara's\": 87955,\n", | |
" 'someome': 52386,\n", | |
" 'lingering': 8424,\n", | |
" \"abraham's\": 41511,\n", | |
" 'beesley': 34769,\n", | |
" 'cherbourg': 34770,\n", | |
" 'kagan': 28624,\n", | |
" 'snatch': 9097,\n", | |
" \"miyazaki's\": 9260,\n", | |
" 'absorbs': 25264,\n", | |
" \"koltai's\": 40970,\n", | |
" 'tingled': 64027,\n", | |
" 'crossroads': 19511,\n", | |
" 'rehab': 16121,\n", | |
" 'falworth': 52389,\n", | |
" 'sequals': 52390,\n", | |
" ...}" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 11 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "1Y_JHcUkBJKd", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"We can also find out how many unique words this dictionary contains." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ofKbFQR9BGEl", | |
"colab_type": "code", | |
"outputId": "61fbdc8d-df79-45c6-8591-7c2194101611", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"len(word_index)" | |
], | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"88584" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 12 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "EeTNEtJwBMLt", | |
"colab_type": "text" | |
}, | |
"source": [ | |
        "Now we know there are 88584 unique words (tokens) in the imdb dataset. Each word has a unique number associated with it. This structure is known as a key-value pair. Therefore, there are 88584 key-value pairs, organized as a dictionary in the Python data structure. As examples, below are a few words (tokens) in this dictionary." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "4DSf_0tBBKtH", | |
"colab_type": "code", | |
"outputId": "fdda7b4c-6a70-4e49-eee2-d648e3b53648", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 354 | |
} | |
}, | |
"source": [ | |
"{k:v for (k,v) in word_index.items() if v < 20}" | |
], | |
"execution_count": 13, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"{'a': 3,\n", | |
" 'and': 2,\n", | |
" 'as': 14,\n", | |
" 'br': 7,\n", | |
" 'but': 18,\n", | |
" 'film': 19,\n", | |
" 'for': 15,\n", | |
" 'i': 10,\n", | |
" 'in': 8,\n", | |
" 'is': 6,\n", | |
" 'it': 9,\n", | |
" 'movie': 17,\n", | |
" 'of': 4,\n", | |
" 'that': 12,\n", | |
" 'the': 1,\n", | |
" 'this': 11,\n", | |
" 'to': 5,\n", | |
" 'was': 13,\n", | |
" 'with': 16}" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 13 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "yUIom0-NBSif", | |
"colab_type": "text" | |
}, | |
"source": [ | |
        "Now let's add a few special words for use later. This is a common practice for NLP problems, where it is important to bring consistency to text strings by giving each a start token, a token to handle words outside this dictionary, as well as padding to ensure all text data have the same length. We bump the original words by three positions and append the following new words to the dictionary. We also reverse the key-value relationship and create a new dictionary for reverse lookup. In addition, we create a function `decode_review` to convert data from integers into words." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "liqZzBcOBNz9", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
        "# The first indices are reserved\n", | |
"word_index = {k:(v+3) for k,v in word_index.items()} \n", | |
"word_index[\"<PAD>\"] = 0\n", | |
"word_index[\"<START>\"] = 1\n", | |
"word_index[\"<UNK>\"] = 2 # unknown\n", | |
"word_index[\"<UNUSED>\"] = 3\n", | |
"\n", | |
"reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])\n", | |
"\n", | |
"def decode_review(text):\n", | |
" return ' '.join([reverse_word_index.get(i, '?') for i in text])" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "HyrZPrwgBWgF", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"example1 = decode_review(x_train[0])" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Jkka0q2RBXsU", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"new_x_train=x_train.reshape(len(x_train), 1)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "S-EDAP6NBYnV", | |
"colab_type": "code", | |
"outputId": "6db09542-1ea0-43d5-d7d4-a2addf9d6d2f", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"mylen = np.vectorize(len)\n", | |
"print(mylen(x_train))" | |
], | |
"execution_count": 17, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"[218 189 141 ... 184 150 153]\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "vGTzBi6LBfXD", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Now find the index for positive and negative reviews" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "EQMIKqblBZci", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"positive_index = np.where(y_train == 1) \n", | |
"negative_index = np.where(y_train == 0)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "8iibMRrWBhu0", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"positive_reviews = x_train[positive_index]\n", | |
"negative_reviews = x_train[negative_index]" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "JKc4btE2Biuq", | |
"colab_type": "code", | |
"outputId": "5ddc9bec-5474-4379-9ece-f312050bd6ba", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"np.shape(positive_reviews)" | |
], | |
"execution_count": 20, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(12500,)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 20 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "R057R0rAoRxX", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"#5.Explore Data - Part 2" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "6JVEvikRCkWF", | |
"colab_type": "text" | |
}, | |
"source": [ | |
        "Select 50% of the positive reviews and drop the rest of them (setting the stage for later, where we oversample them)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "uM_UkYsrFkfQ", | |
"colab_type": "code", | |
"outputId": "7742d20c-bf0f-47b2-9c3a-fcd90fe92634", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"positive_index" | |
], | |
"execution_count": 21, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(array([ 0, 3, 6, ..., 24994, 24995, 24998]),)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 21 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "fmDyaeykG6u6", | |
"colab_type": "code", | |
"outputId": "98cdb997-983f-48d7-cfc4-c3e2cd3d9e62", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 54 | |
} | |
}, | |
"source": [ | |
"print(x_test[0])" | |
], | |
"execution_count": 22, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"[1, 591, 202, 14, 31, 6, 717, 10, 10, 18142, 10698, 5, 4, 360, 7, 4, 177, 5760, 394, 354, 4, 123, 9, 1035, 1035, 1035, 10, 10, 13, 92, 124, 89, 488, 7944, 100, 28, 1668, 14, 31, 23, 27, 7479, 29, 220, 468, 8, 124, 14, 286, 170, 8, 157, 46, 5, 27, 239, 16, 179, 15387, 38, 32, 25, 7944, 451, 202, 14, 6, 717]\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Z_Vg4CS1H0Bc", | |
"colab_type": "code", | |
"outputId": "9a89189a-1ce9-4bef-d2e1-6f86c6d356ef", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 54 | |
} | |
}, | |
"source": [ | |
"print(x_train[0])" | |
], | |
"execution_count": 23, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "yvKgL7P8IEgP", | |
"colab_type": "text" | |
}, | |
"source": [ | |
        "There is no direct label in x_train or x_test. We will have to use y_train, i.e. \n", | |
"\n", | |
"positive_index = np.where(y_train == 1) \n", | |
"\n", | |
"to get a subset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "RGWHRtdGH5X9", | |
"colab_type": "code", | |
"outputId": "4a473bb0-5108-4d12-ae3a-4c0c5fcd80b4", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"positive_index" | |
], | |
"execution_count": 24, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(array([ 0, 3, 6, ..., 24994, 24995, 24998]),)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 24 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-d63V8cLMmB9", | |
"colab_type": "code", | |
"outputId": "5321990b-6964-4073-bb51-595c2adf1024", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"positive_reviews.shape" | |
], | |
"execution_count": 25, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(12500,)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 25 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1PkGWrcrTft1", | |
"colab_type": "code", | |
"outputId": "71fb1db3-123b-48c6-be0d-d448e743be50", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 90 | |
} | |
}, | |
"source": [ | |
"positive_reviews[0:2]" | |
], | |
"execution_count": 26, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),\n", | |
" list([1, 4, 18609, 16085, 33, 2804, 4, 2040, 432, 111, 153, 103, 4, 1494, 13, 70, 131, 67, 11, 61, 15305, 744, 35, 3715, 761, 61, 5766, 452, 9214, 4, 985, 7, 64317, 59, 166, 4, 105, 216, 1239, 41, 1797, 9, 15, 7, 35, 744, 2413, 31, 8, 4, 687, 23, 4, 33929, 7339, 6, 3693, 42, 38, 39, 121, 59, 456, 10, 10, 7, 265, 12, 575, 111, 153, 159, 59, 16, 1447, 21, 25, 586, 482, 39, 4, 96, 59, 716, 12, 4, 172, 65, 9, 579, 11, 6004, 4, 1615, 5, 23005, 7, 5168, 17, 13, 7064, 12, 19, 6, 464, 31, 314, 11, 87564, 6, 719, 605, 11, 8, 202, 27, 310, 4, 3772, 3501, 8, 2722, 58, 10, 10, 537, 2116, 180, 40, 14, 413, 173, 7, 263, 112, 37, 152, 377, 4, 537, 263, 846, 579, 178, 54, 75, 71, 476, 36, 413, 263, 2504, 182, 5, 17, 75, 2306, 922, 36, 279, 131, 2895, 17, 2867, 42, 17, 35, 921, 18435, 192, 5, 1219, 3890, 19, 20523, 217, 4122, 1710, 537, 20341, 1236, 5, 736, 10, 10, 61, 403, 9, 47289, 40, 61, 4494, 5, 27, 4494, 159, 90, 263, 2311, 4319, 309, 8, 178, 5, 82, 4319, 4, 65, 15, 9225, 145, 143, 5122, 12, 7039, 537, 746, 537, 537, 15, 7979, 4, 18665, 594, 7, 5168, 94, 9096, 3987, 15242, 11, 28280, 4, 538, 7, 1795, 246, 56615, 9, 10161, 11, 635, 14, 9, 51, 408, 12, 94, 318, 1382, 12, 47, 6, 2683, 936, 5, 6307, 10197, 19, 49, 7, 4, 1885, 13699, 1118, 25, 80, 126, 842, 10, 10, 47289, 18223, 4726, 27, 4494, 11, 1550, 3633, 159, 27, 341, 29, 2733, 19, 4185, 173, 7, 90, 16376, 8, 30, 11, 4, 1784, 86, 1117, 8, 3261, 46, 11, 25837, 21, 29, 9, 2841, 23, 4, 1010, 26747, 793, 6, 13699, 1386, 1830, 10, 10, 246, 50, 9, 6, 2750, 1944, 746, 90, 29, 16376, 8, 124, 4, 882, 4, 882, 496, 27, 33029, 2213, 537, 121, 127, 1219, 130, 5, 29, 494, 8, 124, 4, 882, 496, 4, 341, 7, 27, 846, 10, 10, 29, 9, 1906, 8, 97, 6, 236, 11120, 1311, 8, 4, 23643, 7, 31, 7, 29851, 91, 22793, 3987, 70, 4, 882, 30, 579, 42, 9, 12, 32, 11, 537, 10, 10, 11, 14, 65, 44, 537, 75, 11876, 1775, 3353, 12716, 1846, 4, 11286, 7, 154, 5, 4, 518, 53, 13243, 11286, 7, 3211, 882, 11, 399, 38, 75, 257, 3807, 19, 18223, 17, 29, 456, 4, 65, 7, 
27, 205, 113, 10, 10, 33058, 4, 22793, 10359, 9, 242, 4, 91, 1202, 11377, 5, 2070, 307, 22, 7, 5168, 126, 93, 40, 18223, 13, 188, 1076, 3222, 19, 4, 13465, 7, 2348, 537, 23, 53, 537, 21, 82, 40, 18223, 13, 33195, 14, 280, 13, 219, 4, 52788, 431, 758, 859, 4, 953, 1052, 12283, 7, 5991, 5, 94, 40, 25, 238, 60, 35410, 4, 15812, 804, 27767, 7, 4, 9941, 132, 8, 67, 6, 22, 15, 9, 283, 8, 5168, 14, 31, 9, 242, 955, 48, 25, 279, 22148, 23, 12, 1685, 195, 25, 238, 60, 796, 13713, 4, 671, 7, 2804, 5, 4, 559, 154, 888, 7, 726, 50, 26, 49, 7008, 15, 566, 30, 579, 21, 64, 2574])],\n", | |
" dtype=object)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 26 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "BaxD--O7oi3z", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# 6. Subset of Positive Reviews\n",
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Ydk9EyBTk8eN", | |
"colab_type": "code", | |
"outputId": "bf2dabd8-ad9b-4244-b3bf-f2dc8c2c54e6", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"np.shape(positive_index)" | |
], | |
"execution_count": 27, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(1, 12500)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 27 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "stbZAgeDlL2J", | |
"colab_type": "code", | |
"outputId": "3e887134-1758-4f10-f8aa-bcdd02203aa8", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"np.shape(positive_reviews)[0]" | |
], | |
"execution_count": 28, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"12500" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 28 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "8cymNV2Xowkd", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"We need to sample 12,500 indices so that the resampled subset matches the size of the original positive set."
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "y_jx1Hl9lga-", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"i_pos_elements = np.shape(positive_reviews)[0]" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "d548jGSmlqCi", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"subset_positive_index = np.random.choice(positive_index[0],i_pos_elements,replace=False)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "th3Bpc_Nlufc", | |
"colab_type": "code", | |
"outputId": "6386a5d1-3f65-4760-8755-7156e2ab8eae", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"subset_positive_index" | |
], | |
"execution_count": 31, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([ 1073, 11742, 12781, ..., 15330, 229, 10568])" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 31 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "MTdlXmq8lzV1", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"subset_positive_reviews = x_train[subset_positive_index]" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "YjTlS2-ll6sv", | |
"colab_type": "code", | |
"outputId": "14f7c440-141f-4afb-ea1b-8c342300b116", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 178 | |
} | |
}, | |
"source": [ | |
"subset_positive_reviews" | |
], | |
"execution_count": 33, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([list([1, 165, 14, 20, 16, 24, 38, 78, 12, 1367, 206, 212, 5, 2318, 50, 26, 52, 156, 11, 14, 22, 18, 1825, 7517, 39973, 18746, 39, 4, 1419, 3693, 37, 299, 18063, 160, 73, 573, 284, 9, 3546, 4224, 39, 2039, 5, 289, 8822, 4, 293, 105, 26, 256, 34, 3546, 5788, 17, 6184, 37, 16, 184, 52, 5, 82, 163, 21, 4, 31, 37, 91, 770, 72, 16, 628, 8335, 17, 4500, 39520, 29, 299, 6, 275, 109, 74, 29, 633, 127, 88, 11, 85, 108, 40, 4, 1419, 3693, 1395, 5808, 4, 31025, 42, 4, 43737, 29, 299, 6, 55, 2259, 415, 5, 11, 7242, 4, 299, 220, 4, 1961, 6, 132, 209, 101, 1438, 63, 16, 327, 8, 67, 4, 64, 66, 1566, 155, 44, 14, 22, 26, 4, 450, 1268, 7, 4, 182, 3331, 2216, 63, 166, 14, 22, 382, 168, 6, 117, 1967, 444, 13, 197, 14, 16, 6, 184, 52, 117, 22]),\n", | |
" list([1, 4, 12475, 9, 6, 680, 22, 94, 293, 109, 5264, 256, 19, 35, 1732, 1493, 7, 1382, 5, 39453, 34, 3977, 9, 24, 129, 801, 632, 5264, 9, 6, 569, 132, 37, 9, 6606, 6, 522, 1696, 113, 18358, 260, 1084, 5284, 153, 11, 4, 5951, 7, 1043, 6448, 588, 29, 150, 659, 309, 11600, 46, 5, 2724, 2610, 5, 38, 103, 6, 580, 2180, 33, 6, 1449, 19, 1738, 5060, 6832, 29, 26797, 23, 5, 778, 6, 27792, 2094, 1862, 1738, 4, 7769, 327, 232, 9, 1951, 19, 49, 538, 11, 27, 205, 113, 5, 882, 30, 579, 100, 361, 6, 464, 17, 73, 4, 107, 97, 35, 2076, 2025, 5, 1738, 3775, 187, 8, 842, 21409, 65, 60, 103, 2799, 4, 13534, 882, 44, 21409, 157, 10, 10, 12475, 6077, 6, 875, 24, 340, 12916, 7, 11, 438, 4, 1210, 632, 5059, 108, 40, 79218, 5, 76289, 15587, 72568, 216, 8, 330, 21, 12475, 12677, 11, 450, 1317, 771, 86, 7, 32, 4, 880, 5, 599, 9, 6270, 21, 115, 66, 617, 11, 101, 1589, 1217, 15, 48, 25, 26, 35, 206, 20, 4301, 267, 18, 35, 10563, 3409, 14, 20, 80, 242, 4363, 25, 5, 333, 1025, 91, 1210, 632, 108, 12475, 166, 57, 589, 8, 123, 10506, 5, 3265, 39, 94, 293, 109, 21409, 292, 9, 331, 1353, 17, 35, 21175, 9, 51, 12, 25671, 243, 7, 155, 14, 9, 1732, 348, 15, 3816, 3816, 7, 178, 62, 1133, 880, 18, 278, 2993, 5, 246, 14, 12894, 1483, 9, 382, 51, 166, 4, 22, 235, 2902, 261, 75, 92, 40, 8, 974, 12, 220, 233, 100, 413, 4885, 103, 75, 122, 12, 196, 195, 279, 60, 588, 122, 21409, 1484, 1833, 8, 783, 37, 9, 2648, 8, 28, 84, 556, 37, 694, 4, 20, 115, 2033, 19, 134, 1204, 4, 1152, 9, 23, 5264, 5, 27, 9129, 12807, 83, 6, 2038, 1862, 48, 25, 332, 44, 294, 40, 5264, 11, 4, 2300, 25, 62, 28, 6, 2581, 197, 15, 84, 40, 90, 144, 30, 3314, 46, 7, 926, 40, 6, 5388, 21, 918, 8, 106, 27, 113, 25, 26, 1309, 11, 34, 27, 1596, 1946, 2506, 18, 4, 2113, 13, 482, 10, 10, 14256, 193, 23, 5264, 9, 73, 224, 5, 1685, 4333, 29, 152, 340, 4313, 309, 39, 27, 592, 1648, 52, 272, 5, 3557, 1382, 21, 247, 43, 1608, 1193, 8828, 83, 4, 1493, 916, 42, 2625, 4613, 11, 4, 655, 9796, 3958, 5, 2117, 7038, 39, 8122, 1382, 8, 3787, 18792, 5, 
619, 1680, 16043, 18, 5607, 12, 941, 25, 3099, 44, 27, 4400, 23, 27, 7617, 5, 89, 12, 80, 4525, 148, 187, 90, 45, 6, 227, 40, 2621, 8, 6, 3654, 1799, 15, 2036, 5, 5167, 1936, 6, 355, 854, 137, 29, 299, 21, 12, 495, 4, 108, 64, 85, 678, 217, 15, 7, 1738, 9, 24, 754, 17, 14411, 6832, 505, 11, 6, 1156, 48, 29732, 239, 17, 6, 1988, 914, 19, 6, 1988, 914, 113, 5, 712, 10, 10, 4, 22, 152, 66, 28, 101, 666, 7639, 42, 1983, 314, 27759, 1299, 21, 13, 286, 502, 8, 482, 4, 277, 5, 12, 421, 2349, 12, 152, 28, 101, 933, 4345, 42, 3580, 6102, 5, 246, 12, 421, 55, 406, 5, 12, 161, 28, 101, 483, 11960, 519, 3227, 42, 1056, 3353, 5, 246, 13, 197, 4, 1794, 16, 73, 224, 5, 13, 16, 115, 1100, 279, 4, 64, 147, 749, 133, 9, 44, 4, 406, 359, 8, 2101, 46, 5, 97, 5670, 19, 31, 160, 5, 89, 148, 738, 28, 57, 1515, 42657, 60, 6, 2748, 738, 369, 5, 60, 52, 84, 70, 30, 369, 19, 78, 84, 45, 6, 931, 23, 4, 680, 1526, 182, 75, 412, 11, 6, 52, 22, 290, 319]),\n", | |
" list([1, 13, 104, 14, 20, 69, 8, 30, 253, 8, 97, 12, 18, 178, 12, 16, 253, 8, 106, 12, 4, 156, 168, 40, 36, 28, 6, 253, 58, 61, 7576, 40, 4, 430, 156, 5, 61, 9344, 40, 4, 250, 156, 24, 55, 76, 81, 75, 79, 8, 28, 932, 253, 19, 6, 20, 15, 9, 189, 97, 13, 67, 6, 176, 7, 629, 102, 5, 13, 62, 106, 14, 31, 32, 295, 280, 53, 42, 53, 88, 75, 462, 295, 48, 14, 156, 97, 85, 629, 102, 13, 80, 106, 98, 4, 18468, 1168, 132, 1584, 1313, 8, 516, 4, 156, 9, 55, 76, 6, 52, 78, 132, 29, 97, 178, 462, 295, 4, 91, 13, 62, 202, 14, 20, 6, 312, 603, 48, 25, 942, 72, 10, 10, 13, 92, 124, 48, 4, 2336, 47, 101, 53, 7, 4, 102, 19, 4, 156, 21, 4, 293, 430, 9, 1036, 4, 284, 19, 4, 1758, 4539, 47, 8, 30, 24, 147, 36, 152, 168, 8, 147]),\n", | |
" ...,\n", | |
" list([1, 4, 291, 5125, 54, 38, 111, 84, 3543, 44, 4, 87, 5195, 7, 5991, 34, 3774, 8294, 82, 219, 160, 55, 478, 246, 2184, 20, 23, 4536, 479, 2098, 2356, 261, 12, 9, 6, 55, 275, 22, 262, 688, 8, 4, 1379, 810, 4, 65, 9, 270, 11, 2098, 2356, 47, 76, 11, 1141, 24, 64, 19, 2996, 5991, 21, 82, 19, 4, 55, 1103, 7, 1711, 102, 10, 10, 12, 9, 6, 22, 15, 14181, 729, 1326, 40, 548, 18, 3764, 3158, 2898, 1060, 223, 112, 6, 1317, 8, 2652, 12, 82, 832, 178, 8, 4, 55, 8514, 7, 4536, 19108, 121, 4, 406, 1357, 659, 94, 360, 112, 3391, 34, 32, 11116, 7, 876, 670, 2356, 14107, 6642, 9181, 4, 293, 109, 9, 6, 283, 632, 38, 2460, 11, 7445, 432, 7, 4536, 2163, 3102, 37, 3047, 8, 485, 27, 84, 46, 7, 11644, 10766, 34, 1043, 2287, 11551, 261, 29, 24835, 27, 904, 143, 4, 1421, 47, 8, 2712, 6, 176, 107, 183, 789, 11, 27, 330, 6238, 2898, 15, 29, 9, 348, 34, 309, 5, 119, 8, 27, 255, 1083, 14107, 2875, 9510, 15, 832, 90, 8, 2076, 687, 10, 10, 549, 18, 4, 221, 1500, 5, 179, 3983, 206, 4, 20, 9, 1061, 19, 371, 1380, 2057, 14, 2367, 47, 8, 81, 199, 19, 4, 732, 5, 1979, 7, 4, 22, 17, 73, 17, 4, 6139, 111, 906, 388, 789, 11, 4, 330, 7, 101, 529, 37, 70, 1741, 533, 35, 2618, 83, 1614, 1218, 18, 72, 4, 91, 3620, 136, 16, 11, 4, 19108, 54, 2098, 2356, 716, 27, 1013, 51, 2898, 66, 817, 95, 29, 3292, 98, 245, 5, 1293, 166, 119, 8, 27, 255, 4, 136, 29, 2949, 48187, 7, 13839, 308, 1487, 8, 4, 10353, 9, 82, 290, 6, 168, 17, 6, 1380, 1114, 10, 10, 7, 265, 50, 9, 49, 2177, 567, 40, 11, 4, 6934, 18, 1825, 21, 13, 92, 104, 15, 14, 567, 62, 30, 17, 17828, 17, 11, 111, 85, 682, 108, 94, 8508, 9, 40, 101, 85, 65509, 8482, 5, 5614, 7, 148, 211, 71, 66, 622, 5, 50, 62, 30, 57, 213, 11, 3208, 12, 4, 91, 1193, 136, 18, 72, 16, 4, 1520, 224, 23, 2098, 18069, 322, 34, 4, 1019, 7, 4, 65, 15385, 7098, 1756, 4915, 45, 371, 2317, 5, 362, 144, 407, 789, 245, 190, 32, 4, 360, 9, 1604, 2295, 246, 50, 9, 31, 1251, 15, 93, 72, 66, 119, 14, 20, 4, 354, 10, 10, 32, 4, 177, 81, 404, 2632, 39, 4, 971, 15628, 9181, 37, 2352, 55, 73, 
8, 4, 217, 7, 3676, 2510, 565, 4536, 132, 8, 4, 696, 177, 7, 1620, 3585, 37, 2235, 3799, 33852, 6, 1295, 2413, 7, 394, 1421, 37, 764, 3969, 19, 4, 445, 7, 325, 2875, 9510, 9, 55, 478, 17, 1083, 14107, 5, 47, 49, 7, 4, 91, 307, 388, 11, 4, 22, 50, 9, 1175, 200, 1563, 9510, 5, 443, 9181, 11, 111, 7, 68, 139, 308, 1487, 31, 7, 4, 118, 698, 156, 127, 6, 1307, 292, 17, 48187, 7, 13839, 6, 12325, 132, 18, 937, 278, 9, 4, 5529, 11, 410, 3301, 34, 101, 817, 13, 40, 15, 75872, 7, 27, 1146, 21, 4, 147, 1019, 9, 256, 34, 1756, 4915, 37, 371, 4661, 5137, 499, 7, 27, 109, 15385, 6, 132, 37, 18573, 119, 37, 1388, 325, 5, 37, 659, 283, 4021, 11, 1520, 5, 4682, 21, 40, 11, 101, 52, 1711, 14, 3160, 445, 215, 169, 94, 130, 10, 10, 5, 31, 53, 1251, 4, 621, 603, 141, 906, 5, 3178, 3729, 15, 26, 2725, 8, 481, 11, 4, 4371, 18, 196, 4, 477, 561, 2842, 72, 8, 1674, 24, 64, 88, 7, 4, 936, 12, 5603, 21, 88, 13, 1685, 2505, 134, 24356, 3729, 19, 4, 11116, 7, 1979, 5168, 1287, 11, 4, 483, 7, 94, 8082, 5, 14, 20, 1578, 72, 7, 15, 7965, 962, 12, 16, 17, 48, 61, 333, 1311, 8, 5168, 10, 10, 2098, 2356, 9, 6, 55, 327, 20, 55, 73, 526, 3551, 5, 917, 12, 382, 127, 24, 3215, 5991, 19, 94, 2093, 732, 5, 9909, 7, 2260, 11, 985, 139, 21, 12, 9, 6, 1018, 196, 22, 19, 76, 692, 2611, 23, 31, 55, 2681, 792, 6, 438, 144, 28, 1380, 722, 2505, 19, 5348, 4421, 13, 66, 510, 15, 22, 81, 24, 9298, 8, 683, 12, 21677, 19108, 7, 722, 5, 967, 12, 790, 158]),\n", | |
" list([1, 13, 69, 2721, 557, 7, 1674, 7, 11126, 6, 137, 145, 5, 12, 2774, 40, 142, 474, 30, 83, 21, 19, 32, 4, 108, 13, 28, 582, 11, 23, 6, 1988, 2857, 12, 1932, 1583, 125, 61, 9959, 137, 10847, 187, 4, 719, 53162, 13, 8478, 4, 953, 18, 14, 31, 5, 7258, 12, 56, 13, 28, 8, 135, 146, 184, 1264, 13, 122, 1674, 7, 11126, 9, 6, 680, 2207, 518, 2612, 948, 22, 63, 9, 643, 22202, 34, 45, 540, 364, 352, 21, 9, 131, 35, 441, 5, 2640, 106, 10, 10, 1674, 7, 11126, 4615, 187, 4, 2615, 2853, 448, 1848, 38637, 1215, 604, 15, 9302, 10702, 5, 85, 12505, 11, 4, 4856, 7, 4432, 5, 81062, 4, 19271, 2419, 2624, 21, 684, 134, 3362, 157, 345, 32, 99, 73, 42, 24, 73, 195, 5597, 23, 129, 7172, 17, 465, 1929, 26, 24, 64, 65724, 21, 82, 7954, 725, 5680, 1484, 10, 10, 4, 22, 9, 579, 6828, 405, 19, 6, 346, 21, 906, 5, 744, 636, 7228, 720, 5, 95, 17520, 83, 4, 289, 537, 15, 97, 56, 4, 6137, 7, 4, 22, 10, 10, 4, 86, 12271, 9, 44, 6, 3944, 37, 4726, 31, 7, 4, 1215, 1066, 37, 9, 112, 1428, 33, 6, 1752, 1550, 4, 3944, 271, 11, 467, 1108, 7, 1786, 8, 2301, 4, 1848, 38637, 1215, 21, 75, 169, 15, 41, 283, 4204, 203, 569, 6, 117, 2439, 8, 344, 54, 4, 2704, 304, 6, 1115, 471, 4, 3944, 659, 15, 59, 203, 28, 1840, 11, 120, 41, 419, 10, 10, 4, 333, 173, 32535, 3277, 6, 1115, 185, 132, 37, 9, 9646, 8, 1987, 16124, 11, 18899, 7, 6, 1172, 4130, 18, 3318, 6, 185, 132, 83, 6, 7264, 75, 169, 15, 4, 9121, 1042, 11, 888, 9, 165, 6, 1848, 38637, 17047, 5, 27, 22737, 4874, 26, 230, 39, 4, 5797, 10, 10, 4, 2722, 65, 11126, 3054, 187, 6, 37078, 1804, 18051, 5, 27, 3217, 37, 3126, 10274, 18, 6, 12405, 54, 4, 18051, 10932, 1525, 31, 7, 27, 13635, 5, 165, 35842, 6, 1147, 63, 69, 77, 13717, 41, 4, 2734, 9, 150, 879, 8, 15480, 5, 267, 18, 6, 162, 3232, 10, 10, 13, 3525, 135, 13, 66, 510, 1674, 7, 11126, 50, 26, 49, 4410, 19, 4, 22, 15, 401, 12, 39, 112, 371, 321, 21, 12, 9, 35, 204, 5, 3548, 22, 18, 51, 12, 9, 61, 1126, 10679, 19, 4, 365, 9, 4, 338, 5, 3658, 120, 2278, 414, 4, 3110, 9, 1492, 2291, 5, 13, 62, 28, 76, 5947, 
8, 28, 6, 9048, 5450, 19, 4, 204, 1101, 1406, 49, 1989, 28, 301, 4, 116, 9, 338, 63, 13, 92, 2707, 1041, 19, 13, 104, 15, 4, 3110, 9, 38, 583, 18172, 15, 12, 166, 4, 354, 306, 18577, 63, 218, 66, 4, 420, 11, 192, 6, 171, 7, 4, 354, 26, 184, 1543, 2860, 4, 1042, 11, 4, 333, 2052, 5, 4, 8603, 11, 4, 840, 6918, 216, 8, 330, 5, 2878, 4, 599, 3709, 26, 55, 73, 224, 18, 6, 364, 352, 22, 19, 49, 2177, 139, 7, 36662, 14765, 2864, 77306, 20569, 6, 14144, 11, 4, 3681, 1720, 49, 542, 21, 16759, 5088, 532, 38675, 5, 6, 171, 85, 13373, 1376, 11, 18, 52, 4160, 24, 17, 2683, 17, 49, 7, 4, 53, 1572, 599, 108, 46, 50, 21, 407, 3417, 74, 129, 856, 189, 2396, 13, 82, 258, 4, 65, 3828, 4, 1215, 604, 8, 30, 1774, 5, 6, 2378, 653, 8, 4, 801, 189, 65, 1835, 50, 26, 958, 7, 139, 7, 2038, 841, 5, 1074, 4, 1232, 7, 63, 13, 774, 216, 638, 11, 6, 137, 261, 3048, 11, 49, 7664, 771, 13, 131, 104, 1674, 7, 11126, 80, 1271, 8, 91, 2862, 189, 797, 49, 139, 203, 1970, 99, 76, 18, 4, 53, 2483, 529, 407, 1178, 709, 457, 158]),\n", | |
" list([1, 6, 223, 1586, 304, 178, 145, 8, 6, 329, 2732, 58, 11, 938, 45, 24325, 8, 67, 89, 275, 285, 16, 145, 95, 12, 16, 6, 53, 1356, 999, 11, 263, 704, 5, 75, 106, 6, 84798, 223, 1951, 11, 183, 295, 4, 22, 82, 3734, 4, 454, 7, 4, 201, 1890, 4, 2966, 223, 10, 10, 4, 22, 526, 34, 742, 50482, 9, 448, 23, 6, 1112, 297, 1919, 592, 2966, 5, 27, 322, 81794, 26, 4432, 6, 4816, 3112, 15, 215, 30, 3343, 19, 1021, 577, 1820, 266, 344, 103, 59, 47, 2534, 6, 1031, 24916, 5, 41, 658, 1777, 41, 1893, 33, 4, 172, 58, 160, 577, 5103, 961, 344, 6, 9756, 37, 9, 788, 80, 6427, 19, 41, 336, 4, 654, 1629, 27519, 1798, 186, 8, 30, 4, 64, 31, 209, 6, 439, 366, 27, 452, 166, 90, 12246, 6290, 8, 4, 837, 142, 29, 9, 5526, 8, 81, 10, 10, 2964, 8, 135, 1919, 2966, 80, 1970, 138, 29, 694, 118, 17, 29, 1457, 6, 1347, 83, 206, 8, 79, 316, 295, 174, 103, 32, 29, 9, 6, 132, 15, 5627, 24, 64, 4, 1164, 21, 89, 8, 855, 19, 148, 1005, 1929, 15, 5589, 27, 2090, 11, 4, 1830, 5, 51, 80, 97, 27, 223, 654, 10, 10, 6994, 5123, 299, 1919, 2966, 19, 5380, 29, 9, 4, 12303, 15, 1777, 285, 295, 3673, 25658, 9, 110, 17, 3736, 4, 452, 4111, 5209, 47, 6, 392, 173, 11, 14, 22, 21, 29, 9, 17, 210, 253, 8, 106, 14596, 2238, 5, 2806, 58533, 1482, 17, 4, 2847, 5103, 5, 1820, 8479, 9948, 5, 3962, 9615, 26, 82, 2561, 11, 4, 22, 17, 2965, 80068, 5, 6290, 4, 250, 15, 2459, 1798, 19, 41, 936, 10, 10, 6, 223, 1586, 9, 6, 52, 96, 8, 7679, 263, 501, 143, 4, 1123, 1459, 4218, 7, 35, 298, 223])],\n", | |
" dtype=object)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 33 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "C0sH_u3rnN5I", | |
"colab_type": "code", | |
"outputId": "182c4972-cd08-4a83-9628-bd90db39306a", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"np.shape(subset_positive_reviews)" | |
], | |
"execution_count": 34, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(12500,)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 34 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "a7ale079naVz", | |
"colab_type": "code", | |
"outputId": "db11bec4-2b37-473d-8aa0-ee517611318a", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"np.shape(positive_reviews)" | |
], | |
"execution_count": 35, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(12500,)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 35 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "86dyVvB_nSVU", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"This confirms the resampled subset has the same shape as the original set of positive reviews, so the resampling target has been met."
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "I0_KMCT9m_Ih", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# 7.Concatenating Pos Subset + Neg\n", | |
"\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "gksOAgtdNBbe", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Combining Data (Pos + Neg)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-bl7UySbsf0q", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"new_data = np.concatenate((subset_positive_reviews, negative_reviews))" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "kMVAZHl9Z1Oo", | |
"colab_type": "code", | |
"outputId": "030c605f-7b42-47b5-9ceb-9bff3afe27a8", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 90 | |
} | |
}, | |
"source": [ | |
"new_data[0:2]" | |
], | |
"execution_count": 37, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([list([1, 165, 14, 20, 16, 24, 38, 78, 12, 1367, 206, 212, 5, 2318, 50, 26, 52, 156, 11, 14, 22, 18, 1825, 7517, 39973, 18746, 39, 4, 1419, 3693, 37, 299, 18063, 160, 73, 573, 284, 9, 3546, 4224, 39, 2039, 5, 289, 8822, 4, 293, 105, 26, 256, 34, 3546, 5788, 17, 6184, 37, 16, 184, 52, 5, 82, 163, 21, 4, 31, 37, 91, 770, 72, 16, 628, 8335, 17, 4500, 39520, 29, 299, 6, 275, 109, 74, 29, 633, 127, 88, 11, 85, 108, 40, 4, 1419, 3693, 1395, 5808, 4, 31025, 42, 4, 43737, 29, 299, 6, 55, 2259, 415, 5, 11, 7242, 4, 299, 220, 4, 1961, 6, 132, 209, 101, 1438, 63, 16, 327, 8, 67, 4, 64, 66, 1566, 155, 44, 14, 22, 26, 4, 450, 1268, 7, 4, 182, 3331, 2216, 63, 166, 14, 22, 382, 168, 6, 117, 1967, 444, 13, 197, 14, 16, 6, 184, 52, 117, 22]),\n", | |
" list([1, 4, 12475, 9, 6, 680, 22, 94, 293, 109, 5264, 256, 19, 35, 1732, 1493, 7, 1382, 5, 39453, 34, 3977, 9, 24, 129, 801, 632, 5264, 9, 6, 569, 132, 37, 9, 6606, 6, 522, 1696, 113, 18358, 260, 1084, 5284, 153, 11, 4, 5951, 7, 1043, 6448, 588, 29, 150, 659, 309, 11600, 46, 5, 2724, 2610, 5, 38, 103, 6, 580, 2180, 33, 6, 1449, 19, 1738, 5060, 6832, 29, 26797, 23, 5, 778, 6, 27792, 2094, 1862, 1738, 4, 7769, 327, 232, 9, 1951, 19, 49, 538, 11, 27, 205, 113, 5, 882, 30, 579, 100, 361, 6, 464, 17, 73, 4, 107, 97, 35, 2076, 2025, 5, 1738, 3775, 187, 8, 842, 21409, 65, 60, 103, 2799, 4, 13534, 882, 44, 21409, 157, 10, 10, 12475, 6077, 6, 875, 24, 340, 12916, 7, 11, 438, 4, 1210, 632, 5059, 108, 40, 79218, 5, 76289, 15587, 72568, 216, 8, 330, 21, 12475, 12677, 11, 450, 1317, 771, 86, 7, 32, 4, 880, 5, 599, 9, 6270, 21, 115, 66, 617, 11, 101, 1589, 1217, 15, 48, 25, 26, 35, 206, 20, 4301, 267, 18, 35, 10563, 3409, 14, 20, 80, 242, 4363, 25, 5, 333, 1025, 91, 1210, 632, 108, 12475, 166, 57, 589, 8, 123, 10506, 5, 3265, 39, 94, 293, 109, 21409, 292, 9, 331, 1353, 17, 35, 21175, 9, 51, 12, 25671, 243, 7, 155, 14, 9, 1732, 348, 15, 3816, 3816, 7, 178, 62, 1133, 880, 18, 278, 2993, 5, 246, 14, 12894, 1483, 9, 382, 51, 166, 4, 22, 235, 2902, 261, 75, 92, 40, 8, 974, 12, 220, 233, 100, 413, 4885, 103, 75, 122, 12, 196, 195, 279, 60, 588, 122, 21409, 1484, 1833, 8, 783, 37, 9, 2648, 8, 28, 84, 556, 37, 694, 4, 20, 115, 2033, 19, 134, 1204, 4, 1152, 9, 23, 5264, 5, 27, 9129, 12807, 83, 6, 2038, 1862, 48, 25, 332, 44, 294, 40, 5264, 11, 4, 2300, 25, 62, 28, 6, 2581, 197, 15, 84, 40, 90, 144, 30, 3314, 46, 7, 926, 40, 6, 5388, 21, 918, 8, 106, 27, 113, 25, 26, 1309, 11, 34, 27, 1596, 1946, 2506, 18, 4, 2113, 13, 482, 10, 10, 14256, 193, 23, 5264, 9, 73, 224, 5, 1685, 4333, 29, 152, 340, 4313, 309, 39, 27, 592, 1648, 52, 272, 5, 3557, 1382, 21, 247, 43, 1608, 1193, 8828, 83, 4, 1493, 916, 42, 2625, 4613, 11, 4, 655, 9796, 3958, 5, 2117, 7038, 39, 8122, 1382, 8, 3787, 18792, 5, 
619, 1680, 16043, 18, 5607, 12, 941, 25, 3099, 44, 27, 4400, 23, 27, 7617, 5, 89, 12, 80, 4525, 148, 187, 90, 45, 6, 227, 40, 2621, 8, 6, 3654, 1799, 15, 2036, 5, 5167, 1936, 6, 355, 854, 137, 29, 299, 21, 12, 495, 4, 108, 64, 85, 678, 217, 15, 7, 1738, 9, 24, 754, 17, 14411, 6832, 505, 11, 6, 1156, 48, 29732, 239, 17, 6, 1988, 914, 19, 6, 1988, 914, 113, 5, 712, 10, 10, 4, 22, 152, 66, 28, 101, 666, 7639, 42, 1983, 314, 27759, 1299, 21, 13, 286, 502, 8, 482, 4, 277, 5, 12, 421, 2349, 12, 152, 28, 101, 933, 4345, 42, 3580, 6102, 5, 246, 12, 421, 55, 406, 5, 12, 161, 28, 101, 483, 11960, 519, 3227, 42, 1056, 3353, 5, 246, 13, 197, 4, 1794, 16, 73, 224, 5, 13, 16, 115, 1100, 279, 4, 64, 147, 749, 133, 9, 44, 4, 406, 359, 8, 2101, 46, 5, 97, 5670, 19, 31, 160, 5, 89, 148, 738, 28, 57, 1515, 42657, 60, 6, 2748, 738, 369, 5, 60, 52, 84, 70, 30, 369, 19, 78, 84, 45, 6, 931, 23, 4, 680, 1526, 182, 75, 412, 11, 6, 52, 22, 290, 319])],\n", | |
" dtype=object)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 37 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "vkiqmgi1CX5s", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Quick sanity checks: review-length statistics for the combined data"
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "bIve8VZyCamL", | |
"colab_type": "code", | |
"outputId": "d59bb26f-bd5b-4fb7-c6ed-5cddc9898edf", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 52 | |
} | |
}, | |
"source": [ | |
"length = [len(i) for i in new_data]\n", | |
"print(\"Average Review length:\", np.mean(length))\n", | |
"print(\"Standard Deviation:\", round(np.std(length)))" | |
], | |
"execution_count": 38, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Average Review length: 238.71364\n", | |
"Standard Deviation: 176.0\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Pl3q7hCENHxj", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Combining Labels" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "ErVxZa38aIBA", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Repeat the same subsetting and concatenation process for the labels."
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "a2keaVDlZ4qS", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"positive_labels = y_train[positive_index]" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "4NEDYgo6bxR-", | |
"colab_type": "code", | |
"outputId": "1321c69b-5d03-4c41-cab3-1962746a851f", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"positive_labels" | |
], | |
"execution_count": 40, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([1, 1, 1, ..., 1, 1, 1])" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 40 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "6p3eqH_ecDZn", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"negative_labels = y_train[negative_index]" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "S_F71vRvcNjR", | |
"colab_type": "code", | |
"outputId": "6f9382d8-13ad-44b4-ccf2-9e722efd8f58", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"negative_labels" | |
], | |
"execution_count": 42, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([0, 0, 0, ..., 0, 0, 0])" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 42 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "7FgdN8PjCnPN", | |
"colab_type": "code", | |
"outputId": "ed3bf7d9-f1af-477a-aa06-e72bdbaea6c1", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 54 | |
} | |
}, | |
"source": [ | |
"print(new_data[0])" | |
], | |
"execution_count": 43, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"[1, 165, 14, 20, 16, 24, 38, 78, 12, 1367, 206, 212, 5, 2318, 50, 26, 52, 156, 11, 14, 22, 18, 1825, 7517, 39973, 18746, 39, 4, 1419, 3693, 37, 299, 18063, 160, 73, 573, 284, 9, 3546, 4224, 39, 2039, 5, 289, 8822, 4, 293, 105, 26, 256, 34, 3546, 5788, 17, 6184, 37, 16, 184, 52, 5, 82, 163, 21, 4, 31, 37, 91, 770, 72, 16, 628, 8335, 17, 4500, 39520, 29, 299, 6, 275, 109, 74, 29, 633, 127, 88, 11, 85, 108, 40, 4, 1419, 3693, 1395, 5808, 4, 31025, 42, 4, 43737, 29, 299, 6, 55, 2259, 415, 5, 11, 7242, 4, 299, 220, 4, 1961, 6, 132, 209, 101, 1438, 63, 16, 327, 8, 67, 4, 64, 66, 1566, 155, 44, 14, 22, 26, 4, 450, 1268, 7, 4, 182, 3331, 2216, 63, 166, 14, 22, 382, 168, 6, 117, 1967, 444, 13, 197, 14, 16, 6, 184, 52, 117, 22]\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "3CEuC0UbC2MJ", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"new_labels = np.concatenate((positive_labels,negative_labels))" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "2kqWSioMQKNP", | |
"colab_type": "code", | |
"outputId": "7498a0e8-41b3-489c-a20d-73ddee962171", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"np.shape(new_labels)" | |
], | |
"execution_count": 45, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(25000,)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 45 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "nvIYfRyHDPvp", | |
"colab_type": "code", | |
"outputId": "5ab54bc5-7dd0-469d-b8d8-322e56cf2b82", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"print(new_labels[0])" | |
], | |
"execution_count": 46, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"1\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Ao8XR3YmN7l8", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## To Be Fixed" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "VwclAcmsOckl", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"The dataset needs to be shuffled, because all the positive reviews sit at the top and all the negative reviews at the bottom. The challenge is that the corresponding labels must stay aligned with the data during the shuffle. \n",
"\n",
"We need a way to shuffle data and labels together. One option would have been to combine data and labels in one container, shuffle it, and then split it back, but they cannot be combined in a reasonable shape/form. A simpler alternative is to shuffle a shared index instead: generate perm = np.random.permutation(len(new_data)) and index both arrays with it (new_data[perm] and new_labels[perm]), which keeps every review paired with its label."
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "72Deph3uO_ik", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"new_all = np.hstack((new_data,new_labels))" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "BZjEmvvDPhLy", | |
"colab_type": "code", | |
"outputId": "008bfcd8-64a6-44e7-b0df-5ccfe74da5d1", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"print(np.shape(new_all))" | |
], | |
"execution_count": 48, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"(50000,)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Lto9UqPbPeQd", | |
"colab_type": "code", | |
"outputId": "713a1d69-0191-495f-a325-e641746eeb76", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 54 | |
} | |
}, | |
"source": [ | |
"print(new_all[0])" | |
], | |
"execution_count": 49, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"[1, 165, 14, 20, 16, 24, 38, 78, 12, 1367, 206, 212, 5, 2318, 50, 26, 52, 156, 11, 14, 22, 18, 1825, 7517, 39973, 18746, 39, 4, 1419, 3693, 37, 299, 18063, 160, 73, 573, 284, 9, 3546, 4224, 39, 2039, 5, 289, 8822, 4, 293, 105, 26, 256, 34, 3546, 5788, 17, 6184, 37, 16, 184, 52, 5, 82, 163, 21, 4, 31, 37, 91, 770, 72, 16, 628, 8335, 17, 4500, 39520, 29, 299, 6, 275, 109, 74, 29, 633, 127, 88, 11, 85, 108, 40, 4, 1419, 3693, 1395, 5808, 4, 31025, 42, 4, 43737, 29, 299, 6, 55, 2259, 415, 5, 11, 7242, 4, 299, 220, 4, 1961, 6, 132, 209, 101, 1438, 63, 16, 327, 8, 67, 4, 64, 66, 1566, 155, 44, 14, 22, 26, 4, 450, 1268, 7, 4, 182, 3331, 2216, 63, 166, 14, 22, 382, 168, 6, 117, 1967, 444, 13, 197, 14, 16, 6, 184, 52, 117, 22]\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "zC75DhrNPpZ2", | |
"colab_type": "code", | |
"outputId": "df969761-4c2e-42da-f75d-4c233f14479a", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 54 | |
} | |
}, | |
"source": [ | |
"print(new_all[1])" | |
], | |
"execution_count": 50, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"[1, 4, 12475, 9, 6, 680, 22, 94, 293, 109, 5264, 256, 19, 35, 1732, 1493, 7, 1382, 5, 39453, 34, 3977, 9, 24, 129, 801, 632, 5264, 9, 6, 569, 132, 37, 9, 6606, 6, 522, 1696, 113, 18358, 260, 1084, 5284, 153, 11, 4, 5951, 7, 1043, 6448, 588, 29, 150, 659, 309, 11600, 46, 5, 2724, 2610, 5, 38, 103, 6, 580, 2180, 33, 6, 1449, 19, 1738, 5060, 6832, 29, 26797, 23, 5, 778, 6, 27792, 2094, 1862, 1738, 4, 7769, 327, 232, 9, 1951, 19, 49, 538, 11, 27, 205, 113, 5, 882, 30, 579, 100, 361, 6, 464, 17, 73, 4, 107, 97, 35, 2076, 2025, 5, 1738, 3775, 187, 8, 842, 21409, 65, 60, 103, 2799, 4, 13534, 882, 44, 21409, 157, 10, 10, 12475, 6077, 6, 875, 24, 340, 12916, 7, 11, 438, 4, 1210, 632, 5059, 108, 40, 79218, 5, 76289, 15587, 72568, 216, 8, 330, 21, 12475, 12677, 11, 450, 1317, 771, 86, 7, 32, 4, 880, 5, 599, 9, 6270, 21, 115, 66, 617, 11, 101, 1589, 1217, 15, 48, 25, 26, 35, 206, 20, 4301, 267, 18, 35, 10563, 3409, 14, 20, 80, 242, 4363, 25, 5, 333, 1025, 91, 1210, 632, 108, 12475, 166, 57, 589, 8, 123, 10506, 5, 3265, 39, 94, 293, 109, 21409, 292, 9, 331, 1353, 17, 35, 21175, 9, 51, 12, 25671, 243, 7, 155, 14, 9, 1732, 348, 15, 3816, 3816, 7, 178, 62, 1133, 880, 18, 278, 2993, 5, 246, 14, 12894, 1483, 9, 382, 51, 166, 4, 22, 235, 2902, 261, 75, 92, 40, 8, 974, 12, 220, 233, 100, 413, 4885, 103, 75, 122, 12, 196, 195, 279, 60, 588, 122, 21409, 1484, 1833, 8, 783, 37, 9, 2648, 8, 28, 84, 556, 37, 694, 4, 20, 115, 2033, 19, 134, 1204, 4, 1152, 9, 23, 5264, 5, 27, 9129, 12807, 83, 6, 2038, 1862, 48, 25, 332, 44, 294, 40, 5264, 11, 4, 2300, 25, 62, 28, 6, 2581, 197, 15, 84, 40, 90, 144, 30, 3314, 46, 7, 926, 40, 6, 5388, 21, 918, 8, 106, 27, 113, 25, 26, 1309, 11, 34, 27, 1596, 1946, 2506, 18, 4, 2113, 13, 482, 10, 10, 14256, 193, 23, 5264, 9, 73, 224, 5, 1685, 4333, 29, 152, 340, 4313, 309, 39, 27, 592, 1648, 52, 272, 5, 3557, 1382, 21, 247, 43, 1608, 1193, 8828, 83, 4, 1493, 916, 42, 2625, 4613, 11, 4, 655, 9796, 3958, 5, 2117, 7038, 39, 8122, 1382, 8, 3787, 18792, 5, 619, 
1680, 16043, 18, 5607, 12, 941, 25, 3099, 44, 27, 4400, 23, 27, 7617, 5, 89, 12, 80, 4525, 148, 187, 90, 45, 6, 227, 40, 2621, 8, 6, 3654, 1799, 15, 2036, 5, 5167, 1936, 6, 355, 854, 137, 29, 299, 21, 12, 495, 4, 108, 64, 85, 678, 217, 15, 7, 1738, 9, 24, 754, 17, 14411, 6832, 505, 11, 6, 1156, 48, 29732, 239, 17, 6, 1988, 914, 19, 6, 1988, 914, 113, 5, 712, 10, 10, 4, 22, 152, 66, 28, 101, 666, 7639, 42, 1983, 314, 27759, 1299, 21, 13, 286, 502, 8, 482, 4, 277, 5, 12, 421, 2349, 12, 152, 28, 101, 933, 4345, 42, 3580, 6102, 5, 246, 12, 421, 55, 406, 5, 12, 161, 28, 101, 483, 11960, 519, 3227, 42, 1056, 3353, 5, 246, 13, 197, 4, 1794, 16, 73, 224, 5, 13, 16, 115, 1100, 279, 4, 64, 147, 749, 133, 9, 44, 4, 406, 359, 8, 2101, 46, 5, 97, 5670, 19, 31, 160, 5, 89, 148, 738, 28, 57, 1515, 42657, 60, 6, 2748, 738, 369, 5, 60, 52, 84, 70, 30, 369, 19, 78, 84, 45, 6, 931, 23, 4, 680, 1526, 182, 75, 412, 11, 6, 52, 22, 290, 319]\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Pmh3y9oiPszQ", | |
"colab_type": "code", | |
"outputId": "aa402620-bb8a-419a-9ee0-523fd4f1959f", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"print(new_all[25000])" | |
], | |
"execution_count": 51, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"1\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "OV6dM6YpP6Du", | |
"colab_type": "code", | |
"outputId": "b90339a1-8c24-4797-8d07-caccb1e352ab", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 107 | |
} | |
}, | |
"source": [ | |
"print(new_all)" | |
], | |
"execution_count": 52, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"[list([1, 165, 14, 20, 16, 24, 38, 78, 12, 1367, 206, 212, 5, 2318, 50, 26, 52, 156, 11, 14, 22, 18, 1825, 7517, 39973, 18746, 39, 4, 1419, 3693, 37, 299, 18063, 160, 73, 573, 284, 9, 3546, 4224, 39, 2039, 5, 289, 8822, 4, 293, 105, 26, 256, 34, 3546, 5788, 17, 6184, 37, 16, 184, 52, 5, 82, 163, 21, 4, 31, 37, 91, 770, 72, 16, 628, 8335, 17, 4500, 39520, 29, 299, 6, 275, 109, 74, 29, 633, 127, 88, 11, 85, 108, 40, 4, 1419, 3693, 1395, 5808, 4, 31025, 42, 4, 43737, 29, 299, 6, 55, 2259, 415, 5, 11, 7242, 4, 299, 220, 4, 1961, 6, 132, 209, 101, 1438, 63, 16, 327, 8, 67, 4, 64, 66, 1566, 155, 44, 14, 22, 26, 4, 450, 1268, 7, 4, 182, 3331, 2216, 63, 166, 14, 22, 382, 168, 6, 117, 1967, 444, 13, 197, 14, 16, 6, 184, 52, 117, 22])\n", | |
" list([1, 4, 12475, 9, 6, 680, 22, 94, 293, 109, 5264, 256, 19, 35, 1732, 1493, 7, 1382, 5, 39453, 34, 3977, 9, 24, 129, 801, 632, 5264, 9, 6, 569, 132, 37, 9, 6606, 6, 522, 1696, 113, 18358, 260, 1084, 5284, 153, 11, 4, 5951, 7, 1043, 6448, 588, 29, 150, 659, 309, 11600, 46, 5, 2724, 2610, 5, 38, 103, 6, 580, 2180, 33, 6, 1449, 19, 1738, 5060, 6832, 29, 26797, 23, 5, 778, 6, 27792, 2094, 1862, 1738, 4, 7769, 327, 232, 9, 1951, 19, 49, 538, 11, 27, 205, 113, 5, 882, 30, 579, 100, 361, 6, 464, 17, 73, 4, 107, 97, 35, 2076, 2025, 5, 1738, 3775, 187, 8, 842, 21409, 65, 60, 103, 2799, 4, 13534, 882, 44, 21409, 157, 10, 10, 12475, 6077, 6, 875, 24, 340, 12916, 7, 11, 438, 4, 1210, 632, 5059, 108, 40, 79218, 5, 76289, 15587, 72568, 216, 8, 330, 21, 12475, 12677, 11, 450, 1317, 771, 86, 7, 32, 4, 880, 5, 599, 9, 6270, 21, 115, 66, 617, 11, 101, 1589, 1217, 15, 48, 25, 26, 35, 206, 20, 4301, 267, 18, 35, 10563, 3409, 14, 20, 80, 242, 4363, 25, 5, 333, 1025, 91, 1210, 632, 108, 12475, 166, 57, 589, 8, 123, 10506, 5, 3265, 39, 94, 293, 109, 21409, 292, 9, 331, 1353, 17, 35, 21175, 9, 51, 12, 25671, 243, 7, 155, 14, 9, 1732, 348, 15, 3816, 3816, 7, 178, 62, 1133, 880, 18, 278, 2993, 5, 246, 14, 12894, 1483, 9, 382, 51, 166, 4, 22, 235, 2902, 261, 75, 92, 40, 8, 974, 12, 220, 233, 100, 413, 4885, 103, 75, 122, 12, 196, 195, 279, 60, 588, 122, 21409, 1484, 1833, 8, 783, 37, 9, 2648, 8, 28, 84, 556, 37, 694, 4, 20, 115, 2033, 19, 134, 1204, 4, 1152, 9, 23, 5264, 5, 27, 9129, 12807, 83, 6, 2038, 1862, 48, 25, 332, 44, 294, 40, 5264, 11, 4, 2300, 25, 62, 28, 6, 2581, 197, 15, 84, 40, 90, 144, 30, 3314, 46, 7, 926, 40, 6, 5388, 21, 918, 8, 106, 27, 113, 25, 26, 1309, 11, 34, 27, 1596, 1946, 2506, 18, 4, 2113, 13, 482, 10, 10, 14256, 193, 23, 5264, 9, 73, 224, 5, 1685, 4333, 29, 152, 340, 4313, 309, 39, 27, 592, 1648, 52, 272, 5, 3557, 1382, 21, 247, 43, 1608, 1193, 8828, 83, 4, 1493, 916, 42, 2625, 4613, 11, 4, 655, 9796, 3958, 5, 2117, 7038, 39, 8122, 1382, 8, 3787, 18792, 5, 
619, 1680, 16043, 18, 5607, 12, 941, 25, 3099, 44, 27, 4400, 23, 27, 7617, 5, 89, 12, 80, 4525, 148, 187, 90, 45, 6, 227, 40, 2621, 8, 6, 3654, 1799, 15, 2036, 5, 5167, 1936, 6, 355, 854, 137, 29, 299, 21, 12, 495, 4, 108, 64, 85, 678, 217, 15, 7, 1738, 9, 24, 754, 17, 14411, 6832, 505, 11, 6, 1156, 48, 29732, 239, 17, 6, 1988, 914, 19, 6, 1988, 914, 113, 5, 712, 10, 10, 4, 22, 152, 66, 28, 101, 666, 7639, 42, 1983, 314, 27759, 1299, 21, 13, 286, 502, 8, 482, 4, 277, 5, 12, 421, 2349, 12, 152, 28, 101, 933, 4345, 42, 3580, 6102, 5, 246, 12, 421, 55, 406, 5, 12, 161, 28, 101, 483, 11960, 519, 3227, 42, 1056, 3353, 5, 246, 13, 197, 4, 1794, 16, 73, 224, 5, 13, 16, 115, 1100, 279, 4, 64, 147, 749, 133, 9, 44, 4, 406, 359, 8, 2101, 46, 5, 97, 5670, 19, 31, 160, 5, 89, 148, 738, 28, 57, 1515, 42657, 60, 6, 2748, 738, 369, 5, 60, 52, 84, 70, 30, 369, 19, 78, 84, 45, 6, 931, 23, 4, 680, 1526, 182, 75, 412, 11, 6, 52, 22, 290, 319])\n", | |
" list([1, 13, 104, 14, 20, 69, 8, 30, 253, 8, 97, 12, 18, 178, 12, 16, 253, 8, 106, 12, 4, 156, 168, 40, 36, 28, 6, 253, 58, 61, 7576, 40, 4, 430, 156, 5, 61, 9344, 40, 4, 250, 156, 24, 55, 76, 81, 75, 79, 8, 28, 932, 253, 19, 6, 20, 15, 9, 189, 97, 13, 67, 6, 176, 7, 629, 102, 5, 13, 62, 106, 14, 31, 32, 295, 280, 53, 42, 53, 88, 75, 462, 295, 48, 14, 156, 97, 85, 629, 102, 13, 80, 106, 98, 4, 18468, 1168, 132, 1584, 1313, 8, 516, 4, 156, 9, 55, 76, 6, 52, 78, 132, 29, 97, 178, 462, 295, 4, 91, 13, 62, 202, 14, 20, 6, 312, 603, 48, 25, 942, 72, 10, 10, 13, 92, 124, 48, 4, 2336, 47, 101, 53, 7, 4, 102, 19, 4, 156, 21, 4, 293, 430, 9, 1036, 4, 284, 19, 4, 1758, 4539, 47, 8, 30, 24, 147, 36, 152, 168, 8, 147])\n", | |
" ... 0 0 0]\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "NIb00tz2QgDQ", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"#np.reshape(new_all,(25000,2))" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "U91vgiufR9_1", | |
"colab_type": "code", | |
"outputId": "7e4a9737-da77-4191-bcfe-91d7cc70d444", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"print(np.shape(x_train))" | |
], | |
"execution_count": 54, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"(25000,)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "RSFFTDhVSC96", | |
"colab_type": "code", | |
"outputId": "732b5b8c-9491-42f8-ccd0-1d8e8b128512", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"print(np.shape(new_data))" | |
], | |
"execution_count": 55, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"(25000,)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "4kBNuyMUOceq", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "I_vhHznMNp6P", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# 8.Classification Model (Syn Data)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "5oIfLNZUQ9So", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"We would like to use a word embedding representation for the IMDB dataset.\n", | |
"\n", | |
"Let's set our vocabulary size to be the word index \n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "cQjuZB-WRS6z", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"vocab_size = len(word_index)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "NFKSKeliWJAw", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"We will bound reviews at 500 words, truncating longer reviews and zero-padding shorter reviews." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "cB7f9F6gUmjY", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"#max_words = 500 #500 takes too long, reducing it down to 50\n", | |
"max_words = 50 # reducing further \n", | |
"max_words = 16 \n", | |
"\n", | |
"x_train_new = sequence.pad_sequences(new_data, maxlen=max_words,padding='post')\n", | |
"x_test = sequence.pad_sequences(x_test, maxlen=max_words,padding='post')" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "9-0VlMFqXxVp", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"The Embedding layer is defined as the first hidden layer of a network. It must specify 3 arguments:\n", | |
"\n", | |
"* input_dim: This is the size of the vocabulary in the text data. For example, if your data is integer encoded to values between 0-10, then the size of the vocabulary would be 11 words.\n", | |
"* output_dim: This is the size of the vector space in which words will be embedded. It defines the size of the output vectors from this layer for each word. For example, it could be 32 or 100 or even larger. Test different values for your problem.\n", | |
"* input_length: This is the length of input sequences, as you would define for any input layer of a Keras model. For example, if all of your input documents are comprised of 1000 words, this would be 1000.\n", | |
"\n", | |
"SRC: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "LQ1EOSWAWTxa", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Now we can create our model. \n", | |
"\n", | |
"We will use an Embedding layer as the input layer, setting the\n", | |
"\n", | |
"* vocabulary = length of our dictionary stored in vocab_size \n", | |
"* word vector size = 8 dimensions (a vector space of dimensions in which words will be embedded)\n", | |
"* input_length = 16, stored in max_words\n", | |
"\n", | |
"The output of this first layer will be a 8×16 sized matrix " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ERCaMfh_V7Mr", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 303 | |
}, | |
"outputId": "407cc7cd-813f-42d3-f20d-6b909c8c72d2" | |
}, | |
"source": [ | |
"# create the model\n", | |
"#out_dim = 64 # too long, reducing it\n", | |
"#out_dim = 16 # 16 is still taking too long, reducing it furtehr\n", | |
"out_dim = 8\n", | |
"\n", | |
"# LSTM layer has 32 memory units.Also used dropout to prevent overfitting in this layer\n", | |
"mem_units = 32 #128\n", | |
"i_dropout = 0.2\n", | |
"\n", | |
"model = tf.keras.Sequential([\n", | |
" tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=out_dim, input_length=max_words),\n", | |
" tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(mem_units,dropout=i_dropout)),\n", | |
" tf.keras.layers.Dense(64, activation='relu'),\n", | |
" tf.keras.layers.Dense(1, activation='sigmoid')\n", | |
"])\n" | |
], | |
"execution_count": 58, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Call initializer instance with the dtype argument instead of passing it to the constructor\n", | |
"WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Call initializer instance with the dtype argument instead of passing it to the constructor\n", | |
"WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Call initializer instance with the dtype argument instead of passing it to the constructor\n", | |
"WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Call initializer instance with the dtype argument instead of passing it to the constructor\n", | |
"WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"If using Keras pass *_constraint arguments to layers.\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "WyfT9Th2eu3u", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 90 | |
}, | |
"outputId": "c9991433-1a95-4c66-824d-86af7f70efdc" | |
}, | |
"source": [ | |
"model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", | |
"#model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])" | |
], | |
"execution_count": 59, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/nn_impl.py:183: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Use tf.where in 2.0, which has the same broadcast rule as np.where\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "HZsnMhlhe5Ks", | |
"colab_type": "code", | |
"outputId": "15e02a07-61d6-4db7-a3f5-5a98b74e7ba2", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 301 | |
} | |
}, | |
"source": [ | |
"model.summary()" | |
], | |
"execution_count": 60, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Model: \"sequential\"\n", | |
"_________________________________________________________________\n", | |
"Layer (type) Output Shape Param # \n", | |
"=================================================================\n", | |
"embedding (Embedding) (None, 16, 8) 708704 \n", | |
"_________________________________________________________________\n", | |
"bidirectional (Bidirectional (None, 64) 10496 \n", | |
"_________________________________________________________________\n", | |
"dense (Dense) (None, 64) 4160 \n", | |
"_________________________________________________________________\n", | |
"dense_1 (Dense) (None, 1) 65 \n", | |
"=================================================================\n", | |
"Total params: 723,425\n", | |
"Trainable params: 723,425\n", | |
"Non-trainable params: 0\n", | |
"_________________________________________________________________\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "0JOcv1yReF8z", | |
"colab_type": "code", | |
"outputId": "089b667e-f68e-4997-bb94-f652faf42781", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 105 | |
} | |
}, | |
"source": [ | |
"# Fit the model\n", | |
"\n", | |
"i_batch_size = 1000\n", | |
"i_epochs = 2\n", | |
"\n", | |
"history = model.fit(x_train_new, new_labels, validation_data=(x_test, y_test), epochs=i_epochs, batch_size=i_batch_size, verbose=2)\n" | |
], | |
"execution_count": 61, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Train on 25000 samples, validate on 25000 samples\n", | |
"Epoch 1/2\n", | |
"25000/25000 - 5s - loss: 0.6916 - acc: 0.5478 - val_loss: 0.6870 - val_acc: 0.6312\n", | |
"Epoch 2/2\n", | |
"25000/25000 - 3s - loss: 0.6472 - acc: 0.6936 - val_loss: 0.5718 - val_acc: 0.7129\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "V0KutYqKesaz", | |
"colab_type": "code", | |
"outputId": "3ecb383d-ed61-4c21-b4af-6fa68b251f8e", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 70 | |
} | |
}, | |
"source": [ | |
"# Final evaluation of the model\n", | |
"scores = model.evaluate(x_test, y_test, verbose=2)\n", | |
"\n", | |
"for name, value in zip(model.metrics_names, scores):\n", | |
" print(\"%s: %.2f\" % (name, value))" | |
], | |
"execution_count": 62, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"25000/25000 - 3s - loss: 0.5718 - acc: 0.7129\n", | |
"loss: 0.57\n", | |
"acc: 0.71\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "jZlyAARAaeAp", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Predictions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "xz-CQfHjnJYW", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prediction = model.predict(x_test)\n", | |
"y_pred = (prediction > 0.5)\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "f-xqhXZHAl_p", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Confusion Matrix" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "zCW7UU7kjULj", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"No intermediate prints here - refer to Comparison " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ni8e5hJU_QTV", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"target_names = ['pos', 'neg']\n", | |
"cnf_matrix_test = confusion_matrix(y_test, y_pred)\n", | |
"\n", | |
"y_test_syn = y_test\n", | |
"y_pred_syn = y_pred\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Th9RG6t-_vny", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# predict probabilities for test set\n", | |
"yhat_probs = model.predict(x_test)\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "aQHxAyPSDJ6q", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# predict crisp classes for test set\n", | |
"yhat_classes = model.predict_classes(x_test)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1vICVUX1Dfs0", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# reduce to 1d array\n", | |
"yhat_probs = yhat_probs[:, 0]\n", | |
"yhat_classes = yhat_classes[:, 0]" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "yAACY5JuEKub", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"We are now ready to calculate metrics for our deep learning neural network model. We can start by calculating the classification accuracy, precision, recall, and F1 scores." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "2R_XBL4UD__9", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# accuracy: (tp + tn) / (p + n)\n", | |
"accuracy_syn = accuracy_score(y_test, yhat_classes)\n", | |
"\n", | |
"# precision tp / (tp + fp)\n", | |
"precision_syn = precision_score(y_test, yhat_classes)\n", | |
"\n", | |
"# recall: tp / (tp + fn)\n", | |
"recall_syn = recall_score(y_test, yhat_classes)\n", | |
"\n", | |
"# f1: 2 tp / (2 tp + fp + fn)\n", | |
"f1_syn = f1_score(y_test, yhat_classes)\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "O69-OaXYa_eO", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"#9.Classification Model (Orig Data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1dy5d6OmEleG", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"x_train = sequence.pad_sequences(x_train, maxlen=max_words,padding='post')\n", | |
"# x_test was already processed above in #8, no need to redo\n", | |
"# x_test = sequence.pad_sequences(x_test, maxlen=max_words,padding='post') " | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "KsUMVti2bzRb", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"model was setup above, simply reusing it here" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "cqbbPNUpbr-G", | |
"colab_type": "code", | |
"outputId": "37725863-b266-4f7b-a6aa-e3c17cd0e112", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 105 | |
} | |
}, | |
"source": [ | |
"history_original = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=i_epochs, batch_size=i_batch_size, verbose=2)" | |
], | |
"execution_count": 70, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Train on 25000 samples, validate on 25000 samples\n", | |
"Epoch 1/2\n", | |
"25000/25000 - 4s - loss: 0.4812 - acc: 0.7727 - val_loss: 0.5080 - val_acc: 0.7507\n", | |
"Epoch 2/2\n", | |
"25000/25000 - 3s - loss: 0.3778 - acc: 0.8375 - val_loss: 0.5141 - val_acc: 0.7548\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Jksx1z5Rb9n1", | |
"colab_type": "code", | |
"outputId": "da12a777-aed4-45c5-8518-52e1cc16ea9b", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 70 | |
} | |
}, | |
"source": [ | |
"# Final evaluation of the model\n", | |
"scores = model.evaluate(x_test, y_test, verbose=2)\n", | |
"\n", | |
"for name, value in zip(model.metrics_names, scores):\n", | |
" print(\"%s: %.2f\" % (name, value))" | |
], | |
"execution_count": 71, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"25000/25000 - 3s - loss: 0.5141 - acc: 0.7548\n", | |
"loss: 0.51\n", | |
"acc: 0.75\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Y1Z0R6dtcHqi", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Predictions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "GCUAeMHAcEzA", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prediction = model.predict(x_test)\n", | |
"y_pred = (prediction > 0.5)\n", | |
"\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "-XN1tx6EceBf", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Confusion Matrix" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "xMu5fJPmchu2", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"target_names = ['pos', 'neg']\n", | |
"cnf_matrix_test_original = confusion_matrix(y_test, y_pred)\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "wNePMkXSclEc", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# predict probabilities for test set\n", | |
"yhat_probs = model.predict(x_test)\n", | |
"\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "zBFSMsXoctZV", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"\n", | |
"# predict crisp classes for test set\n", | |
"yhat_classes = model.predict_classes(x_test)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "0jLYCyewcxLZ", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"\n", | |
"# reduce to 1d array\n", | |
"yhat_probs = yhat_probs[:, 0]\n", | |
"yhat_classes = yhat_classes[:, 0]" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "E5hMSaQqc1SF", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# accuracy: (tp + tn) / (p + n)\n", | |
"accuracy = accuracy_score(y_test, yhat_classes)\n", | |
"\n", | |
"# precision tp / (tp + fp)\n", | |
"precision = precision_score(y_test, yhat_classes)\n", | |
"\n", | |
"# recall: tp / (tp + fn)\n", | |
"recall = recall_score(y_test, yhat_classes)\n", | |
"\n", | |
"# f1: 2 tp / (2 tp + fp + fn)\n", | |
"f1 = f1_score(y_test, yhat_classes)\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "7hyLWxFwdLJC", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# 10.Comparison " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "esZ2yNkBecwY", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## Confusion Matrix\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "SZJ-X5bEkJ0B", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Synthetic Data " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "MnB0BYRyfZDa", | |
"colab_type": "code", | |
"outputId": "ddbd7c19-7742-4a8c-9b0e-51087475b9e0", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 52 | |
} | |
}, | |
"source": [ | |
"confusion_matrix(y_pred_syn, y_test_syn)" | |
], | |
"execution_count": 78, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([[8937, 3615],\n", | |
" [3563, 8885]])" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 78 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "sr-ohxI0kOs-", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Original Data " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Ik3Fbu-mhp2K", | |
"colab_type": "code", | |
"outputId": "ff1aa0fe-5f0a-47e5-df79-3106c0a67a2d", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 52 | |
} | |
}, | |
"source": [ | |
"confusion_matrix(y_pred, y_test)" | |
], | |
"execution_count": 79, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([[9294, 2925],\n", | |
" [3206, 9575]])" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 79 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "AAePSsxikTRx", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Synthetic Data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "4SacmkxBfo0G", | |
"colab_type": "code", | |
"outputId": "185885b8-3689-4111-b0e8-1a48cb11b1c9", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 176 | |
} | |
}, | |
"source": [ | |
"print(classification_report(y_test_syn, y_pred_syn, target_names=target_names))\n" | |
], | |
"execution_count": 80, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
" precision recall f1-score support\n", | |
"\n", | |
" pos 0.71 0.71 0.71 12500\n", | |
" neg 0.71 0.71 0.71 12500\n", | |
"\n", | |
" accuracy 0.71 25000\n", | |
" macro avg 0.71 0.71 0.71 25000\n", | |
"weighted avg 0.71 0.71 0.71 25000\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "mHaQcSIWkWsX", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Original Data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "pZOMhRZ0hvGT", | |
"colab_type": "code", | |
"outputId": "3098b19d-6f0b-4e63-a2aa-4a26e4a7c463", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 176 | |
} | |
}, | |
"source": [ | |
"print(classification_report(y_test, y_pred, target_names=target_names))\n" | |
], | |
"execution_count": 81, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
" precision recall f1-score support\n", | |
"\n", | |
" pos 0.76 0.74 0.75 12500\n", | |
" neg 0.75 0.77 0.76 12500\n", | |
"\n", | |
" accuracy 0.75 25000\n", | |
" macro avg 0.75 0.75 0.75 25000\n", | |
"weighted avg 0.75 0.75 0.75 25000\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "mqTq8Slfkg6R", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## Metrics" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "X6DIkY4ngtcr", | |
"colab_type": "code", | |
"outputId": "c4a80950-8b21-4b64-cb1d-67b9d873ba86", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 265 | |
} | |
}, | |
"source": [ | |
"# accuracy: (tp + tn) / (p + n)\n", | |
"print('Accuracy of Synthetic: %f' % accuracy_syn)\n", | |
"print('Accuracy of Original: %f' % accuracy)\n", | |
"print(\"\\n\")\n", | |
"\n", | |
"# precision tp / (tp + fp)\n", | |
"print('Precision of Synthetic: %f' % precision_syn)\n", | |
"print('Precision of Original: %f' % precision)\n", | |
"print(\"\\n\")\n", | |
"\n", | |
"# recall: tp / (tp + fn)\n", | |
"print('Recall of Synthetic: %f' % recall_syn)\n", | |
"print('Recall of Original: %f' % recall)\n", | |
"print(\"\\n\")\n", | |
"\n", | |
"# f1: 2 tp / (2 tp + fp + fn)\n", | |
"print('F1 score of Synthetic: %f' % f1_syn)\n", | |
"print('F1 score of Original: %f' % f1)" | |
], | |
"execution_count": 82, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Accuracy of Synthetic: 0.712880\n", | |
"Accuracy of Original: 0.754760\n", | |
"\n", | |
"\n", | |
"Precision of Synthetic: 0.713769\n", | |
"Precision of Original: 0.749159\n", | |
"\n", | |
"\n", | |
"Recall of Synthetic: 0.710800\n", | |
"Recall of Original: 0.766000\n", | |
"\n", | |
"\n", | |
"F1 score of Synthetic: 0.712282\n", | |
"F1 score of Original: 0.757486\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "f47oFWAZexvh", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"" | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment