patternproject/Wk4_Submisison.ipynb

## Wk4_Submisison.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Gamma.ipynb",
      "provenance": [],
      "collapsed_sections": [
        "I_LJ_VmN20qo",
        "y43HipKZ275r",
        "sPY9Z38C3dEQ",
        "q_Fv00Lf3gby",
        "ZJbgtg-S3rYL",
        "l_zz9r6Y322G",
        "PECSzyLI37LZ",
        "MbY0wZMBN36X",
        "4VNdFGvGN9Jm"
      ],
      "toc_visible": true,
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/patternproject/6c917b3d58b852399ce6e55001e5db5e/gamma.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "QazQ6ZyQR03W",
        "colab_type": "text"
      },
      "source": [
        "Manning LP \n",
        "\"Classifying Customer Feedback with Imbalanced Text Data\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "3o9qTS480De0",
        "colab_type": "text"
      },
      "source": [
        "Wk4 -  Training with Generated Corpus\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Uo43KfuKbwEp",
        "colab_type": "text"
      },
      "source": [
        "# Action Starts \n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "P110x3WSeLFT",
        "colab_type": "text"
      },
      "source": [
        "## Import Libraries"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CZFd6Z5veN3F",
        "colab_type": "code",
        "outputId": "2da41aa4-d9cb-4f44-8800-4389ced8403b",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 63
        }
      },
      "source": [
        "from __future__ import absolute_import, division, print_function\n",
        "import os\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "import pandas as pd\n",
        "\n",
        "import pickle\n",
        "\n",
        "import tensorflow as tf\n",
        "from tensorflow.keras import layers\n",
        "\n",
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "from sklearn.metrics import classification_report"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "<p style=\"color: red;\">\n",
              "The default version of TensorFlow in Colab will soon switch to TensorFlow 2.x.<br>\n",
              "We recommend you <a href=\"https://www.tensorflow.org/guide/migrate\" target=\"_blank\">upgrade</a> now \n",
              "or ensure your notebook will continue to use TensorFlow 1.x via the <code>%tensorflow_version 1.x</code> magic:\n",
              "<a href=\"https://colab.research.google.com/notebooks/tensorflow_version.ipynb\" target=\"_blank\">more info</a>.</p>\n"
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {
            "tags": []
          }
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CSqwXayYK2TT",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "keras = tf.keras"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Wb6MfEydKoEt",
        "colab_type": "text"
      },
      "source": [
        "### Original Data Set"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "03uDW0UMK8pD",
        "colab_type": "code",
        "outputId": "a8d1d1d5-d9b4-49fb-9cbf-f451ac25450d",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "imdb = keras.datasets.imdb\n",
        "print(tf.__version__)"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "1.15.0\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oXEGjt5OKr4U",
        "colab_type": "code",
        "outputId": "9fbd8a48-e1f3-49d2-c30f-e016be3ec8f5",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 52
        }
      },
      "source": [
        "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(\n",
        "    path='imdb.npz',  # download to '~/.keras/datasets/' + path\n",
        "    num_words=None,   # top most frequent words to consider\n",
        "    skip_top=0,       # top most frequent words to ignore ('the', 'a', 'at', ...)\n",
        "    maxlen=None,      # truncate reviews longer than this\n",
        "    seed=113,         # data shuffling seed\n",
        "    start_char=1,     # start-of-sequence token\n",
        "    oov_char=2,       # if skip_top used, then dropped words replaced with this token\n",
        "    index_from=3      # actual word tokens start here\n",
        ")"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz\n",
            "17465344/17464789 [==============================] - 0s 0us/step\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "C1VpXOrnKnjD",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "5bAnbedYLXcy",
        "colab_type": "text"
      },
      "source": [
        "### Dictionary Setup for Word Lookup"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "T9YdaCumLbIU",
        "colab_type": "code",
        "outputId": "b90b3833-652c-46fa-af33-3748f9d1262b",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 52
        }
      },
      "source": [
        "word_index = tf.keras.datasets.imdb.get_word_index(path='imdb_word_index.json')"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json\n",
            "1646592/1641221 [==============================] - 0s 0us/step\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "W6zByW_GLfgU",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Nk_ZGpvKLkDZ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "\n",
        "# Parameters for dict.get()\n",
        "## key − This is the Key to be searched in the dictionary.\n",
        "## default − This is the Value to be returned in case key does not exist.\n",
        "\n",
        "def decode_review(text_indexes):\n",
        "  # text_indexes means int mapping\n",
        "    return ' '.join([reverse_word_index.get(i, '?') for i in text_indexes])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "v-osseo3P8Xb",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "\n",
        "# The first indices are reserved\n",
        "word_index = {k:(v+3) for k,v in word_index.items()} \n",
        "word_index[\"<PAD>\"] = 0\n",
        "word_index[\"<START>\"] = 1\n",
        "word_index[\"<UNK>\"] = 2  # unknown\n",
        "word_index[\"<UNUSED>\"] = 3"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "g9vgBndud3Uh",
        "colab_type": "text"
      },
      "source": [
        "## Merge corpus for training\n",
        "\n",
        "There are three separate training corpora to this point of the liveProject: generated, subset positive, and all negative. Not only the corpora, but also their respective labels have to be merged into one NumPy array. Once merged, this constitutes the training data to a text classification model. You can use np.concatenate() to perform this step."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hUO8bqAHd9p9",
        "colab_type": "text"
      },
      "source": [
        "### Generated Positive"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "WUYUYsT1T7Zv",
        "colab_type": "text"
      },
      "source": [
        "Upload the file \"pos_reviews.pkl\" to `/content/sample_data/` before running the next cell."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "yU_D8Bl2XdJk",
        "colab_type": "text"
      },
      "source": [
        "Note: this file currently holds only 10 synthetic reviews. The other notebook, which tries to generate 6,250 reviews, is still running."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "m0NTAer_Y1ME",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "filename = '/content/sample_data/pos_reviews.pkl'\n",
        "infile = open(filename,'rb')\n",
        "pos_generated = pickle.load(infile)\n",
        "infile.close()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xP_AKZ62vRX7",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "i_shape = (np.shape(pos_generated))[0]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1DNAtidQvMq5",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "pos_generated = np.reshape(pos_generated,(i_shape,1))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OqKWd0a3eFpA",
        "colab_type": "text"
      },
      "source": [
        "### Subset Positive"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "I_LadARTurkH",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "filename = '/content/sample_data/pos_remaining.pkl'\n",
        "infile = open(filename,'rb')\n",
        "pos_remaining = pickle.load(infile)\n",
        "infile.close()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "GeGqFF1Az_Rb",
        "colab_type": "text"
      },
      "source": [
        "### Merge both pos together"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "-A4dgjgzz-3U",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "pos_reviews = np.concatenate((pos_generated,pos_remaining))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jJjXSXOD11c2",
        "colab_type": "text"
      },
      "source": [
        "### Add Label Column (Y)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "-lJ6a1o813zq",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "i_shape = (np.shape(pos_reviews))[0]\n",
        "y_pos = np.ones((i_shape,1),dtype=int) "
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2gj7xlai2LUm",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "pos_all = np.concatenate((pos_reviews,y_pos),axis = 1)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NnYys-AkeIUJ",
        "colab_type": "text"
      },
      "source": [
        "### All Negative"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zdAeYF50eJja",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "filename = '/content/sample_data/neg_reviews.pkl'\n",
        "infile = open(filename,'rb')\n",
        "neg_reviews = pickle.load(infile)\n",
        "infile.close()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jH1hDHEL3tgG",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "i_shape = (np.shape(neg_reviews))[0]\n",
        "neg_reviews = np.reshape(neg_reviews,(i_shape,1))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "DofhaBAv3Aee",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "i_shape = (np.shape(neg_reviews))[0]\n",
        "y_neg = np.zeros((i_shape,1),dtype=int) "
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FuDfcRnq3Fv_",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "neg_all = np.concatenate((neg_reviews,y_neg),axis = 1)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BnE41wFB3WK7",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "rev_all = np.concatenate((pos_all,neg_all))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "CXbxsf0C4i9s",
        "colab_type": "text"
      },
      "source": [
        "Writing to a pickle file for later"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "UxxVO54G4foH",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "filename = '/content/sample_data/all_reviews.pkl'\n",
        "outfile = open(filename,'wb')\n",
        "pickle.dump(rev_all,outfile)\n",
        "outfile.close()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hdCxlRXHOthd",
        "colab_type": "text"
      },
      "source": [
        "### Split in test and train"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BbYZ-L7rOwzR",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "X = rev_all[:,0]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gFm8FmYUPKwx",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "y = rev_all[:,-1]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "aitn0vQKPgjw",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "#from sklearn.model_selection import train_test_split\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "urozKFaBOhdZ",
        "colab_type": "text"
      },
      "source": [
        "### Padding"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7D54_UKXOjkg",
        "colab_type": "text"
      },
      "source": [
        "Let's pad each sentence to a maximum length of 256 words. We can take advantage of the provided `pad_sequences` function to simplify the task. Sentences are padded with the `<PAD>` token up to 256 words."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7ZFYIL0gOio9",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "train_data = tf.keras.preprocessing.sequence.pad_sequences(X_train,\n",
        "                                                        value=word_index[\"<PAD>\"],\n",
        "                                                        padding='post',\n",
        "                                                        maxlen=256)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "adzFvE-3RNyw",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "test_data = tf.keras.preprocessing.sequence.pad_sequences(X_test,\n",
        "                                                       value=word_index[\"<PAD>\"],\n",
        "                                                       padding='post',\n",
        "                                                       maxlen=256)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_Xh2H-ic5S4G",
        "colab_type": "text"
      },
      "source": [
        "# Model Architecture\n",
        "\n",
        "Let's build a simple text classification model. Start with an embedding layer that converts each word into a multi-dimensional vector representation. Then feed that representation to a bidirectional Long Short-Term Memory (LSTM) layer that uses 128 dimensions (a hyperparameter, chosen arbitrarily; feel free to experiment) to represent the text sequence, followed by a dense layer that aggregates the LSTM output before making a classification."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "8SFP-FMb4uVP",
        "colab_type": "code",
        "outputId": "af467723-82c8-4b02-ab47-5f6b8120f930",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 587
        }
      },
      "source": [
        "# input shape is the vocabulary count used for the movie reviews (10,000 words)\n",
        "vocab_size = len(word_index)\n",
        "\n",
        "MAX_SENTENCE_LENGTH=256\n",
        "EMBEDDING_SIZE=16\n",
        "HIDDEN_LAYER_SIZE=64\n",
        "model = tf.keras.Sequential([\n",
        "    tf.keras.layers.Embedding(vocab_size, 64),\n",
        "    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),\n",
        "    tf.keras.layers.Dense(64, activation='relu'),\n",
        "    tf.keras.layers.Dense(1, activation='sigmoid')\n",
        "])\n",
        "\n",
        "model.summary()"
      ],
      "execution_count": 28,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "WARNING:tensorflow:From /tensorflow-1.15.0/python3.6/tensorflow_core/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
            "Instructions for updating:\n",
            "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
            "WARNING:tensorflow:From /tensorflow-1.15.0/python3.6/tensorflow_core/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
            "Instructions for updating:\n",
            "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
            "WARNING:tensorflow:From /tensorflow-1.15.0/python3.6/tensorflow_core/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
            "Instructions for updating:\n",
            "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
            "WARNING:tensorflow:From /tensorflow-1.15.0/python3.6/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
            "Instructions for updating:\n",
            "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
            "WARNING:tensorflow:From /tensorflow-1.15.0/python3.6/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n",
            "Instructions for updating:\n",
            "If using Keras pass *_constraint arguments to layers.\n",
            "Model: \"sequential\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding (Embedding)        (None, None, 64)          5669632   \n",
            "_________________________________________________________________\n",
            "bidirectional (Bidirectional (None, 256)               197632    \n",
            "_________________________________________________________________\n",
            "dense (Dense)                (None, 64)                16448     \n",
            "_________________________________________________________________\n",
            "dense_1 (Dense)              (None, 1)                 65        \n",
            "=================================================================\n",
            "Total params: 5,883,777\n",
            "Trainable params: 5,883,777\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "3OTuQ3-D5V7y",
        "colab_type": "code",
        "outputId": "5c7be8c7-bdbc-491c-f807-693c0c8049b0",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 90
        }
      },
      "source": [
        "model.compile(optimizer='adam',\n",
        "              loss='binary_crossentropy',\n",
        "              metrics=['acc'])"
      ],
      "execution_count": 29,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "WARNING:tensorflow:From /tensorflow-1.15.0/python3.6/tensorflow_core/python/ops/nn_impl.py:183: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n",
            "Instructions for updating:\n",
            "Use tf.where in 2.0, which has the same broadcast rule as np.where\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "t9DD99WpOAZc",
        "colab_type": "text"
      },
      "source": [
        "### Cross Validation"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0KcEviMrN72J",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Shuffle training data for cross validation during training cycle\n",
        "FRAC = 0.8 # fraction of training data used for training. Remaining is for cross validation.\n",
        "idx = np.arange(len(train_data))\n",
        "np.random.shuffle(idx)\n",
        "\n",
        "idxs = idx[:round(len(idx)*FRAC)] # Select random 80% for training data\n",
        "partial_x_train = train_data[idxs]\n",
        "partial_y_train = y_train[idxs]\n",
        "\n",
        "x_val = np.delete(train_data, idxs.tolist(), axis=0) # select remaining as cross validation data\n",
        "y_val = np.delete(y_train, idxs.tolist(), axis=0)\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kUIF0Od4Uk0m",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "#i_epochs=40\n",
        "i_epochs=2"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "eCIM2-FpOCWi",
        "colab_type": "code",
        "outputId": "1989e11a-4bbb-4219-f557-417663e3ab3e",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 105
        }
      },
      "source": [
        "history = model.fit(partial_x_train,\n",
        "                    partial_y_train,\n",
        "                    epochs=i_epochs,\n",
        "                    batch_size=512,\n",
        "                    validation_data=(x_val, y_val),\n",
        "                    verbose=1)"
      ],
      "execution_count": 32,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Train on 12735 samples, validate on 3184 samples\n",
            "Epoch 1/2\n",
            "12735/12735 [==============================] - 165s 13ms/sample - loss: 0.6895 - acc: 0.5335 - val_loss: 0.6649 - val_acc: 0.6002\n",
            "Epoch 2/2\n",
            "12735/12735 [==============================] - 162s 13ms/sample - loss: 0.5804 - acc: 0.7432 - val_loss: 0.4599 - val_acc: 0.7827\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "PesiNzwzOGRT",
        "colab_type": "code",
        "outputId": "f4941f58-d835-4720-b496-fa04a6ae1b0d",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "results = model.evaluate(test_data, y_test)"
      ],
      "execution_count": 33,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "7841/7841 [==============================] - 35s 4ms/sample - loss: 0.4613 - acc: 0.7831\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KypfpQ-LOKVJ",
        "colab_type": "code",
        "outputId": "9bb2803b-e3e5-455f-966f-ab2273471c3d",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "print(model.metrics_names)"
      ],
      "execution_count": 34,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "['loss', 'acc']\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oxvFxS68OL3i",
        "colab_type": "code",
        "outputId": "c89da437-1b56-49c1-9285-c7f4a1e37ebb",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "history_dict = history.history\n",
        "history_dict.keys()"
      ],
      "execution_count": 35,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "dict_keys(['loss', 'acc', 'val_loss', 'val_acc'])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 35
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qqHexpFkR8E2",
        "colab_type": "text"
      },
      "source": [
        "## Prediction on test dataset\n",
        "\n",
        "Let's create a confusion matrix to see how the model performs with respect to each review type."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "6nJxJf3SZOIh",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "predicted = model.predict(test_data)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7M2MGDX1R9oP",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "predicted[predicted > 0.5] = 1\n",
        "predicted[predicted <= 0.5] = 0\n",
        "predictedf = predicted.flatten().astype(int)\n",
        "\n",
        "#import pandas as pd\n",
        "df3 = pd.DataFrame(data=predictedf, columns=['predicted'])\n",
        "refdf = pd.DataFrame(data=y_test, columns=['actual'])\n",
        "\n",
        "y_actu = pd.Series(refdf['actual'], name='ACTUAL')\n",
        "y_pred = pd.Series(df3['predicted'], name='PREDICTED')\n",
        "predicted_results = y_pred.tolist()\n",
        "truth = y_actu.tolist()\n",
        "\n",
        "dl_confusion = pd.crosstab(y_actu, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0R7mHPtHSGht",
        "colab_type": "code",
        "outputId": "038f3dfb-1e16-4d9f-fc5d-d1fd743fac22",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 167
        }
      },
      "source": [
        "dl_confusion"
      ],
      "execution_count": 38,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th>Predicted</th>\n",
              "      <th>0</th>\n",
              "      <th>1</th>\n",
              "      <th>All</th>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>Actual</th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>2881</td>\n",
              "      <td>1249</td>\n",
              "      <td>4130</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>452</td>\n",
              "      <td>3259</td>\n",
              "      <td>3711</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>All</th>\n",
              "      <td>3333</td>\n",
              "      <td>4508</td>\n",
              "      <td>7841</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "Predicted     0     1   All\n",
              "Actual                     \n",
              "0          2881  1249  4130\n",
              "1           452  3259  3711\n",
              "All        3333  4508  7841"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 38
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "yTMupZb7SJSg",
        "colab_type": "text"
      },
      "source": [
        "Let's take a closer look at model performance for each type of review."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "_PH0mGLMSLxt",
        "colab_type": "code",
        "outputId": "2504a2b8-5e2f-460b-8c95-79ab0ac835cf",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 176
        }
      },
      "source": [
        "\n",
        "report = classification_report(truth, predicted_results)\n",
        "print(report)"
      ],
      "execution_count": 39,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0       0.86      0.70      0.77      4130\n",
            "           1       0.72      0.88      0.79      3711\n",
            "\n",
            "    accuracy                           0.78      7841\n",
            "   macro avg       0.79      0.79      0.78      7841\n",
            "weighted avg       0.80      0.78      0.78      7841\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "V2be2yxrZao3",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}