Last active
March 23, 2020 17:40
-
-
Save patternproject/6c917b3d58b852399ce6e55001e5db5e to your computer and use it in GitHub Desktop.
Wk4_Submisison.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Gamma.ipynb", | |
"provenance": [], | |
"collapsed_sections": [ | |
"I_LJ_VmN20qo", | |
"y43HipKZ275r", | |
"sPY9Z38C3dEQ", | |
"q_Fv00Lf3gby", | |
"ZJbgtg-S3rYL", | |
"l_zz9r6Y322G", | |
"PECSzyLI37LZ", | |
"MbY0wZMBN36X", | |
"4VNdFGvGN9Jm" | |
], | |
"toc_visible": true, | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/patternproject/6c917b3d58b852399ce6e55001e5db5e/gamma.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "QazQ6ZyQR03W", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Manning LP \n", | |
"\"Classifying Customer Feedback with Imbalanced Text Data\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "3o9qTS480De0", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Wk4 - Training with Generated Corpus\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Uo43KfuKbwEp", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Action Starts \n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "P110x3WSeLFT", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## Import Libraries" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "CZFd6Z5veN3F", | |
"colab_type": "code", | |
"outputId": "2da41aa4-d9cb-4f44-8800-4389ced8403b", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 63 | |
} | |
}, | |
"source": [ | |
"from __future__ import absolute_import, division, print_function\n", | |
"import os\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"import pandas as pd\n", | |
"\n", | |
"import pickle\n", | |
"\n", | |
"import tensorflow as tf\n", | |
"from tensorflow.keras import layers\n", | |
"\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"\n", | |
"from sklearn.metrics import classification_report" | |
], | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"<p style=\"color: red;\">\n", | |
"The default version of TensorFlow in Colab will soon switch to TensorFlow 2.x.<br>\n", | |
"We recommend you <a href=\"https://www.tensorflow.org/guide/migrate\" target=\"_blank\">upgrade</a> now \n", | |
"or ensure your notebook will continue to use TensorFlow 1.x via the <code>%tensorflow_version 1.x</code> magic:\n", | |
"<a href=\"https://colab.research.google.com/notebooks/tensorflow_version.ipynb\" target=\"_blank\">more info</a>.</p>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "CSqwXayYK2TT", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"keras = tf.keras" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Wb6MfEydKoEt", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Original Data Set" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "03uDW0UMK8pD", | |
"colab_type": "code", | |
"outputId": "a8d1d1d5-d9b4-49fb-9cbf-f451ac25450d", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"imdb = keras.datasets.imdb\n", | |
"print(tf.__version__)" | |
], | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"1.15.0\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "oXEGjt5OKr4U", | |
"colab_type": "code", | |
"outputId": "9fbd8a48-e1f3-49d2-c30f-e016be3ec8f5", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 52 | |
} | |
}, | |
"source": [ | |
"(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(\n", | |
" path='imdb.npz', # download to '~/.keras/datasets/' + path\n", | |
" num_words=None, # top most frequent words to consider\n", | |
" skip_top=0, # top most frequent words to ignore ('the', 'a', 'at', ...)\n", | |
" maxlen=None, # truncate reviews longer than this\n", | |
" seed=113, # data shuffling seed\n", | |
" start_char=1, # start-of-sequence token\n", | |
" oov_char=2, # if skip_top used, then dropped words replaced with this token\n", | |
" index_from=3 # actual word tokens start here\n", | |
")" | |
], | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz\n", | |
"17465344/17464789 [==============================] - 0s 0us/step\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "C1VpXOrnKnjD", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "5bAnbedYLXcy", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Dictionary Setup for Word Lookup" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "T9YdaCumLbIU", | |
"colab_type": "code", | |
"outputId": "b90b3833-652c-46fa-af33-3748f9d1262b", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 52 | |
} | |
}, | |
"source": [ | |
"word_index = tf.keras.datasets.imdb.get_word_index(path='imdb_word_index.json')" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json\n", | |
"1646592/1641221 [==============================] - 0s 0us/step\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "W6zByW_GLfgU", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Nk_ZGpvKLkDZ", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"\n", | |
"# Parameters for dict.get()\n", | |
"## key − This is the Key to be searched in the dictionary.\n", | |
"## default − This is the Value to be returned in case key does not exist.\n", | |
"\n", | |
"def decode_review(text_indexes):\n", | |
" # text_indexes means int mapping\n", | |
" return ' '.join([reverse_word_index.get(i, '?') for i in text_indexes])" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "v-osseo3P8Xb", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"\n", | |
"# The first indices are reserved\n", | |
"word_index = {k:(v+3) for k,v in word_index.items()} \n", | |
"word_index[\"<PAD>\"] = 0\n", | |
"word_index[\"<START>\"] = 1\n", | |
"word_index[\"<UNK>\"] = 2 # unknown\n", | |
"word_index[\"<UNUSED>\"] = 3" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "g9vgBndud3Uh", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## Merge corpus for training\n", | |
"\n", | |
"There are three separate training corpora to this point of the liveProject: generated, subset positive, and all negative. Not only the corpora, but also their respective labels have to be merged into one NumPy array. Once merged, this constitutes the training data to a text classification model. You can use np.concatenate() to perform this step." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "hUO8bqAHd9p9", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Generated Positive" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "WUYUYsT1T7Zv", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"upload the file \"pos_reviews.pkl\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "yU_D8Bl2XdJk", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"this is only 10 synthetic reviews - the other notebook is still running where we try to generate 6250 reviews" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "m0NTAer_Y1ME", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"filename = '/content/sample_data/pos_reviews.pkl'\n", | |
"infile = open(filename,'rb')\n", | |
"pos_generated = pickle.load(infile)\n", | |
"infile.close()" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "xP_AKZ62vRX7", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"i_shape = (np.shape(pos_generated))[0]" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1DNAtidQvMq5", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"pos_generated = np.reshape(pos_generated,(i_shape,1))" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "OqKWd0a3eFpA", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Subset Positive" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "I_LadARTurkH", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"filename = '/content/sample_data/pos_remaining.pkl'\n", | |
"infile = open(filename,'rb')\n", | |
"pos_remaining = pickle.load(infile)\n", | |
"infile.close()" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "GeGqFF1Az_Rb", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Merge both pos together" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-A4dgjgzz-3U", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"pos_reviews = np.concatenate((pos_generated,pos_remaining))" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "jJjXSXOD11c2", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Add Label Column (Y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-lJ6a1o813zq", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"i_shape = (np.shape(pos_reviews))[0]\n", | |
"y_pos = np.ones((i_shape,1),dtype=int) " | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "2gj7xlai2LUm", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"pos_all = np.concatenate((pos_reviews,y_pos),axis = 1)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "NnYys-AkeIUJ", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### All Negative" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "zdAeYF50eJja", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"filename = '/content/sample_data/neg_reviews.pkl'\n", | |
"infile = open(filename,'rb')\n", | |
"neg_reviews = pickle.load(infile)\n", | |
"infile.close()" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "jH1hDHEL3tgG", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"i_shape = (np.shape(neg_reviews))[0]\n", | |
"neg_reviews = np.reshape(neg_reviews,(i_shape,1))" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "DofhaBAv3Aee", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"i_shape = (np.shape(neg_reviews))[0]\n", | |
"y_neg = np.zeros((i_shape,1),dtype=int) " | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "FuDfcRnq3Fv_", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"neg_all = np.concatenate((neg_reviews,y_neg),axis = 1)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "BnE41wFB3WK7", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"rev_all = np.concatenate((pos_all,neg_all))" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "CXbxsf0C4i9s", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Writing to a pickle file for later" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "UxxVO54G4foH", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"filename = '/content/sample_data/all_reviews.pkl'\n", | |
"outfile = open(filename,'wb')\n", | |
"pickle.dump(rev_all,outfile)\n", | |
"outfile.close()" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "hdCxlRXHOthd", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Split in test and train" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "BbYZ-L7rOwzR", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"X = rev_all[:,0]" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "gFm8FmYUPKwx", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"y = rev_all[:,-1]" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "aitn0vQKPgjw", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"#from sklearn.model_selection import train_test_split\n", | |
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "urozKFaBOhdZ", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Padding" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "7D54_UKXOjkg", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Lets pad each sentence to maximimum length of 256 words. We may take advantage of pad_sequences function provided to speed simplify our task. We will pad sentences with <PAD> token up to 256 words." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "7ZFYIL0gOio9", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"train_data = tf.keras.preprocessing.sequence.pad_sequences(X_train,\n", | |
" value=word_index[\"<PAD>\"],\n", | |
" padding='post',\n", | |
" maxlen=256)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "adzFvE-3RNyw", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"test_data = tf.keras.preprocessing.sequence.pad_sequences(X_test,\n", | |
" value=word_index[\"<PAD>\"],\n", | |
" padding='post',\n", | |
" maxlen=256)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "_Xh2H-ic5S4G", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Model Architecture\n", | |
"\n", | |
"Lets build a simple text classification model. Start with embedding layer that convert a word into multi-dimensional vector representation. Then we feed that representation to a bidirectional Long-Short Terms Memory cell (LSTM) that uses 128 (a hyperparameter - arbitrarily chosen, feel free to experiment) dimensions to represent text sequence, follow by a dense layer to aggregate the LSTM output before making a classification." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "8SFP-FMb4uVP", | |
"colab_type": "code", | |
"outputId": "af467723-82c8-4b02-ab47-5f6b8120f930", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 587 | |
} | |
}, | |
"source": [ | |
"# input shape is the vocabulary count used for the movie reviews (10,000 words)\n", | |
"vocab_size = len(word_index)\n", | |
"\n", | |
"MAX_SENTENCE_LENGTH=256\n", | |
"EMBEDDING_SIZE=16\n", | |
"HIDDEN_LAYER_SIZE=64\n", | |
"model = tf.keras.Sequential([\n", | |
" tf.keras.layers.Embedding(vocab_size, 64),\n", | |
" tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),\n", | |
" tf.keras.layers.Dense(64, activation='relu'),\n", | |
" tf.keras.layers.Dense(1, activation='sigmoid')\n", | |
"])\n", | |
"\n", | |
"model.summary()" | |
], | |
"execution_count": 28, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"WARNING:tensorflow:From /tensorflow-1.15.0/python3.6/tensorflow_core/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Call initializer instance with the dtype argument instead of passing it to the constructor\n", | |
"WARNING:tensorflow:From /tensorflow-1.15.0/python3.6/tensorflow_core/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Call initializer instance with the dtype argument instead of passing it to the constructor\n", | |
"WARNING:tensorflow:From /tensorflow-1.15.0/python3.6/tensorflow_core/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Call initializer instance with the dtype argument instead of passing it to the constructor\n", | |
"WARNING:tensorflow:From /tensorflow-1.15.0/python3.6/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Call initializer instance with the dtype argument instead of passing it to the constructor\n", | |
"WARNING:tensorflow:From /tensorflow-1.15.0/python3.6/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"If using Keras pass *_constraint arguments to layers.\n", | |
"Model: \"sequential\"\n", | |
"_________________________________________________________________\n", | |
"Layer (type) Output Shape Param # \n", | |
"=================================================================\n", | |
"embedding (Embedding) (None, None, 64) 5669632 \n", | |
"_________________________________________________________________\n", | |
"bidirectional (Bidirectional (None, 256) 197632 \n", | |
"_________________________________________________________________\n", | |
"dense (Dense) (None, 64) 16448 \n", | |
"_________________________________________________________________\n", | |
"dense_1 (Dense) (None, 1) 65 \n", | |
"=================================================================\n", | |
"Total params: 5,883,777\n", | |
"Trainable params: 5,883,777\n", | |
"Non-trainable params: 0\n", | |
"_________________________________________________________________\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "3OTuQ3-D5V7y", | |
"colab_type": "code", | |
"outputId": "5c7be8c7-bdbc-491c-f807-693c0c8049b0", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 90 | |
} | |
}, | |
"source": [ | |
"model.compile(optimizer='adam',\n", | |
" loss='binary_crossentropy',\n", | |
" metrics=['acc'])" | |
], | |
"execution_count": 29, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"WARNING:tensorflow:From /tensorflow-1.15.0/python3.6/tensorflow_core/python/ops/nn_impl.py:183: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Use tf.where in 2.0, which has the same broadcast rule as np.where\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "t9DD99WpOAZc", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Cross Validation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "0KcEviMrN72J", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Shuffle training data for cross validation during training cycle\n", | |
"FRAC = 0.8 # fraction of training data used for training. Remaining is for cross validation.\n", | |
"idx = np.arange(len(train_data))\n", | |
"np.random.shuffle(idx)\n", | |
"\n", | |
"idxs = idx[:round(len(idx)*FRAC)] # Select random 80% for training data\n", | |
"partial_x_train = train_data[idxs]\n", | |
"partial_y_train = y_train[idxs]\n", | |
"\n", | |
"x_val = np.delete(train_data, idxs.tolist(), axis=0) # select remaining as cross validation data\n", | |
"y_val = np.delete(y_train, idxs.tolist(), axis=0)\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "kUIF0Od4Uk0m", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"#i_epochs=40\n", | |
"i_epochs=2" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "eCIM2-FpOCWi", | |
"colab_type": "code", | |
"outputId": "1989e11a-4bbb-4219-f557-417663e3ab3e", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 105 | |
} | |
}, | |
"source": [ | |
"history = model.fit(partial_x_train,\n", | |
" partial_y_train,\n", | |
" epochs=i_epochs,\n", | |
" batch_size=512,\n", | |
" validation_data=(x_val, y_val),\n", | |
" verbose=1)" | |
], | |
"execution_count": 32, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Train on 12735 samples, validate on 3184 samples\n", | |
"Epoch 1/2\n", | |
"12735/12735 [==============================] - 165s 13ms/sample - loss: 0.6895 - acc: 0.5335 - val_loss: 0.6649 - val_acc: 0.6002\n", | |
"Epoch 2/2\n", | |
"12735/12735 [==============================] - 162s 13ms/sample - loss: 0.5804 - acc: 0.7432 - val_loss: 0.4599 - val_acc: 0.7827\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "PesiNzwzOGRT", | |
"colab_type": "code", | |
"outputId": "f4941f58-d835-4720-b496-fa04a6ae1b0d", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"results = model.evaluate(test_data, y_test)" | |
], | |
"execution_count": 33, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"7841/7841 [==============================] - 35s 4ms/sample - loss: 0.4613 - acc: 0.7831\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "KypfpQ-LOKVJ", | |
"colab_type": "code", | |
"outputId": "9bb2803b-e3e5-455f-966f-ab2273471c3d", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"print(model.metrics_names)" | |
], | |
"execution_count": 34, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"['loss', 'acc']\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "oxvFxS68OL3i", | |
"colab_type": "code", | |
"outputId": "c89da437-1b56-49c1-9285-c7f4a1e37ebb", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"history_dict = history.history\n", | |
"history_dict.keys()" | |
], | |
"execution_count": 35, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"dict_keys(['loss', 'acc', 'val_loss', 'val_acc'])" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 35 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "qqHexpFkR8E2", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## Prediction on test dataset\n", | |
"\n", | |
"Lets create a confusion matrix to see how the model perform with respect to each review type." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "6nJxJf3SZOIh", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"predicted = model.predict(test_data)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "7M2MGDX1R9oP", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"predicted[predicted > 0.5] = 1\n", | |
"predicted[predicted <= 0.5] = 0\n", | |
"predictedf = predicted.flatten().astype(int)\n", | |
"\n", | |
"#import pandas as pd\n", | |
"df3 = pd.DataFrame(data=predictedf, columns=['predicted'])\n", | |
"refdf = pd.DataFrame(data=y_test, columns=['actual'])\n", | |
"\n", | |
"y_actu = pd.Series(refdf['actual'], name='ACTUAL')\n", | |
"y_pred = pd.Series(df3['predicted'], name='PREDICTED')\n", | |
"predicted_results = y_pred.tolist()\n", | |
"truth = y_actu.tolist()\n", | |
"\n", | |
"dl_confusion = pd.crosstab(y_actu, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "0R7mHPtHSGht", | |
"colab_type": "code", | |
"outputId": "038f3dfb-1e16-4d9f-fc5d-d1fd743fac22", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 167 | |
} | |
}, | |
"source": [ | |
"dl_confusion" | |
], | |
"execution_count": 38, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>Predicted</th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>All</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Actual</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2881</td>\n", | |
" <td>1249</td>\n", | |
" <td>4130</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>452</td>\n", | |
" <td>3259</td>\n", | |
" <td>3711</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>All</th>\n", | |
" <td>3333</td>\n", | |
" <td>4508</td>\n", | |
" <td>7841</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
"Predicted 0 1 All\n", | |
"Actual \n", | |
"0 2881 1249 4130\n", | |
"1 452 3259 3711\n", | |
"All 3333 4508 7841" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 38 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "yTMupZb7SJSg", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Lets take a closer look at model performance for each type of review." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "_PH0mGLMSLxt", | |
"colab_type": "code", | |
"outputId": "2504a2b8-5e2f-460b-8c95-79ab0ac835cf", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 176 | |
} | |
}, | |
"source": [ | |
"\n", | |
"report = classification_report(truth, predicted_results)\n", | |
"print(report)" | |
], | |
"execution_count": 39, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
" precision recall f1-score support\n", | |
"\n", | |
" 0 0.86 0.70 0.77 4130\n", | |
" 1 0.72 0.88 0.79 3711\n", | |
"\n", | |
" accuracy 0.78 7841\n", | |
" macro avg 0.79 0.79 0.78 7841\n", | |
"weighted avg 0.80 0.78 0.78 7841\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "V2be2yxrZao3", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment