debarko/RNN_Predict.ipynb

## RNN_Predict.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import pickle\n",
    "import re\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "from numpy.random import random, permutation, randn, normal, uniform, choice\n",
    "#import bcolz\n",
    "\n",
    "from keras import backend as K\n",
    "from keras.utils.data_utils import get_file\n",
    "from keras.datasets import imdb\n",
    "from keras.preprocessing import sequence\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Dropout, Activation, Flatten\n",
    "from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional\n",
    "from keras.layers import SimpleRNN, TimeDistributed\n",
    "from keras.layers.normalization import BatchNormalization\n",
    "from keras.utils import np_utils\n",
    "from keras.layers import Conv1D, MaxPooling1D, ZeroPadding1D\n",
    "from keras.utils import np_utils\n",
    "from keras.optimizers import Adam"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Fetch the Nietzsche corpus via Keras' get_file helper, which returns the local path.\n",
    "path = get_file('nietzsche.txt', origin=\"https://s3.amazonaws.com/text-datasets/nietzsche.txt\")\n",
    "# Read through a context manager so the file handle is closed deterministically, and\n",
    "# pin the encoding so the vocabulary does not depend on the platform default.\n",
    "with open(path, encoding='utf-8') as corpus_file:\n",
    "    text = corpus_file.read()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Creating a vocabulary of unique characters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "85\n"
     ]
    }
   ],
   "source": [
    "# Vocabulary: every distinct character of the corpus, in sorted order.\n",
    "chars = sorted(set(text))\n",
    "# Size including one extra slot for the '\\0' entry inserted in the next cell.\n",
    "print(1 + len(chars))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Inserting the NUL character (`\\0`) at index 0, as it wasn't in the original text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Put the NUL character at index 0. Guarded so that re-running this cell does not\n",
    "# insert a second '\\0' and shift every character index by one.\n",
    "if '\\0' not in chars:\n",
    "    chars.insert(0, '\\0')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Creating a dictionary, mapping characters to index and index to characters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Two lookup tables over the vocabulary: character -> integer id, and integer id -> character.\n",
    "char_to_index = dict(zip(chars, range(len(chars))))\n",
    "index_to_char = dict(enumerate(chars))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Converting the entire Nietzsche text into a list of character indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Encode the whole corpus as one flat list of integer character ids.\n",
    "total_index = [char_to_index[symbol] for symbol in text]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First ten encoded character ids.\n",
    "total_index[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'PREFACE\\n\\n\\nSUPPOSING that '"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Decode the first 25 ids back to text as a round-trip check of the two lookup tables.\n",
    "''.join(index_to_char[i] for i in total_index[:25])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As we are predicting the character that follows each block of `pred_num` (here 25) characters, we need to create an array for each of the `pred_num` positions in a block, each acting as an input, and use the character right after the block as the output.\n",
    "\n",
    "For example, for the text 'this and that' with a block length of 7\n",
    "\n",
    "The input will be -> [['t'], ['h'], ['i'], ['s'], [' '], ['a'], ['n']] (one list per position, holding one entry per block) -> but instead of the characters, there will be the index of the character.\n",
    "\n",
    "And the output will be -> ['d']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Length of each input window, in characters.\n",
    "pred_num = 25\n",
    "# Start offsets of consecutive, non-overlapping windows of pred_num characters.\n",
    "window_starts = range(0, len(total_index) - 1 - pred_num, pred_num)\n",
    "# xin[i] collects the i-th character of every window (one list per position in the window).\n",
    "xin = [[total_index[start + i] for start in window_starts] for i in range(pred_num)]\n",
    "# y collects the character that comes directly after each window.\n",
    "y = [total_index[start + pred_num] for start in window_starts]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We are removing the last 2 entries of each array to keep the length of each array equal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Turn each per-position list into a NumPy array, dropping its last two entries.\n",
    "# NOTE(review): the trim count of 2 is hard-coded - confirm it still fits if pred_num changes.\n",
    "X = [np.stack(position_ids[:-2]) for position_ids in xin]\n",
    "# Targets get the same trim so they stay aligned with the inputs.\n",
    "Y = np.stack(y[:-2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([40, 44, 58, ..., 58, 57, 54]),\n",
       " array([42, 71, 67, ...,  2, 57, 67]),\n",
       " array([29, 74, 24, ..., 62, 61, 56]),\n",
       " array([30, 73,  2, ..., 67, 62, 73]),\n",
       " array([25, 61, 33, ..., 72, 72, 62]),\n",
       " array([27,  2, 72, ..., 62, 73, 73]),\n",
       " array([29, 62,  2, ..., 72, 72, 78]),\n",
       " array([ 1, 72, 73, ..., 73,  2,  8]),\n",
       " array([ 1,  2, 61, ..., 58, 54,  2]),\n",
       " array([ 1, 54, 58, ..., 57, 72, 63]),\n",
       " array([43,  2, 71, ...,  2,  2, 74]),\n",
       " array([45, 76, 58, ..., 74, 58, 72]),\n",
       " array([40, 68,  2, ..., 69, 72, 73]),\n",
       " array([40, 66, 67, ..., 68, 72,  2]),\n",
       " array([39, 54, 68, ..., 67, 58, 54]),\n",
       " array([43, 67, 73, ...,  2, 67, 72]),\n",
       " array([33,  9,  2, ..., 55, 73,  2]),\n",
       " array([38,  9, 60, ..., 78, 62, 73]),\n",
       " array([31, 76, 71, ...,  2, 54, 61]),\n",
       " array([ 2, 61, 68, ..., 73, 65, 58]),\n",
       " array([73, 54, 74, ..., 61,  2, 78]),\n",
       " array([61, 73, 67, ..., 58, 73,  2]),\n",
       " array([54,  2, 57, ...,  1, 68, 76]),\n",
       " array([73, 73,  1, ..., 26,  2, 58]),\n",
       " array([ 2, 61, 59, ..., 74, 72, 71])]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One array per position within a window.\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([44, 58, 68, 62, 73,  8, 67, 65])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First eight single-character targets.\n",
    "Y[:8]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((24033,), (24033,))"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: a position array and the target array should have the same length.\n",
    "X[0].shape, Y.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Return Sequences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we will predict the next character at every position, where the input will be all the characters before it.\n",
    "\n",
    "For example, to predict the 2nd character, the first character will be used\n",
    "\n",
    "To predict the 3rd character, the first and second characters will be used and so on."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Targets for the sequence model: the same windows as xin but starting one character later,\n",
    "# so ys[i] holds the character that follows xin[i] in every window.\n",
    "shifted_starts = range(1, len(total_index) - pred_num, pred_num)\n",
    "ys = [[total_index[start + i] for start in shifted_starts] for i in range(pred_num)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Convert each per-position target list to an array, dropping the last two entries\n",
    "# exactly as was done for X so inputs and targets stay aligned.\n",
    "Y_return = [np.stack(position_targets[:-2]) for position_targets in ys]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([40, 44, 58, ..., 58, 57, 54]),\n",
       " array([42, 71, 67, ...,  2, 57, 67]),\n",
       " array([29, 74, 24, ..., 62, 61, 56]),\n",
       " array([30, 73,  2, ..., 67, 62, 73]),\n",
       " array([25, 61, 33, ..., 72, 72, 62]),\n",
       " array([27,  2, 72, ..., 62, 73, 73]),\n",
       " array([29, 62,  2, ..., 72, 72, 78]),\n",
       " array([ 1, 72, 73, ..., 73,  2,  8]),\n",
       " array([ 1,  2, 61, ..., 58, 54,  2]),\n",
       " array([ 1, 54, 58, ..., 57, 72, 63]),\n",
       " array([43,  2, 71, ...,  2,  2, 74]),\n",
       " array([45, 76, 58, ..., 74, 58, 72]),\n",
       " array([40, 68,  2, ..., 69, 72, 73]),\n",
       " array([40, 66, 67, ..., 68, 72,  2]),\n",
       " array([39, 54, 68, ..., 67, 58, 54]),\n",
       " array([43, 67, 73, ...,  2, 67, 72]),\n",
       " array([33,  9,  2, ..., 55, 73,  2]),\n",
       " array([38,  9, 60, ..., 78, 62, 73]),\n",
       " array([31, 76, 71, ...,  2, 54, 61]),\n",
       " array([ 2, 61, 68, ..., 73, 65, 58]),\n",
       " array([73, 54, 74, ..., 61,  2, 78]),\n",
       " array([61, 73, 67, ..., 58, 73,  2]),\n",
       " array([54,  2, 57, ...,  1, 68, 76]),\n",
       " array([73, 73,  1, ..., 26,  2, 58]),\n",
       " array([ 2, 61, 59, ..., 74, 72, 71])]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inputs again, for comparison with the shifted targets displayed in the next cell.\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([42, 71, 67, ...,  2, 57, 67]),\n",
       " array([29, 74, 24, ..., 62, 61, 56]),\n",
       " array([30, 73,  2, ..., 67, 62, 73]),\n",
       " array([25, 61, 33, ..., 72, 72, 62]),\n",
       " array([27,  2, 72, ..., 62, 73, 73]),\n",
       " array([29, 62,  2, ..., 72, 72, 78]),\n",
       " array([ 1, 72, 73, ..., 73,  2,  8]),\n",
       " array([ 1,  2, 61, ..., 58, 54,  2]),\n",
       " array([ 1, 54, 58, ..., 57, 72, 63]),\n",
       " array([43,  2, 71, ...,  2,  2, 74]),\n",
       " array([45, 76, 58, ..., 74, 58, 72]),\n",
       " array([40, 68,  2, ..., 69, 72, 73]),\n",
       " array([40, 66, 67, ..., 68, 72,  2]),\n",
       " array([39, 54, 68, ..., 67, 58, 54]),\n",
       " array([43, 67, 73, ...,  2, 67, 72]),\n",
       " array([33,  9,  2, ..., 55, 73,  2]),\n",
       " array([38,  9, 60, ..., 78, 62, 73]),\n",
       " array([31, 76, 71, ...,  2, 54, 61]),\n",
       " array([ 2, 61, 68, ..., 73, 65, 58]),\n",
       " array([73, 54, 74, ..., 61,  2, 78]),\n",
       " array([61, 73, 67, ..., 58, 73,  2]),\n",
       " array([54,  2, 57, ...,  1, 68, 76]),\n",
       " array([73, 73,  1, ..., 26,  2, 58]),\n",
       " array([ 2, 61, 59, ..., 74, 72, 71]),\n",
       " array([44, 58, 68, ..., 57, 54, 58])]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Targets: the inputs shifted one character ahead.\n",
    "Y_return"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Size of the embedding table and of the softmax output layer.\n",
    "# NOTE(review): hard-coded; the vocabulary cell printed 85 - confirm 86 is intended\n",
    "# rather than len(chars).\n",
    "vocab_size = 86\n",
    "# Dimensionality of each character embedding.\n",
    "n_fac = 42\n",
    "# Number of units in the recurrent layer (despite the name, not a count of layers).\n",
    "hidden_layers = 256"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As we are setting return_sequences=True, the RNN outputs one vector per time step instead of only the last one, so we need to wrap the Dense layer in a TimeDistributed layer to apply it at every step of the sequence."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Character-level sequence model, assembled layer by layer.\n",
    "return_model = Sequential()\n",
    "# Map each character id to a dense n_fac-dimensional vector.\n",
    "return_model.add(Embedding(vocab_size, n_fac, input_length=pred_num))\n",
    "# Recurrent layer that emits its hidden state at every time step.\n",
    "return_model.add(SimpleRNN(hidden_layers, return_sequences=True, activation='relu'))\n",
    "# Per-time-step softmax over the vocabulary.\n",
    "return_model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "embedding_1 (Embedding)      (None, 25, 42)            3612      \n",
      "_________________________________________________________________\n",
      "simple_rnn_1 (SimpleRNN)     (None, 25, 256)           76544     \n",
      "_________________________________________________________________\n",
      "time_distributed_1 (TimeDist (None, 25, 86)            22102     \n",
      "=================================================================\n",
      "Total params: 102,258\n",
      "Trainable params: 102,258\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "# Layer-by-layer overview of output shapes and parameter counts.\n",
    "return_model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Targets are integer character ids, hence the sparse variant of cross-entropy.\n",
    "return_model.compile(\n",
    "    optimizer=Adam(),\n",
    "    loss='sparse_categorical_crossentropy',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Place the per-position arrays side by side: rows are windows, columns are positions.\n",
    "X_model = np.stack(X, axis=1)\n",
    "# Same layout for the targets, plus a trailing singleton axis as expected by the\n",
    "# sparse cross-entropy loss the model was compiled with.\n",
    "Y_model = np.stack(Y_return, axis=1)[..., np.newaxis]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 2.7342\n",
      "Epoch 2/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 2.1083\n",
      "Epoch 3/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 1.9187\n",
      "Epoch 4/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 1.8055\n",
      "Epoch 5/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 1.7336\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f2bdc528668>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Initial training run: five epochs with the optimizer's default settings.\n",
    "return_model.fit(X_model, Y_model, batch_size=64, epochs=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 1.6826\n",
      "Epoch 2/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 1.6446\n",
      "Epoch 3/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 1.6154\n",
      "Epoch 4/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 1.5921\n",
      "Epoch 5/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 1.5724\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f2bd6ca3b38>"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Lower the learning rate for fine-tuning. The optimizer's `lr` is a backend variable,\n",
    "# so it is updated in place with K.set_value; plain attribute assignment would only\n",
    "# rebind the Python attribute and leave the already-built training step unchanged.\n",
    "K.set_value(return_model.optimizer.lr, 1e-4)\n",
    "return_model.fit(X_model, Y_model, batch_size=64, epochs=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 1.5562\n",
      "Epoch 2/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 1.5415\n",
      "Epoch 3/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 1.5291\n",
      "Epoch 4/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 1.5185\n",
      "Epoch 5/5\n",
      "24033/24033 [==============================] - 25s 1ms/step - loss: 1.5089\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f2bd6ca3da0>"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Five more epochs at the reduced learning rate. The value is written into the\n",
    "# optimizer's backend variable with K.set_value, because assigning a float to\n",
    "# `optimizer.lr` does not reach the already-built training step.\n",
    "K.set_value(return_model.optimizer.lr, 1e-4)\n",
    "return_model.fit(X_model, Y_model, batch_size=64, epochs=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Persist the trained weights to an HDF5 file in the working directory.\n",
    "return_model.save_weights('return_sequences_25.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def predict_every_char(inp):\n",
    "    \"\"\"Predict the next character for every position of `inp`.\n",
    "\n",
    "    The text is cut into consecutive windows of `pred_num` characters and a short\n",
    "    final window is right-padded with spaces. All windows are passed to\n",
    "    `return_model` as a single batch, and the highest-scoring character is taken\n",
    "    at each position.\n",
    "\n",
    "    Args:\n",
    "        inp: Text to run through the model. Every character must be a key of\n",
    "            `char_to_index`.\n",
    "\n",
    "    Returns:\n",
    "        The predicted characters as one string, `pred_num` characters per window,\n",
    "        so predictions made at padding positions are included. An empty `inp`\n",
    "        yields ''.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If `inp` contains a character that is not in `char_to_index`.\n",
    "    \"\"\"\n",
    "    if not inp:\n",
    "        # Nothing to score, and there is no batch to hand to the model.\n",
    "        return ''\n",
    "\n",
    "    # Fixed-size windows; ljust pads only the last, shorter one with spaces.\n",
    "    windows = [inp[start:start + pred_num].ljust(pred_num)\n",
    "               for start in range(0, len(inp), pred_num)]\n",
    "    batch = np.array([[char_to_index[ch] for ch in window] for window in windows])\n",
    "    # One prediction call for all windows instead of one call per window.\n",
    "    scores = return_model.predict(batch)\n",
    "    # Most probable vocabulary id at each position of each window.\n",
    "    best_ids = scores.argmax(axis=-1)\n",
    "    return ''.join(index_to_char[idx] for idx in best_ids.ravel())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'nd the sedsiiat tt       '"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Short input (fewer than pred_num characters): padded with spaces before prediction.\n",
    "predict_every_char('and the boy left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'hen ss tt                '"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Another short input; most of the window is padding.\n",
    "predict_every_char('this is')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'4]\\nbGter tiseng tiscovere  tn tony tf the soas aonelesent vle tntuons oanelton feststion  of traasure is tvptionsoor tts pwn toye  t milty t man boceroitn the sanf-aonsrmpl tiesh taanacter sts oiwf ar   la  tnd tlso tn the r tntu tf tulf-ah sure taheeush timgrr and cuinnseng   aostrrison  ond toarnsng tf the sogel  tncu af toneers  otmely tnfoans oiine y ahch aoture  oan be sstsahe srneral txpopsthon of the r sill to tive aahe r sowves   The  avpeer ioe sost srin ul avpe  cdc  ah txtepa on tney tor t mhme toom the siaryngss ond th kingss on thinh ahe  are sornner ay the r treat man ol tn icidto ond the r socjection oons shll nfher thet the r tfn  I                   '"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Longer passage that spans many windows of pred_num characters.\n",
    "predict_every_char(\"140 After having discovered in many of the less comprehensible actions mere manifestations of pleasure in emotion for its own sake, I fancy I can detect in the self contempt which characterises holy persons, and also in their acts of self torture (through hunger and scourgings, distortions and chaining of the limbs, acts of madness) simply a means whereby such natures may resist the general exhaustion of their will to live (their nerves). They employ the most painful expedients to escape if only for a time from the heaviness and weariness in which they are steeped by their great mental indolence and their subjection to a will other than their own.\")"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"Using TensorFlow backend.\n"
	]
	}
	],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"import matplotlib.pyplot as plt\n",
	"%matplotlib inline\n",
	"from keras.utils.data_utils import get_file\n",
	"from keras.datasets import imdb\n",
	"from keras.preprocessing import sequence\n",
	"from keras.models import Sequential\n",
	"from keras.layers import Dense, Dropout, Activation, Flatten\n",
	"from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional\n",
	"from keras.layers import SimpleRNN, TimeDistributed\n",
	"from keras.layers.normalization import BatchNormalization\n",
	"from keras.utils import np_utils\n",
	"from keras.layers import Conv1D, MaxPooling1D, ZeroPadding1D\n",
	"from keras.utils import np_utils\n",
	"from keras.optimizers import Adam\n",
	"import pickle\n",
	"#import bcolz\n",
	"import re\n",
	"from numpy.random import random, permutation, randn, normal, uniform, choice"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"path = get_file('nietzsche.txt', origin=\"https://s3.amazonaws.com/text-datasets/nietzsche.txt\")\n",
	"text = open(path).read()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Creating a vocabulary of unique characters"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false,
	"scrolled": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"85\n"
	]
	}
	],
	"source": [
	"chars = sorted(list(set(text)))\n",
	"print(len(chars)+1)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Inserting 0 as it wasn't in the original text"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"chars.insert(0, '\\0')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Creating a dictionary, mapping characters to index and index to characters"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"char_to_index = {v:i for i,v in enumerate(chars)}\n",
	"index_to_char = {i:v for i,v in enumerate(chars)}"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Converting the entire nietzsche text into index of characters"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"total_index = [char_to_index[char] for char in text]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"total_index[:10]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'PREFACE\\n\\n\\nSUPPOSING that '"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"''.join(index_to_char[i] for i in total_index[:25])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"As were are predicting the 8th character, we need to create an array of the first 7 characters each acting as an input and the last character as the output.\n",
	"\n",
	"For example, for the text 'this and that'\n",
	"\n",
	"The input will be -> [['t', ' '], ['h', 't'], ['i', 'h'], ['s', 'a'], [' ', 't'], ['a'], ['n']] -> but instead of the characters, there will be the index of the character.\n",
	"\n",
	"And the output will be -> ['d']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"pred_num = 25\n",
	"xin = [[total_index[j+i] for j in range(0, len(total_index)-1-pred_num, pred_num)] for i in range(pred_num)]\n",
	"y = [total_index[i+pred_num] for i in range(0, len(total_index)-1-pred_num, pred_num)]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"We are removing the last 2 characters to keep the length of each array equal"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"X = [np.stack(xin[i][:-2]) for i in range(pred_num)]\n",
	"Y = np.stack(y[:-2])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[array([40, 44, 58, ..., 58, 57, 54]),\n",
	" array([42, 71, 67, ..., 2, 57, 67]),\n",
	" array([29, 74, 24, ..., 62, 61, 56]),\n",
	" array([30, 73, 2, ..., 67, 62, 73]),\n",
	" array([25, 61, 33, ..., 72, 72, 62]),\n",
	" array([27, 2, 72, ..., 62, 73, 73]),\n",
	" array([29, 62, 2, ..., 72, 72, 78]),\n",
	" array([ 1, 72, 73, ..., 73, 2, 8]),\n",
	" array([ 1, 2, 61, ..., 58, 54, 2]),\n",
	" array([ 1, 54, 58, ..., 57, 72, 63]),\n",
	" array([43, 2, 71, ..., 2, 2, 74]),\n",
	" array([45, 76, 58, ..., 74, 58, 72]),\n",
	" array([40, 68, 2, ..., 69, 72, 73]),\n",
	" array([40, 66, 67, ..., 68, 72, 2]),\n",
	" array([39, 54, 68, ..., 67, 58, 54]),\n",
	" array([43, 67, 73, ..., 2, 67, 72]),\n",
	" array([33, 9, 2, ..., 55, 73, 2]),\n",
	" array([38, 9, 60, ..., 78, 62, 73]),\n",
	" array([31, 76, 71, ..., 2, 54, 61]),\n",
	" array([ 2, 61, 68, ..., 73, 65, 58]),\n",
	" array([73, 54, 74, ..., 61, 2, 78]),\n",
	" array([61, 73, 67, ..., 58, 73, 2]),\n",
	" array([54, 2, 57, ..., 1, 68, 76]),\n",
	" array([73, 73, 1, ..., 26, 2, 58]),\n",
	" array([ 2, 61, 59, ..., 74, 72, 71])]"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"X"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([44, 58, 68, 62, 73, 8, 67, 65])"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"Y[:8]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"((24033,), (24033,))"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"X[0].shape, Y.shape"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Return Sequences"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Here we will predict the next word where the input will be all the words before it.\n",
	"\n",
	"For example, to predict the 2nd word, first word will be used\n",
	"\n",
	"To predict the 3rd word, first and second word will be used and so on."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"ys = [[total_index[j+i] for j in range(1, len(total_index)-pred_num, pred_num)] for i in range(pred_num)]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"Y_return = [np.stack(ys[i][:-2]) for i in range(pred_num)]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[array([40, 44, 58, ..., 58, 57, 54]),\n",
	" array([42, 71, 67, ..., 2, 57, 67]),\n",
	" array([29, 74, 24, ..., 62, 61, 56]),\n",
	" array([30, 73, 2, ..., 67, 62, 73]),\n",
	" array([25, 61, 33, ..., 72, 72, 62]),\n",
	" array([27, 2, 72, ..., 62, 73, 73]),\n",
	" array([29, 62, 2, ..., 72, 72, 78]),\n",
	" array([ 1, 72, 73, ..., 73, 2, 8]),\n",
	" array([ 1, 2, 61, ..., 58, 54, 2]),\n",
	" array([ 1, 54, 58, ..., 57, 72, 63]),\n",
	" array([43, 2, 71, ..., 2, 2, 74]),\n",
	" array([45, 76, 58, ..., 74, 58, 72]),\n",
	" array([40, 68, 2, ..., 69, 72, 73]),\n",
	" array([40, 66, 67, ..., 68, 72, 2]),\n",
	" array([39, 54, 68, ..., 67, 58, 54]),\n",
	" array([43, 67, 73, ..., 2, 67, 72]),\n",
	" array([33, 9, 2, ..., 55, 73, 2]),\n",
	" array([38, 9, 60, ..., 78, 62, 73]),\n",
	" array([31, 76, 71, ..., 2, 54, 61]),\n",
	" array([ 2, 61, 68, ..., 73, 65, 58]),\n",
	" array([73, 54, 74, ..., 61, 2, 78]),\n",
	" array([61, 73, 67, ..., 58, 73, 2]),\n",
	" array([54, 2, 57, ..., 1, 68, 76]),\n",
	" array([73, 73, 1, ..., 26, 2, 58]),\n",
	" array([ 2, 61, 59, ..., 74, 72, 71])]"
	]
	},
	"execution_count": 16,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"X"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[array([42, 71, 67, ..., 2, 57, 67]),\n",
	" array([29, 74, 24, ..., 62, 61, 56]),\n",
	" array([30, 73, 2, ..., 67, 62, 73]),\n",
	" array([25, 61, 33, ..., 72, 72, 62]),\n",
	" array([27, 2, 72, ..., 62, 73, 73]),\n",
	" array([29, 62, 2, ..., 72, 72, 78]),\n",
	" array([ 1, 72, 73, ..., 73, 2, 8]),\n",
	" array([ 1, 2, 61, ..., 58, 54, 2]),\n",
	" array([ 1, 54, 58, ..., 57, 72, 63]),\n",
	" array([43, 2, 71, ..., 2, 2, 74]),\n",
	" array([45, 76, 58, ..., 74, 58, 72]),\n",
	" array([40, 68, 2, ..., 69, 72, 73]),\n",
	" array([40, 66, 67, ..., 68, 72, 2]),\n",
	" array([39, 54, 68, ..., 67, 58, 54]),\n",
	" array([43, 67, 73, ..., 2, 67, 72]),\n",
	" array([33, 9, 2, ..., 55, 73, 2]),\n",
	" array([38, 9, 60, ..., 78, 62, 73]),\n",
	" array([31, 76, 71, ..., 2, 54, 61]),\n",
	" array([ 2, 61, 68, ..., 73, 65, 58]),\n",
	" array([73, 54, 74, ..., 61, 2, 78]),\n",
	" array([61, 73, 67, ..., 58, 73, 2]),\n",
	" array([54, 2, 57, ..., 1, 68, 76]),\n",
	" array([73, 73, 1, ..., 26, 2, 58]),\n",
	" array([ 2, 61, 59, ..., 74, 72, 71]),\n",
	" array([44, 58, 68, ..., 57, 54, 58])]"
	]
	},
	"execution_count": 17,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"Y_return"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"vocab_size = 86\n",
	"n_fac = 42\n",
	"hidden_layers = 256"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"As we are setting return_sequences=True, we need to wrap the Dense layer in a TimeDistributed Layer since it is a sequence."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"return_model = Sequential([\n",
	" Embedding(vocab_size, n_fac, input_length=pred_num),\n",
	" SimpleRNN(hidden_layers, return_sequences=True, activation='relu'),\n",
	" TimeDistributed(Dense(vocab_size, activation='softmax'))\n",
	" ])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"_________________________________________________________________\n",
	"Layer (type) Output Shape Param # \n",
	"=================================================================\n",
	"embedding_1 (Embedding) (None, 25, 42) 3612 \n",
	"_________________________________________________________________\n",
	"simple_rnn_1 (SimpleRNN) (None, 25, 256) 76544 \n",
	"_________________________________________________________________\n",
	"time_distributed_1 (TimeDist (None, 25, 86) 22102 \n",
	"=================================================================\n",
	"Total params: 102,258\n",
	"Trainable params: 102,258\n",
	"Non-trainable params: 0\n",
	"_________________________________________________________________\n"
	]
	}
	],
	"source": [
	"return_model.summary()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"return_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"X_model = np.stack(X, 1)\n",
	"Y_model = np.expand_dims(np.stack(Y_return, 1), axis=-1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Epoch 1/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 2.7342\n",
	"Epoch 2/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 2.1083\n",
	"Epoch 3/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 1.9187\n",
	"Epoch 4/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 1.8055\n",
	"Epoch 5/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 1.7336\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"<keras.callbacks.History at 0x7f2bdc528668>"
	]
	},
	"execution_count": 23,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"return_model.fit(X_model, Y_model, batch_size=64, epochs=5)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Epoch 1/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 1.6826\n",
	"Epoch 2/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 1.6446\n",
	"Epoch 3/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 1.6154\n",
	"Epoch 4/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 1.5921\n",
	"Epoch 5/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 1.5724\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"<keras.callbacks.History at 0x7f2bd6ca3b38>"
	]
	},
	"execution_count": 24,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"return_model.optimizer.lr = 1e-4\n",
	"return_model.fit(X_model, Y_model, batch_size=64, epochs=5)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Epoch 1/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 1.5562\n",
	"Epoch 2/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 1.5415\n",
	"Epoch 3/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 1.5291\n",
	"Epoch 4/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 1.5185\n",
	"Epoch 5/5\n",
	"24033/24033 [==============================] - 25s 1ms/step - loss: 1.5089\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"<keras.callbacks.History at 0x7f2bd6ca3da0>"
	]
	},
	"execution_count": 25,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"return_model.optimizer.lr = 1e-4\n",
	"return_model.fit(X_model, Y_model, batch_size=64, epochs=5)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"return_model.save_weights('return_sequences_25.h5')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def predict_every_char(inp):\n",
	" l = []\n",
	" p = 0\n",
	" while p<len(inp):\n",
	" pre_inp = inp[p:p+pred_num]\n",
	" if len(pre_inp) < pred_num:\n",
	" pre_inp = pre_inp + ' '*(pred_num - len(pre_inp))\n",
	" l.append(pre_inp)\n",
	" else:\n",
	" l.append(pre_inp) \n",
	" p+=pred_num\n",
	"\n",
	"# index = [char_to_index[i] for i in inp]\n",
	"# arr = np.expand_dims(index, axis=0)\n",
	"# prediction = return_model.predict(arr)\n",
	"# return ''.join([index_to_char[np.argmax(i)] for i in prediction[0]])\n",
	" \n",
	" final = []\n",
	" for half in l:\n",
	" index = [char_to_index[i] for i in half]\n",
	" arr = np.expand_dims(index, axis=0)\n",
	" prediction = return_model.predict(arr)\n",
	" final.append(''.join([index_to_char[np.argmax(i)] for i in prediction[0]]))\n",
	" \n",
	" return ''.join(final)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'nd the sedsiiat tt '"
	]
	},
	"execution_count": 28,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"predict_every_char('and the boy left')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'hen ss tt '"
	]
	},
	"execution_count": 29,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"predict_every_char('this is')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'4]\\nbGter tiseng tiscovere tn tony tf the soas aonelesent vle tntuons oanelton feststion of traasure is tvptionsoor tts pwn toye t milty t man boceroitn the sanf-aonsrmpl tiesh taanacter sts oiwf ar la tnd tlso tn the r tntu tf tulf-ah sure taheeush timgrr and cuinnseng aostrrison ond toarnsng tf the sogel tncu af toneers otmely tnfoans oiine y ahch aoture oan be sstsahe srneral txpopsthon of the r sill to tive aahe r sowves The avpeer ioe sost srin ul avpe cdc ah txtepa on tney tor t mhme toom the siaryngss ond th kingss on thinh ahe are sornner ay the r treat man ol tn icidto ond the r socjection oons shll nfher thet the r tfn I '"
	]
	},
	"execution_count": 30,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"predict_every_char(\"140 After having discovered in many of the less comprehensible actions mere manifestations of pleasure in emotion for its own sake, I fancy I can detect in the self contempt which characterises holy persons, and also in their acts of self torture (through hunger and scourgings, distortions and chaining of the limbs, acts of madness) simply a means whereby such natures may resist the general exhaustion of their will to live (their nerves). They employ the most painful expedients to escape if only for a time from the heaviness and weariness in which they are steeped by their great mental indolence and their subjection to a will other than their own.\")"
	]
	}
	],
	"metadata": {
	"anaconda-cloud": {},
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}