euphoris/class10.ipynb

## class10.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "practice10.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "metadata": {
        "id": "pzozBwWQiedL",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "76a21baa-345e-4550-91c1-ee72af6dd22c"
      },
      "cell_type": "code",
      "source": [
        "import keras\n",
        "import pandas as pd"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Using TensorFlow backend.\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "metadata": {
        "id": "bvN-e4i1kQ5i",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "데이터 파일을 다운로드"
      ]
    },
    {
      "metadata": {
        "id": "I77on9FQip4U",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 214
        },
        "outputId": "6ed7e71a-ae46-46c7-f2a7-ffe9ae17bb1c"
      },
      "cell_type": "code",
      "source": [
        "!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip"
      ],
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "--2018-11-03 12:01:54--  https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip\n",
            "Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.249\n",
            "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 84188 (82K) [application/zip]\n",
            "Saving to: ‘sentiment labelled sentences.zip’\n",
            "\n",
            "\r          sentiment   0%[                    ]       0  --.-KB/s               \r         sentiment   29%[====>               ]  24.00K  84.9KB/s               \rsentiment labelled  100%[===================>]  82.21K   194KB/s    in 0.4s    \n",
            "\n",
            "2018-11-03 12:01:55 (194 KB/s) - ‘sentiment labelled sentences.zip’ saved [84188/84188]\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "id": "MuTrA0Z-kS7i",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "압축을 푼다"
      ]
    },
    {
      "metadata": {
        "id": "8b3vQJVVi7_V",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 249
        },
        "outputId": "fc8f3b71-6274-49c1-ceac-49b4134d491b"
      },
      "cell_type": "code",
      "source": [
        "!unzip sentiment\\ labelled\\ sentences.zip"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Archive:  sentiment labelled sentences.zip\n",
            "   creating: sentiment labelled sentences/\n",
            "  inflating: sentiment labelled sentences/.DS_Store  \n",
            "   creating: __MACOSX/\n",
            "   creating: __MACOSX/sentiment labelled sentences/\n",
            "  inflating: __MACOSX/sentiment labelled sentences/._.DS_Store  \n",
            "  inflating: sentiment labelled sentences/amazon_cells_labelled.txt  \n",
            "  inflating: sentiment labelled sentences/imdb_labelled.txt  \n",
            "  inflating: __MACOSX/sentiment labelled sentences/._imdb_labelled.txt  \n",
            "  inflating: sentiment labelled sentences/readme.txt  \n",
            "  inflating: __MACOSX/sentiment labelled sentences/._readme.txt  \n",
            "  inflating: sentiment labelled sentences/yelp_labelled.txt  \n",
            "  inflating: __MACOSX/._sentiment labelled sentences  \n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "id": "UiTZPNR8jOUb",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "df = pd.read_csv('sentiment labelled sentences/amazon_cells_labelled.txt',\n",
        "                 sep='\\t',\n",
        "                 header=None)\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "H31jcemljQnI",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "outputId": "3537e414-9e8c-4c65-aa03-cef346c7bb61"
      },
      "cell_type": "code",
      "source": [
        "df.head()"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>0</th>\n",
              "      <th>1</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>So there is no way for me to plug it in here i...</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Good case, Excellent value.</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Great for the jawbone.</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Tied to charger for conversations lasting more...</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>The mic is great.</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                                   0  1\n",
              "0  So there is no way for me to plug it in here i...  0\n",
              "1                        Good case, Excellent value.  1\n",
              "2                             Great for the jawbone.  1\n",
              "3  Tied to charger for conversations lasting more...  0\n",
              "4                                  The mic is great.  1"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 5
        }
      ]
    },
    {
      "metadata": {
        "id": "EQvKro5pjZ1X",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "tok = keras.preprocessing.text.Tokenizer()\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "X2IFMgh_l3_X",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "tok.fit_on_texts(df[0])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "rLxI_KAsl_pg",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "226d951e-a3dd-4cc7-a71b-c59099c1e608"
      },
      "cell_type": "code",
      "source": [
        "tok.word_index['plug']"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "155"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 8
        }
      ]
    },
    {
      "metadata": {
        "id": "aTA1RSoWmESC",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "tok.index_word = {i: w for w, i in tok.word_index.items()}"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "I3OS5mTBmOuF",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "c8fa9528-627a-4edd-e71e-433281fd1757"
      },
      "cell_type": "code",
      "source": [
        "tok.index_word[200]"
      ],
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'broke'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 10
        }
      ]
    },
    {
      "metadata": {
        "id": "fF2Vk_sqmglO",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "seq = tok.texts_to_sequences(df[0])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "qgHbBcg5myfb",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "45255621-c6f3-4332-ba10-6d77405c499a"
      },
      "cell_type": "code",
      "source": [
        "df.loc[0,0]"
      ],
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'So there is no way for me to plug it in here in the US unless I go by a converter.'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 12
        }
      ]
    },
    {
      "metadata": {
        "id": "X0casrGRm2tz",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 392
        },
        "outputId": "792eaa8c-9041-4929-b9e2-9c62e6321786"
      },
      "cell_type": "code",
      "source": [
        "seq[0]"
      ],
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[33,\n",
              " 117,\n",
              " 5,\n",
              " 53,\n",
              " 214,\n",
              " 11,\n",
              " 47,\n",
              " 8,\n",
              " 155,\n",
              " 4,\n",
              " 19,\n",
              " 337,\n",
              " 19,\n",
              " 1,\n",
              " 546,\n",
              " 416,\n",
              " 2,\n",
              " 241,\n",
              " 190,\n",
              " 6,\n",
              " 812]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 13
        }
      ]
    },
    {
      "metadata": {
        "id": "qYP29CHIm1RH",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "MAXLEN = max(len(s) for s in seq)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "nl-PVBbCn_AE",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "1dfd93f4-c3d9-40d9-829e-9c31ea6cd2da"
      },
      "cell_type": "code",
      "source": [
        "MAXLEN"
      ],
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "30"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 15
        }
      ]
    },
    {
      "metadata": {
        "id": "v1NpdswWn_6g",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "pad = keras.preprocessing.sequence.pad_sequences(seq, MAXLEN)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "ad5emDn-onGd",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 71
        },
        "outputId": "619252b1-42d7-4017-f387-5de200533ed8"
      },
      "cell_type": "code",
      "source": [
        "pad[0]"
      ],
      "execution_count": 17,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([  0,   0,   0,   0,   0,   0,   0,   0,   0,  33, 117,   5,  53,\n",
              "       214,  11,  47,   8, 155,   4,  19, 337,  19,   1, 546, 416,   2,\n",
              "       241, 190,   6, 812], dtype=int32)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 17
        }
      ]
    },
    {
      "metadata": {
        "id": "G39RNvJSooH6",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "from sklearn.model_selection import train_test_split\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "sAzRyOg_o_Zc",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "X_train, X_test, y_train, y_test = train_test_split(\n",
        "    pad, df[1], test_size=.2, random_state=1234)\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "1xr0mtT9pCh2",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "NUM_WORDS = len(tok.index_word) + 1"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "exAeqqpjtrsZ",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "d02a3316-976e-4c84-e983-3dc1273efb82"
      },
      "cell_type": "code",
      "source": [
        "NUM_WORDS"
      ],
      "execution_count": 21,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "1879"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 21
        }
      ]
    },
    {
      "metadata": {
        "id": "OnoccqO_tvGj",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "from keras.models import Sequential\n",
        "from keras.layers import Dense, Embedding, LSTM"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "2972S2uUtzL3",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "rnn = Sequential()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "DN75ai0Nt1vS",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "rnn.add(Embedding(input_dim=NUM_WORDS, \n",
        "                  output_dim=8, \n",
        "                  input_length=MAXLEN,\n",
        "                  mask_zero=True))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "WBNqePuduuKp",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "rnn.add(LSTM(16, return_sequences=False))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "6IsojJkRuxhi",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "rnn.add(Dense(1, activation='sigmoid'))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "-zH5_REwvyEF",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "from keras.optimizers import Adam"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "fR_LmYd8wBw6",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "rnn.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "V1N11tUTwHMP",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 392
        },
        "outputId": "48344325-8316-485e-cef9-b7432ad8aa6c"
      },
      "cell_type": "code",
      "source": [
        "rnn.fit(X_train, y_train, epochs=10)"
      ],
      "execution_count": 29,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Epoch 1/10\n",
            "800/800 [==============================] - 5s 6ms/step - loss: 0.6924 - acc: 0.5425\n",
            "Epoch 2/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 0.6864 - acc: 0.7238\n",
            "Epoch 3/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 0.6540 - acc: 0.8687\n",
            "Epoch 4/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 0.5475 - acc: 0.8687\n",
            "Epoch 5/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 0.4368 - acc: 0.9037\n",
            "Epoch 6/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 0.3396 - acc: 0.9387\n",
            "Epoch 7/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 0.2856 - acc: 0.9525\n",
            "Epoch 8/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 0.2335 - acc: 0.9688\n",
            "Epoch 9/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 0.2052 - acc: 0.9700\n",
            "Epoch 10/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 0.1739 - acc: 0.9725\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<keras.callbacks.History at 0x7f27c541d240>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 29
        }
      ]
    },
    {
      "metadata": {
        "id": "l1kjZgp1wKO5",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "y_rnn = rnn.predict_classes(X_test)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "1hkqxLSPw3OB",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 71
        },
        "outputId": "6562b179-69da-4d56-c3d5-2bd16be3b5cc"
      },
      "cell_type": "code",
      "source": [
        "X_test[0]"
      ],
      "execution_count": 31,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
              "          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,\n",
              "          0,    0,    0,  136,  306, 1525,    5,  415], dtype=int32)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 31
        }
      ]
    },
    {
      "metadata": {
        "id": "Q86wV9Muw5r5",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "f280e63e-1e4a-468a-a2f4-2e37c7dcd204"
      },
      "cell_type": "code",
      "source": [
        "tok.index_word[415]"
      ],
      "execution_count": 32,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'awful'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 32
        }
      ]
    },
    {
      "metadata": {
        "id": "XM7hFj3ow-U0",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "a23c5e73-4e7a-4ee8-d543-6b482ab18e29"
      },
      "cell_type": "code",
      "source": [
        "y_rnn[0]"
      ],
      "execution_count": 33,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([1], dtype=int32)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 33
        }
      ]
    },
    {
      "metadata": {
        "id": "EbS9rrGHxHRU",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "from sklearn.metrics import accuracy_score"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "YrGa7qMZxNqI",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "12d38c4b-870e-40d5-c0eb-71155ab010ed"
      },
      "cell_type": "code",
      "source": [
        "accuracy_score(y_test, y_rnn)\n"
      ],
      "execution_count": 36,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.81"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 36
        }
      ]
    },
    {
      "metadata": {
        "id": "9V3tXdHUxPdg",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "66b56fa4-9d65-4090-804e-637fa78b4adb"
      },
      "cell_type": "code",
      "source": [
        "len(tok.word_index)"
      ],
      "execution_count": 37,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "1878"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 37
        }
      ]
    },
    {
      "metadata": {
        "id": "ByJVYy_KdCd2",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "## 언어 모형"
      ]
    },
    {
      "metadata": {
        "id": "zUVnTuih1Xwy",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "tok.word_index['<START>'] = start = len(tok.word_index) + 1\n",
        "tok.index_word[start] = '<START>'\n",
        "\n",
        "tok.word_index['<END>'] = end = len(tok.word_index) + 1\n",
        "tok.index_word[end] = '<END>'"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "WOJ1mcFP1m5-",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "16feb203-dbda-408e-9d71-8ffa7c5c8c78"
      },
      "cell_type": "code",
      "source": [
        "len(tok.word_index)"
      ],
      "execution_count": 46,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "1880"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 46
        }
      ]
    },
    {
      "metadata": {
        "id": "qqlF7W7T1qLm",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "prev_seq = []\n",
        "next_seq = []\n",
        "for s in seq:\n",
        "    prev_seq.append([start] + s)  # 입력 문장 앞에 시작 표시\n",
        "    next_seq.append(s + [end])    # 출력 문장 뒤에 끝 표시"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "G4w9PX0X2rWW",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "MAXLEN = max(len(s) for s in prev_seq)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "_jtnzqPw3Cui",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "from keras.preprocessing.sequence import pad_sequences"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "CH4Jxqjo2w6m",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "prev_pad = pad_sequences(prev_seq, MAXLEN, padding='post')\n",
        "next_pad = pad_sequences(next_seq, MAXLEN, padding='post')"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "yMXffyBN3Gzr",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "from sklearn.model_selection import train_test_split"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "R-oP3EDm3Q7Y",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "x_train, x_test, y_train, y_test = train_test_split(prev_pad, next_pad, test_size=.2, random_state=1234)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "dYJgi-Za3Tf7",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "NUM_WORDS = len(tok.index_word) + 1"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "gYRnDLp-4w5K",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "from keras.layers import TimeDistributed"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "EDePwhfb3XHl",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "rnn = Sequential()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "_Uzivj4C3a-d",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "rnn.add(Embedding(input_dim=NUM_WORDS, output_dim=8, input_length=MAXLEN, mask_zero=True))\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "4PHuAy2K3cX3",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "rnn.add(LSTM(16, return_sequences=True))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "TdS9G90I3jPf",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "rnn.add(TimeDistributed(Dense(NUM_WORDS, activation='softmax')))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "XUnz8IkE4uZq",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 249
        },
        "outputId": "f0031975-dec6-4606-e191-4cf59060ac94"
      },
      "cell_type": "code",
      "source": [
        "rnn.summary()\n"
      ],
      "execution_count": 59,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding_2 (Embedding)      (None, 31, 8)             15048     \n",
            "_________________________________________________________________\n",
            "lstm_2 (LSTM)                (None, 31, 16)            1600      \n",
            "_________________________________________________________________\n",
            "time_distributed_1 (TimeDist (None, 31, 1881)          31977     \n",
            "=================================================================\n",
            "Total params: 48,625\n",
            "Trainable params: 48,625\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "id": "ehRrUtEj40Mg",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "669185f7-d165-40a5-b7fe-ce3b8d01602f"
      },
      "cell_type": "code",
      "source": [
        "y_train.shape"
      ],
      "execution_count": 60,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(800, 31)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 60
        }
      ]
    },
    {
      "metadata": {
        "id": "ElxIhz3p44gX",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "import numpy"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "75-DuB1i5R1P",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "y_train_dims = numpy.expand_dims(y_train, 2)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "YycUb-oc5WkU",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "1dfffb68-313e-4015-ce7d-e8b48a9a2049"
      },
      "cell_type": "code",
      "source": [
        "y_train_dims.shape"
      ],
      "execution_count": 63,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(800, 31, 1)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 63
        }
      ]
    },
    {
      "metadata": {
        "id": "VqD2JP_Z5X9u",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "rnn.compile(optimizer=Adam(lr=.1),\n",
        "            loss='sparse_categorical_crossentropy', \n",
        "            metrics=['accuracy'],\n",
        "            sample_weight_mode='temporal')\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "3Iy-ZM_p6qQU",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 392
        },
        "outputId": "9ef5a0d3-a88f-48bb-d307-38abd62b6aeb"
      },
      "cell_type": "code",
      "source": [
        "rnn.fit(x_train, y_train_dims, epochs=10)"
      ],
      "execution_count": 65,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Epoch 1/10\n",
            "800/800 [==============================] - 3s 4ms/step - loss: 6.8000 - acc: 0.0767\n",
            "Epoch 2/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 5.9306 - acc: 0.1116\n",
            "Epoch 3/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 5.7212 - acc: 0.1149\n",
            "Epoch 4/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 5.5417 - acc: 0.1211\n",
            "Epoch 5/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 5.3669 - acc: 0.1255\n",
            "Epoch 6/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 5.2188 - acc: 0.1259\n",
            "Epoch 7/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 5.0893 - acc: 0.1320\n",
            "Epoch 8/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 4.9350 - acc: 0.1401\n",
            "Epoch 9/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 4.8116 - acc: 0.1502\n",
            "Epoch 10/10\n",
            "800/800 [==============================] - 2s 3ms/step - loss: 4.6862 - acc: 0.1607\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<keras.callbacks.History at 0x7f27c03c74e0>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 65
        }
      ]
    },
    {
      "metadata": {
        "id": "5pfFnHTc6q42",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "new_sentence = [prev_seq[0][:10]]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "itOFISCc7N5B",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "e1bd14ac-0630-4abd-d15b-ae4de748289d"
      },
      "cell_type": "code",
      "source": [
        "new_sentence"
      ],
      "execution_count": 67,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[[1879, 33, 117, 5, 53, 214, 11, 47, 8, 155]]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 67
        }
      ]
    },
    {
      "metadata": {
        "id": "TA1XuEy47l0j",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "new_pad = pad_sequences(new_sentence, MAXLEN, padding='post')"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "FVx1qRRX79Ln",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "pred = rnn.predict(new_pad)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "npEMxCNF8BY6",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "dc767a0f-f85b-471a-f841-cfc066ad5532"
      },
      "cell_type": "code",
      "source": [
        "pred[0, 9, :].argmax()"
      ],
      "execution_count": 70,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "4"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 70
        }
      ]
    },
    {
      "metadata": {
        "id": "dP7LyRuO8CSm",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "606d3a08-4bc8-4006-ec18-460ccd03b350"
      },
      "cell_type": "code",
      "source": [
        "tok.index_word[4]"
      ],
      "execution_count": 71,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'it'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 71
        }
      ]
    },
    {
      "metadata": {
        "id": "86QSteEDffHG",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "새로운 문장으로 다음 단어 예측하기"
      ]
    },
    {
      "metadata": {
        "id": "y0olSHxjAg0C",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "new_seq = tok.texts_to_sequences(['I am happy'])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "DJrF4Sz3fJ3a",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "모든 문장의 앞에 `<START>` 표시를 끼워넣는다"
      ]
    },
    {
      "metadata": {
        "id": "i75Q5MFue6Qz",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "for s in new_seq:\n",
        "  s.insert(0, start)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "Tq3uEjICe8mC",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "be2d564e-2ca3-4725-9dcd-eab4776b8964"
      },
      "cell_type": "code",
      "source": [
        "new_seq"
      ],
      "execution_count": 80,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[[1879, 2, 82, 114]]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 80
        }
      ]
    },
    {
      "metadata": {
        "id": "-FLrqfDYexp7",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "pad_seq = pad_sequences(new_seq, MAXLEN, padding='post')"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "wTbgYRgNeybS",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "pred = rnn.predict(pad_seq)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "Ff6xddFZfSvD",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "word_num = pred[0, 3, :].argmax()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "bVq3fojhfaAi",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "87dedb97-3f56-4201-d81a-261377d86e9e"
      },
      "cell_type": "code",
      "source": [
        "word_num"
      ],
      "execution_count": 86,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "14"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 86
        }
      ]
    },
    {
      "metadata": {
        "id": "2gXS5wcQfV-a",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "outputId": "a8f5f5f4-4f5f-40fd-f96f-1c81c52dfab4"
      },
      "cell_type": "code",
      "source": [
        "tok.index_word[word_num]"
      ],
      "execution_count": 87,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'with'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 87
        }
      ]
    }
  ]
}