Created
November 3, 2018 12:14
-
-
Save euphoris/df4228d6a4597366535cd0a86ec88611 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "practice10.ipynb", | |
"version": "0.3.2", | |
"provenance": [], | |
"collapsed_sections": [] | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"accelerator": "GPU" | |
}, | |
"cells": [ | |
{ | |
"metadata": { | |
"id": "pzozBwWQiedL", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "76a21baa-345e-4550-91c1-ee72af6dd22c" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"import keras\n", | |
"import pandas as pd" | |
], | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Using TensorFlow backend.\n" | |
], | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "bvN-e4i1kQ5i", | |
"colab_type": "text" | |
}, | |
"cell_type": "markdown", | |
"source": [ | |
"데이터 파일을 다운로드" | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "I77on9FQip4U", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 214 | |
}, | |
"outputId": "6ed7e71a-ae46-46c7-f2a7-ffe9ae17bb1c" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip" | |
], | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"--2018-11-03 12:01:54-- https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip\n", | |
"Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.249\n", | |
"Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:443... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 84188 (82K) [application/zip]\n", | |
"Saving to: ‘sentiment labelled sentences.zip’\n", | |
"\n", | |
"\r sentiment 0%[ ] 0 --.-KB/s \r sentiment 29%[====> ] 24.00K 84.9KB/s \rsentiment labelled 100%[===================>] 82.21K 194KB/s in 0.4s \n", | |
"\n", | |
"2018-11-03 12:01:55 (194 KB/s) - ‘sentiment labelled sentences.zip’ saved [84188/84188]\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "MuTrA0Z-kS7i", | |
"colab_type": "text" | |
}, | |
"cell_type": "markdown", | |
"source": [ | |
"압축을 푼다" | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "8b3vQJVVi7_V", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 249 | |
}, | |
"outputId": "fc8f3b71-6274-49c1-ceac-49b4134d491b" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"!unzip sentiment\\ labelled\\ sentences.zip" | |
], | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Archive: sentiment labelled sentences.zip\n", | |
" creating: sentiment labelled sentences/\n", | |
" inflating: sentiment labelled sentences/.DS_Store \n", | |
" creating: __MACOSX/\n", | |
" creating: __MACOSX/sentiment labelled sentences/\n", | |
" inflating: __MACOSX/sentiment labelled sentences/._.DS_Store \n", | |
" inflating: sentiment labelled sentences/amazon_cells_labelled.txt \n", | |
" inflating: sentiment labelled sentences/imdb_labelled.txt \n", | |
" inflating: __MACOSX/sentiment labelled sentences/._imdb_labelled.txt \n", | |
" inflating: sentiment labelled sentences/readme.txt \n", | |
" inflating: __MACOSX/sentiment labelled sentences/._readme.txt \n", | |
" inflating: sentiment labelled sentences/yelp_labelled.txt \n", | |
" inflating: __MACOSX/._sentiment labelled sentences \n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "UiTZPNR8jOUb", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"df = pd.read_csv('sentiment labelled sentences/amazon_cells_labelled.txt',\n", | |
" sep='\\t',\n", | |
" header=None)\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "H31jcemljQnI", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 206 | |
}, | |
"outputId": "3537e414-9e8c-4c65-aa03-cef346c7bb61" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"df.head()" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>So there is no way for me to plug it in here i...</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Good case, Excellent value.</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Great for the jawbone.</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Tied to charger for conversations lasting more...</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>The mic is great.</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 0 1\n", | |
"0 So there is no way for me to plug it in here i... 0\n", | |
"1 Good case, Excellent value. 1\n", | |
"2 Great for the jawbone. 1\n", | |
"3 Tied to charger for conversations lasting more... 0\n", | |
"4 The mic is great. 1" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 5 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "EQvKro5pjZ1X", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"tok = keras.preprocessing.text.Tokenizer()\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "X2IFMgh_l3_X", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"tok.fit_on_texts(df[0])" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "rLxI_KAsl_pg", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "226d951e-a3dd-4cc7-a71b-c59099c1e608" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"tok.word_index['plug']" | |
], | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"155" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 8 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "aTA1RSoWmESC", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"tok.index_word = {i: w for w, i in tok.word_index.items()}" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "I3OS5mTBmOuF", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "c8fa9528-627a-4edd-e71e-433281fd1757" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"tok.index_word[200]" | |
], | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"'broke'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 10 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "fF2Vk_sqmglO", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"seq = tok.texts_to_sequences(df[0])" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "qgHbBcg5myfb", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "45255621-c6f3-4332-ba10-6d77405c499a" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"df.loc[0,0]" | |
], | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"'So there is no way for me to plug it in here in the US unless I go by a converter.'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 12 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "X0casrGRm2tz", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 392 | |
}, | |
"outputId": "792eaa8c-9041-4929-b9e2-9c62e6321786" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"seq[0]" | |
], | |
"execution_count": 13, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"[33,\n", | |
" 117,\n", | |
" 5,\n", | |
" 53,\n", | |
" 214,\n", | |
" 11,\n", | |
" 47,\n", | |
" 8,\n", | |
" 155,\n", | |
" 4,\n", | |
" 19,\n", | |
" 337,\n", | |
" 19,\n", | |
" 1,\n", | |
" 546,\n", | |
" 416,\n", | |
" 2,\n", | |
" 241,\n", | |
" 190,\n", | |
" 6,\n", | |
" 812]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 13 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "qYP29CHIm1RH", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"MAXLEN = max(len(s) for s in seq)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "nl-PVBbCn_AE", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "1dfd93f4-c3d9-40d9-829e-9c31ea6cd2da" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"MAXLEN" | |
], | |
"execution_count": 15, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"30" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 15 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "v1NpdswWn_6g", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"pad = keras.preprocessing.sequence.pad_sequences(seq, MAXLEN)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "ad5emDn-onGd", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 71 | |
}, | |
"outputId": "619252b1-42d7-4017-f387-5de200533ed8" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"pad[0]" | |
], | |
"execution_count": 17, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 33, 117, 5, 53,\n", | |
" 214, 11, 47, 8, 155, 4, 19, 337, 19, 1, 546, 416, 2,\n", | |
" 241, 190, 6, 812], dtype=int32)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 17 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "G39RNvJSooH6", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"from sklearn.model_selection import train_test_split\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "sAzRyOg_o_Zc", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"X_train, X_test, y_train, y_test = train_test_split(\n", | |
" pad, df[1], test_size=.2, random_state=1234)\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "1xr0mtT9pCh2", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"NUM_WORDS = len(tok.index_word) + 1" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "exAeqqpjtrsZ", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "d02a3316-976e-4c84-e983-3dc1273efb82" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"NUM_WORDS" | |
], | |
"execution_count": 21, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"1879" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 21 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "OnoccqO_tvGj", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"from keras.models import Sequential\n", | |
"from keras.layers import Dense, Embedding, LSTM" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "2972S2uUtzL3", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"rnn = Sequential()" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "DN75ai0Nt1vS", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"rnn.add(Embedding(input_dim=NUM_WORDS, \n", | |
" output_dim=8, \n", | |
" input_length=MAXLEN,\n", | |
" mask_zero=True))" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "WBNqePuduuKp", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"rnn.add(LSTM(16, return_sequences=False))" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "6IsojJkRuxhi", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"rnn.add(Dense(1, activation='sigmoid'))" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "-zH5_REwvyEF", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"from keras.optimizers import Adam" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "fR_LmYd8wBw6", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"rnn.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "V1N11tUTwHMP", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 392 | |
}, | |
"outputId": "48344325-8316-485e-cef9-b7432ad8aa6c" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"rnn.fit(X_train, y_train, epochs=10)" | |
], | |
"execution_count": 29, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Epoch 1/10\n", | |
"800/800 [==============================] - 5s 6ms/step - loss: 0.6924 - acc: 0.5425\n", | |
"Epoch 2/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 0.6864 - acc: 0.7238\n", | |
"Epoch 3/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 0.6540 - acc: 0.8687\n", | |
"Epoch 4/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 0.5475 - acc: 0.8687\n", | |
"Epoch 5/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 0.4368 - acc: 0.9037\n", | |
"Epoch 6/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 0.3396 - acc: 0.9387\n", | |
"Epoch 7/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 0.2856 - acc: 0.9525\n", | |
"Epoch 8/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 0.2335 - acc: 0.9688\n", | |
"Epoch 9/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 0.2052 - acc: 0.9700\n", | |
"Epoch 10/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 0.1739 - acc: 0.9725\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<keras.callbacks.History at 0x7f27c541d240>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 29 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "l1kjZgp1wKO5", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"y_rnn = rnn.predict_classes(X_test)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "1hkqxLSPw3OB", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 71 | |
}, | |
"outputId": "6562b179-69da-4d56-c3d5-2bd16be3b5cc" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"X_test[0]" | |
], | |
"execution_count": 31, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", | |
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", | |
" 0, 0, 0, 136, 306, 1525, 5, 415], dtype=int32)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 31 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "Q86wV9Muw5r5", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "f280e63e-1e4a-468a-a2f4-2e37c7dcd204" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"tok.index_word[415]" | |
], | |
"execution_count": 32, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"'awful'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 32 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "XM7hFj3ow-U0", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "a23c5e73-4e7a-4ee8-d543-6b482ab18e29" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"y_rnn[0]" | |
], | |
"execution_count": 33, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([1], dtype=int32)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 33 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "EbS9rrGHxHRU", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"from sklearn.metrics import accuracy_score" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "YrGa7qMZxNqI", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "12d38c4b-870e-40d5-c0eb-71155ab010ed" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"accuracy_score(y_test, y_rnn)\n" | |
], | |
"execution_count": 36, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"0.81" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 36 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "9V3tXdHUxPdg", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "66b56fa4-9d65-4090-804e-637fa78b4adb" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"len(tok.word_index)" | |
], | |
"execution_count": 37, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"1878" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 37 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "ByJVYy_KdCd2", | |
"colab_type": "text" | |
}, | |
"cell_type": "markdown", | |
"source": [ | |
"## 언어 모형" | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "zUVnTuih1Xwy", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"tok.word_index['<START>'] = start = len(tok.word_index) + 1\n", | |
"tok.index_word[start] = '<START>'\n", | |
"\n", | |
"tok.word_index['<END>'] = end = len(tok.word_index) + 1\n", | |
"tok.index_word[end] = '<END>'" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "WOJ1mcFP1m5-", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "16feb203-dbda-408e-9d71-8ffa7c5c8c78" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"len(tok.word_index)" | |
], | |
"execution_count": 46, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"1880" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 46 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "qqlF7W7T1qLm", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"prev_seq = []\n", | |
"next_seq = []\n", | |
"for s in seq:\n", | |
" prev_seq.append([start] + s) # 입력 문장 앞에 시작 표시\n", | |
" next_seq.append(s + [end]) # 출력 문장 뒤에 끝 표시" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "G4w9PX0X2rWW", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"MAXLEN = max(len(s) for s in prev_seq)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "_jtnzqPw3Cui", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"from keras.preprocessing.sequence import pad_sequences" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "CH4Jxqjo2w6m", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"prev_pad = pad_sequences(prev_seq, MAXLEN, padding='post')\n", | |
"next_pad = pad_sequences(next_seq, MAXLEN, padding='post')" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "yMXffyBN3Gzr", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"from sklearn.model_selection import train_test_split" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "R-oP3EDm3Q7Y", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"x_train, x_test, y_train, y_test = train_test_split(prev_pad, next_pad, test_size=.2, random_state=1234)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "dYJgi-Za3Tf7", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"NUM_WORDS = len(tok.index_word) + 1" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "gYRnDLp-4w5K", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"from keras.layers import TimeDistributed" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "EDePwhfb3XHl", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"rnn = Sequential()" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "_Uzivj4C3a-d", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"rnn.add(Embedding(input_dim=NUM_WORDS, output_dim=8, input_length=MAXLEN, mask_zero=True))\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "4PHuAy2K3cX3", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"rnn.add(LSTM(16, return_sequences=True))" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "TdS9G90I3jPf", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"rnn.add(TimeDistributed(Dense(NUM_WORDS, activation='softmax')))" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "XUnz8IkE4uZq", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 249 | |
}, | |
"outputId": "f0031975-dec6-4606-e191-4cf59060ac94" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"rnn.summary()\n" | |
], | |
"execution_count": 59, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"_________________________________________________________________\n", | |
"Layer (type) Output Shape Param # \n", | |
"=================================================================\n", | |
"embedding_2 (Embedding) (None, 31, 8) 15048 \n", | |
"_________________________________________________________________\n", | |
"lstm_2 (LSTM) (None, 31, 16) 1600 \n", | |
"_________________________________________________________________\n", | |
"time_distributed_1 (TimeDist (None, 31, 1881) 31977 \n", | |
"=================================================================\n", | |
"Total params: 48,625\n", | |
"Trainable params: 48,625\n", | |
"Non-trainable params: 0\n", | |
"_________________________________________________________________\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "ehRrUtEj40Mg", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "669185f7-d165-40a5-b7fe-ce3b8d01602f" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"y_train.shape" | |
], | |
"execution_count": 60, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(800, 31)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 60 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "ElxIhz3p44gX", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"import numpy" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "75-DuB1i5R1P", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"y_train_dims = numpy.expand_dims(y_train, 2)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "YycUb-oc5WkU", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "1dfffb68-313e-4015-ce7d-e8b48a9a2049" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"y_train_dims.shape" | |
], | |
"execution_count": 63, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(800, 31, 1)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 63 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "VqD2JP_Z5X9u", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"rnn.compile(optimizer=Adam(lr=.1),\n", | |
" loss='sparse_categorical_crossentropy', \n", | |
" metrics=['accuracy'],\n", | |
" sample_weight_mode='temporal')\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "3Iy-ZM_p6qQU", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 392 | |
}, | |
"outputId": "9ef5a0d3-a88f-48bb-d307-38abd62b6aeb" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"rnn.fit(x_train, y_train_dims, epochs=10)" | |
], | |
"execution_count": 65, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Epoch 1/10\n", | |
"800/800 [==============================] - 3s 4ms/step - loss: 6.8000 - acc: 0.0767\n", | |
"Epoch 2/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 5.9306 - acc: 0.1116\n", | |
"Epoch 3/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 5.7212 - acc: 0.1149\n", | |
"Epoch 4/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 5.5417 - acc: 0.1211\n", | |
"Epoch 5/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 5.3669 - acc: 0.1255\n", | |
"Epoch 6/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 5.2188 - acc: 0.1259\n", | |
"Epoch 7/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 5.0893 - acc: 0.1320\n", | |
"Epoch 8/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 4.9350 - acc: 0.1401\n", | |
"Epoch 9/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 4.8116 - acc: 0.1502\n", | |
"Epoch 10/10\n", | |
"800/800 [==============================] - 2s 3ms/step - loss: 4.6862 - acc: 0.1607\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<keras.callbacks.History at 0x7f27c03c74e0>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 65 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "5pfFnHTc6q42", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"new_sentence = [prev_seq[0][:10]]" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "itOFISCc7N5B", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "e1bd14ac-0630-4abd-d15b-ae4de748289d" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"new_sentence" | |
], | |
"execution_count": 67, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"[[1879, 33, 117, 5, 53, 214, 11, 47, 8, 155]]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 67 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "TA1XuEy47l0j", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"new_pad = pad_sequences(new_sentence, MAXLEN, padding='post')" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "FVx1qRRX79Ln", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"pred = rnn.predict(new_pad)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "npEMxCNF8BY6", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "dc767a0f-f85b-471a-f841-cfc066ad5532" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"pred[0, 9, :].argmax()" | |
], | |
"execution_count": 70, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"4" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 70 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "dP7LyRuO8CSm", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "606d3a08-4bc8-4006-ec18-460ccd03b350" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"tok.index_word[4]" | |
], | |
"execution_count": 71, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"'it'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 71 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "86QSteEDffHG", | |
"colab_type": "text" | |
}, | |
"cell_type": "markdown", | |
"source": [ | |
"새로운 문장으로 다음 단어 예측하기" | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "y0olSHxjAg0C", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"new_seq = tok.texts_to_sequences(['I am happy'])" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "DJrF4Sz3fJ3a", | |
"colab_type": "text" | |
}, | |
"cell_type": "markdown", | |
"source": [ | |
"모든 문장의 앞에 `<START>` 표시를 끼워넣는다" | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "i75Q5MFue6Qz", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"for s in new_seq:\n", | |
" s.insert(0, start)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "Tq3uEjICe8mC", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "be2d564e-2ca3-4725-9dcd-eab4776b8964" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"new_seq" | |
], | |
"execution_count": 80, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"[[1879, 2, 82, 114]]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 80 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "-FLrqfDYexp7", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"pad_seq = pad_sequences(new_seq, MAXLEN, padding='post')" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "wTbgYRgNeybS", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"pred = rnn.predict(pad_seq)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "Ff6xddFZfSvD", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"word_num = pred[0, 3, :].argmax()" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "bVq3fojhfaAi", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "87dedb97-3f56-4201-d81a-261377d86e9e" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"word_num" | |
], | |
"execution_count": 86, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"14" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 86 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "2gXS5wcQfV-a", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "a8f5f5f4-4f5f-40fd-f96f-1c81c52dfab4" | |
}, | |
"cell_type": "code", | |
"source": [ | |
"tok.index_word[word_num]" | |
], | |
"execution_count": 87, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"'with'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 87 | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment