Last active
September 4, 2017 08:29
-
-
Save lasershow/13145277a48f9aebc7ccfbc25a1b251f to your computer and use it in GitHub Desktop.
迷惑メール 回答例
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"df = pd.read_table(\n", | |
" 'smsspamcollection/SMSSpamCollection', \n", | |
" names=['label', 'message']\n", | |
" )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>label</th>\n", | |
" <th>message</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>ham</td>\n", | |
" <td>Go until jurong point, crazy.. Available only ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>ham</td>\n", | |
" <td>Ok lar... Joking wif u oni...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>spam</td>\n", | |
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>ham</td>\n", | |
" <td>U dun say so early hor... U c already then say...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>ham</td>\n", | |
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" label message\n", | |
"0 ham Go until jurong point, crazy.. Available only ...\n", | |
"1 ham Ok lar... Joking wif u oni...\n", | |
"2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n", | |
"3 ham U dun say so early hor... U c already then say...\n", | |
"4 ham Nah I don't think he goes to usf, he lives aro..." | |
] | |
}, | |
"execution_count": 55, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"RangeIndex: 5572 entries, 0 to 5571\n", | |
"Data columns (total 2 columns):\n", | |
"label 5572 non-null object\n", | |
"message 5572 non-null object\n", | |
"dtypes: object(2)\n", | |
"memory usage: 87.1+ KB\n" | |
] | |
} | |
], | |
"source": [ | |
"df.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(5572, 2)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>label</th>\n", | |
" <th>message</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>Go until jurong point, crazy.. Available only ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0</td>\n", | |
" <td>Ok lar... Joking wif u oni...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0</td>\n", | |
" <td>U dun say so early hor... U c already then say...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0</td>\n", | |
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" label message\n", | |
"0 0 Go until jurong point, crazy.. Available only ...\n", | |
"1 0 Ok lar... Joking wif u oni...\n", | |
"2 1 Free entry in 2 a wkly comp to win FA Cup fina...\n", | |
"3 0 U dun say so early hor... U c already then say...\n", | |
"4 0 Nah I don't think he goes to usf, he lives aro..." | |
] | |
}, | |
"execution_count": 57, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df['label'] = df.label.map({'ham':0, 'spam':1})\n", | |
"print(df.shape)\n", | |
"df.head() " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 58, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.feature_extraction.text import CountVectorizer\n", | |
"count_vec_sample = CountVectorizer()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"messages = ['Thank you for calling.',\n", | |
" 'Thank you for your inquiry.',\n", | |
" 'Thanks for keeping in touch.',\n", | |
" 'Thanks for getting in touch with me?']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 60, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", | |
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n", | |
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", | |
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", | |
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", | |
" tokenizer=None, vocabulary=None)" | |
] | |
}, | |
"execution_count": 60, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"count_vec_sample.fit(messages)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": { | |
"collapsed": true, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"data = count_vec_sample.transform(messages)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"matrix([[1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],\n", | |
" [0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1],\n", | |
" [0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0],\n", | |
" [0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0]])" | |
] | |
}, | |
"execution_count": 62, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data.todense() " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 63, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"count_vec = CountVectorizer()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", | |
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n", | |
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", | |
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", | |
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", | |
" tokenizer=None, vocabulary=None)" | |
] | |
}, | |
"execution_count": 64, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"count_vec.fit(df.message)\n", | |
"\n", | |
"#出力する場合はコメントをはずしてください。\n", | |
"# count_vec.vocabulary_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.model_selection import train_test_split\n", | |
"\n", | |
"X_train, X_test, y_train, y_test = train_test_split(df['message'], \n", | |
" df['label'], \n", | |
" random_state=1)\n", | |
"\n", | |
"count_vector = CountVectorizer()\n", | |
"count_vector.fit(X_train)\n", | |
"X_train = count_vector.transform(X_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 66, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# count_vector.vocabulary_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 67, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"X_test = count_vector.transform(X_test)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" | |
] | |
}, | |
"execution_count": 68, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from sklearn.naive_bayes import MultinomialNB\n", | |
"naive_bayes = MultinomialNB()\n", | |
"naive_bayes.fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"predictions = naive_bayes.predict(X_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Accuracy score: 0.9885139985642498\n", | |
"Precision score: 0.9720670391061452\n", | |
"Recall score: 0.9405405405405406\n", | |
"F1 score: 0.9560439560439562\n" | |
] | |
} | |
], | |
"source": [ | |
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", | |
"print('Accuracy score: ', format(accuracy_score(y_test, predictions)))\n", | |
"print('Precision score: ', format(precision_score(y_test, predictions)))\n", | |
"print('Recall score: ', format(recall_score(y_test, predictions)))\n", | |
"print('F1 score: ', format(f1_score(y_test, predictions)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 評価関数について\n", | |
"Accuracy score、Recallなどを見ていますが、実際にはどの評価関数を重要視するかを決定する必要があります。\n", | |
"例えば、Recallの場合、\n", | |
"\n", | |
"\n", | |
"recall = (スパムであると正しく判定できたメール数)/(迷惑メールの全体の数)\n", | |
"\n", | |
"\n", | |
"であるので、迷惑メールを取りこぼさず、より多く検出することに重きを置いた指標だといえます。\n", | |
"\n", | |
"\n", | |
"precision = (スパムと判定したメールのうち実際にスパムであった数)/(フィルターがスパムと判定したメール数)\n", | |
"\n", | |
"\n", | |
"逆にPrecisionは、迷惑メールと判定したメールが実際に迷惑メールであるかということに重きを置いています。\n", | |
"迷惑メールフィルターの場合、スパムでないメールをスパムと誤って判定してしまうと必要なメールを見落とす原因になるため、Precisionを重要視する、といった評価関数の使い分けが必要です。\n" | |
] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment