Skip to content

Instantly share code, notes, and snippets.

@lasershow
Last active September 4, 2017 08:29
Show Gist options
  • Save lasershow/13145277a48f9aebc7ccfbc25a1b251f to your computer and use it in GitHub Desktop.
Save lasershow/13145277a48f9aebc7ccfbc25a1b251f to your computer and use it in GitHub Desktop.
迷惑メール 回答例
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Load the SMS Spam Collection file. read_table splits on tabs by default, and\n",
"# passing `names` labels the two columns and makes the first line count as data.\n",
"df = pd.read_table('smsspamcollection/SMSSpamCollection', names=['label', 'message'])"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>message</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ham</td>\n",
" <td>Go until jurong point, crazy.. Available only ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ham</td>\n",
" <td>Ok lar... Joking wif u oni...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>spam</td>\n",
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ham</td>\n",
" <td>U dun say so early hor... U c already then say...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ham</td>\n",
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" label message\n",
"0 ham Go until jurong point, crazy.. Available only ...\n",
"1 ham Ok lar... Joking wif u oni...\n",
"2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n",
"3 ham U dun say so early hor... U c already then say...\n",
"4 ham Nah I don't think he goes to usf, he lives aro..."
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 5572 entries, 0 to 5571\n",
"Data columns (total 2 columns):\n",
"label 5572 non-null object\n",
"message 5572 non-null object\n",
"dtypes: object(2)\n",
"memory usage: 87.1+ KB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(5572, 2)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>message</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Go until jurong point, crazy.. Available only ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>Ok lar... Joking wif u oni...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>U dun say so early hor... U c already then say...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" label message\n",
"0 0 Go until jurong point, crazy.. Available only ...\n",
"1 0 Ok lar... Joking wif u oni...\n",
"2 1 Free entry in 2 a wkly comp to win FA Cup fina...\n",
"3 0 U dun say so early hor... U c already then say...\n",
"4 0 Nah I don't think he goes to usf, he lives aro..."
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Encode the class labels as integers: ham -> 0, spam -> 1.\n",
"# Series.map turns every value missing from the mapping into NaN, so re-running\n",
"# this cell on already-encoded labels would wipe the column. Map only while the\n",
"# column still contains the raw 'ham'/'spam' strings.\n",
"label_codes = {'ham': 0, 'spam': 1}\n",
"if df['label'].isin(list(label_codes)).any():\n",
"    df['label'] = df['label'].map(label_codes)\n",
"print(df.shape)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"count_vec_sample = CountVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"messages = ['Thank you for calling.',\n",
" 'Thank you for your inquiry.',\n",
" 'Thanks for keeping in touch.',\n",
" 'Thanks for getting in touch with me?']"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
" tokenizer=None, vocabulary=None)"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count_vec_sample.fit(messages)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
"source": [
"data = count_vec_sample.transform(messages)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],\n",
" [0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1],\n",
" [0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0],\n",
" [0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0]])"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.todense() "
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"count_vec = CountVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
" tokenizer=None, vocabulary=None)"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Learn the vocabulary from every message in the dataset.\n",
"count_vec.fit(df['message'])\n",
"\n",
"# Uncomment the line below to display the learned vocabulary.\n",
"# count_vec.vocabulary_"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Split messages and labels into train and test parts; random_state pins the shuffle.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    df['message'], df['label'], random_state=1\n",
")\n",
"\n",
"# Learn the vocabulary from the training messages only and convert them to a\n",
"# document-term count matrix in the same call.\n",
"count_vector = CountVectorizer()\n",
"X_train = count_vector.fit_transform(X_train)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"# count_vector.vocabulary_"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_test = count_vector.transform(X_test)\n"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"naive_bayes = MultinomialNB()\n",
"naive_bayes.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"predictions = naive_bayes.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy score: 0.9885139985642498\n",
"Precision score: 0.9720670391061452\n",
"Recall score: 0.9405405405405406\n",
"F1 score: 0.9560439560439562\n"
]
}
],
"source": [
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
"\n",
"# Score the predictions against y_test with each metric, one printed line per metric.\n",
"metric_table = [\n",
"    ('Accuracy score: ', accuracy_score),\n",
"    ('Precision score: ', precision_score),\n",
"    ('Recall score: ', recall_score),\n",
"    ('F1 score: ', f1_score),\n",
"]\n",
"for heading, metric in metric_table:\n",
"    print(heading, format(metric(y_test, predictions)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 評価関数について\n",
"Accuracy score、Recallなどを見ていますが、実際にはどの評価関数を重要視するかを決定する必要があります。\n",
"例えば、Recallの場合、\n",
"\n",
"\n",
"recall = (スパムであると正しく判定できたメール数)/(実際のスパムメールの総数)\n",
"\n",
"\n",
"であるので、迷惑メールをできるだけ漏れなく検出することに重きを置いた指標だといえます。\n",
"\n",
"\n",
"precision = (スパムと判定したメールのうち実際にスパムであった数)/(フィルターがスパムと判定したメール数)\n",
"\n",
"\n",
"逆にPrecisionは、スパムと判定したメールが本当にスパムであるか(誤検出の少なさ)に重きを置いた指標です。\n",
"迷惑メールフィルターの場合、スパムでないメールをスパムと誤判定すると必要なメールが届かなくなってしまうため、Precisionを重要視するなど、目的に応じた評価関数の使い分けが必要です。\n"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment