迷惑メール 回答例
"import pandas as pd\n",
"# Read the SMS corpus with read_table's default tab separator, naming the two columns explicitly.\n",
"df = pd.read_table('smsspamcollection/SMSSpamCollection',\n",
"                   names=['label', 'message'])"
" <tr>\n",
" <th>0</th>\n",
" <td>ham</td>\n",
" <td>Go until jurong point, crazy.. Available only ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ham</td>\n",
" <td>Ok lar... Joking wif u oni...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>spam</td>\n",
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ham</td>\n",
" <td>U dun say so early hor... U c already then say...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ham</td>\n",
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
" </tr>\n",
" </tbody>\n",
"text/plain": [
" label message\n",
"0 ham Go until jurong point, crazy.. Available only ...\n",
"1 ham Ok lar... Joking wif u oni...\n",
"2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n",
"3 ham U dun say so early hor... U c already then say...\n",
"4 ham Nah I don't think he goes to usf, he lives aro..."
"source": [
"outputs": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 5572 entries, 0 to 5571\n",
"Data columns (total 2 columns):\n",
"label 5572 non-null object\n",
"message 5572 non-null object\n",
"dtypes: object(2)\n",
"memory usage: 87.1+ KB\n"
"source": [
"name": "stdout",
"output_type": "stream",
"text": [
"(5572, 2)\n"
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Go until jurong point, crazy.. Available only ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>Ok lar... Joking wif u oni...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>U dun say so early hor... U c already then say...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
" </tr>\n",
" </tbody>\n",
"text/plain": [
" label message\n",
"0 0 Go until jurong point, crazy.. Available only ...\n",
"1 0 Ok lar... Joking wif u oni...\n",
"2 1 Free entry in 2 a wkly comp to win FA Cup fina...\n",
"3 0 U dun say so early hor... U c already then say...\n",
"4 0 Nah I don't think he goes to usf, he lives aro..."
"source": [
"# Encode the text labels as integers (ham -> 0, spam -> 1) for the classifier and metrics.\n",
"# Series.map turns values missing from the dict into NaN, so run this cell only once per load.\n",
"df['label'] = df['label'].map({'ham': 0, 'spam': 1})\n",
"df.head()"
"from sklearn.feature_extraction.text import CountVectorizer\n",
"count_vec_sample = CountVectorizer()"
"source": [
"messages = ['Thank you for calling.',\n",
" 'Thank you for your inquiry.',\n",
" 'Thanks for keeping in touch.',\n",
" 'Thanks for getting in touch with me?']"
"data": {
"text/plain": [
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
" tokenizer=None, vocabulary=None)"
"data = count_vec_sample.transform(messages)"
"data": {
"text/plain": [
"matrix([[1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],\n",
" [0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1],\n",
" [0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0],\n",
" [0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0]])"
"data.todense() "
"count_vec = CountVectorizer()"
"data": {
"text/plain": [
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
" tokenizer=None, vocabulary=None)"
"from sklearn.model_selection import train_test_split\n",
"# Split messages and labels into train and test sets; random_state=1 keeps the split reproducible.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    df['message'],\n",
"    df['label'],\n",
"    random_state=1,\n",
")\n",
"count_vector = CountVectorizer()\n",
"# Learn the vocabulary from the training messages only, then vectorise them;\n",
"# calling transform() on an unfitted vectorizer raises NotFittedError.\n",
"X_train = count_vector.fit_transform(X_train)"
"X_test = count_vector.transform(X_test)\n"
"data": {
"text/plain": [
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)"
"from sklearn.naive_bayes import MultinomialNB\n",
"naive_bayes = MultinomialNB()\n",
"# Train on the vectorised training messages and their 0/1 labels.\n",
"naive_bayes.fit(X_train, y_train)"
"predictions = naive_bayes.predict(X_test)"
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy score: 0.9885139985642498\n",
"Precision score: 0.9720670391061452\n",
"Recall score: 0.9405405405405406\n",
"F1 score: 0.9560439560439562\n"
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
"# Report each metric for the test-set predictions, one line per score.\n",
"for label, metric in (('Accuracy score: ', accuracy_score),\n",
"                      ('Precision score: ', precision_score),\n",
"                      ('Recall score: ', recall_score),\n",
"                      ('F1 score: ', f1_score)):\n",
"    print(label, format(metric(y_test, predictions)))"
"## 評価関数について\n",
"Accuracy score、Recallなどを見ていますが、実際にはどの評価関数を重要視するかを決定する必要があります。\n",
"recall = (スパムであると正しく判定できたメール数)/(実際のスパムメールの総数)\n",
"precision = (スパムと判定したメールのうち実際にスパムであった数)/(フィルターがスパムと判定したメール数)\n",
