Skip to content

Instantly share code, notes, and snippets.

@riow1983
Created September 18, 2017 12:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save riow1983/ef54194aeb7548ff9a8715358dee8eb6 to your computer and use it in GitHub Desktop.
Save riow1983/ef54194aeb7548ff9a8715358dee8eb6 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 109,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.naive_bayes import GaussianNB\n",
"gnb = GaussianNB()"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn import preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.DataFrame({\"患者氏名\":[\"Adam Smith\",\n",
"\"Napoleon Bonaparte\",\n",
"\"Adolf Hitler\",\n",
"\"Gabriel Lippmann\",\n",
"\"トーマス ベイズ\",\n",
"\"カール ハイド\",\n",
"\"マーク ザッカーバーグ\",\n",
"\"リー クワンユー\",\n",
"\"湯川 秀樹\",\n",
"\"朝永 振一郎\",\n",
"\"小林 誠\",\n",
"\"益川 敏英\",\n",
"\"毛 沢東\",\n",
"\"習 近平\",\n",
"\"金 日成\",\n",
"\"江 沢民\"]})"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df[\"患者姓\"] = df[\"患者氏名\"].apply(lambda x: x.split(\" \")[0])\n",
"df[\"患者名\"] = df[\"患者氏名\"].apply(lambda x: x.split(\" \")[1])"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" <th>患者姓</th>\n",
" <th>患者名</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Adam Smith</td>\n",
" <td>Adam</td>\n",
" <td>Smith</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Napoleon Bonaparte</td>\n",
" <td>Napoleon</td>\n",
" <td>Bonaparte</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Adolf Hitler</td>\n",
" <td>Adolf</td>\n",
" <td>Hitler</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Gabriel Lippmann</td>\n",
" <td>Gabriel</td>\n",
" <td>Lippmann</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>トーマス ベイズ</td>\n",
" <td>トーマス</td>\n",
" <td>ベイズ</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>カール ハイド</td>\n",
" <td>カール</td>\n",
" <td>ハイド</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>マーク ザッカーバーグ</td>\n",
" <td>マーク</td>\n",
" <td>ザッカーバーグ</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>リー クワンユー</td>\n",
" <td>リー</td>\n",
" <td>クワンユー</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>湯川 秀樹</td>\n",
" <td>湯川</td>\n",
" <td>秀樹</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>朝永 振一郎</td>\n",
" <td>朝永</td>\n",
" <td>振一郎</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>小林 誠</td>\n",
" <td>小林</td>\n",
" <td>誠</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>益川 敏英</td>\n",
" <td>益川</td>\n",
" <td>敏英</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>毛 沢東</td>\n",
" <td>毛</td>\n",
" <td>沢東</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>習 近平</td>\n",
" <td>習</td>\n",
" <td>近平</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>金 日成</td>\n",
" <td>金</td>\n",
" <td>日成</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>江 沢民</td>\n",
" <td>江</td>\n",
" <td>沢民</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名 患者姓 患者名\n",
"0 Adam Smith Adam Smith\n",
"1 Napoleon Bonaparte Napoleon Bonaparte\n",
"2 Adolf Hitler Adolf Hitler\n",
"3 Gabriel Lippmann Gabriel Lippmann\n",
"4 トーマス ベイズ トーマス ベイズ\n",
"5 カール ハイド カール ハイド\n",
"6 マーク ザッカーバーグ マーク ザッカーバーグ\n",
"7 リー クワンユー リー クワンユー\n",
"8 湯川 秀樹 湯川 秀樹\n",
"9 朝永 振一郎 朝永 振一郎\n",
"10 小林 誠 小林 誠\n",
"11 益川 敏英 益川 敏英\n",
"12 毛 沢東 毛 沢東\n",
"13 習 近平 習 近平\n",
"14 金 日成 金 日成\n",
"15 江 沢民 江 沢民"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"cv = CountVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"counts = cv.fit_transform(df[\"患者氏名\"].values)"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"counts_array = counts.toarray()"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#df[\"患者氏名_array\"] = df[\"患者氏名_cv\"].apply(lambda x: x.toarray())"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df[\"患者氏名文字数\"] = df[\"患者氏名\"].apply(lambda x: len(x))\n",
"df[\"患者姓文字数\"] = df[\"患者姓\"].apply(lambda x: len(x))\n",
"df[\"患者名文字数\"] = df[\"患者名\"].apply(lambda x: len(x))"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" <th>患者姓</th>\n",
" <th>患者名</th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Adam Smith</td>\n",
" <td>Adam</td>\n",
" <td>Smith</td>\n",
" <td>10</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Napoleon Bonaparte</td>\n",
" <td>Napoleon</td>\n",
" <td>Bonaparte</td>\n",
" <td>18</td>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Adolf Hitler</td>\n",
" <td>Adolf</td>\n",
" <td>Hitler</td>\n",
" <td>12</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Gabriel Lippmann</td>\n",
" <td>Gabriel</td>\n",
" <td>Lippmann</td>\n",
" <td>16</td>\n",
" <td>7</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>トーマス ベイズ</td>\n",
" <td>トーマス</td>\n",
" <td>ベイズ</td>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>カール ハイド</td>\n",
" <td>カール</td>\n",
" <td>ハイド</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>マーク ザッカーバーグ</td>\n",
" <td>マーク</td>\n",
" <td>ザッカーバーグ</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>リー クワンユー</td>\n",
" <td>リー</td>\n",
" <td>クワンユー</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>湯川 秀樹</td>\n",
" <td>湯川</td>\n",
" <td>秀樹</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>朝永 振一郎</td>\n",
" <td>朝永</td>\n",
" <td>振一郎</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>小林 誠</td>\n",
" <td>小林</td>\n",
" <td>誠</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>益川 敏英</td>\n",
" <td>益川</td>\n",
" <td>敏英</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>毛 沢東</td>\n",
" <td>毛</td>\n",
" <td>沢東</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>習 近平</td>\n",
" <td>習</td>\n",
" <td>近平</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>金 日成</td>\n",
" <td>金</td>\n",
" <td>日成</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>江 沢民</td>\n",
" <td>江</td>\n",
" <td>沢民</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名 患者姓 患者名 患者氏名文字数 患者姓文字数 患者名文字数\n",
"0 Adam Smith Adam Smith 10 4 5\n",
"1 Napoleon Bonaparte Napoleon Bonaparte 18 8 9\n",
"2 Adolf Hitler Adolf Hitler 12 5 6\n",
"3 Gabriel Lippmann Gabriel Lippmann 16 7 8\n",
"4 トーマス ベイズ トーマス ベイズ 8 4 3\n",
"5 カール ハイド カール ハイド 7 3 3\n",
"6 マーク ザッカーバーグ マーク ザッカーバーグ 11 3 7\n",
"7 リー クワンユー リー クワンユー 8 2 5\n",
"8 湯川 秀樹 湯川 秀樹 5 2 2\n",
"9 朝永 振一郎 朝永 振一郎 6 2 3\n",
"10 小林 誠 小林 誠 4 2 1\n",
"11 益川 敏英 益川 敏英 5 2 2\n",
"12 毛 沢東 毛 沢東 4 1 2\n",
"13 習 近平 習 近平 4 1 2\n",
"14 金 日成 金 日成 4 1 2\n",
"15 江 沢民 江 沢民 4 1 2"
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df[\"判定\"] = \"\""
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.loc[:4, \"判定\"] = \"アルファベット外国人\"\n",
"df.loc[4:8, \"判定\"] = \"カタカナ外国人\"\n",
"df.loc[8:12, \"判定\"] = \"日本人\"\n",
"df.loc[12:, \"判定\"] = \"漢字外国人\""
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" <th>患者姓</th>\n",
" <th>患者名</th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" <th>判定</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Adam Smith</td>\n",
" <td>Adam</td>\n",
" <td>Smith</td>\n",
" <td>10</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>アルファベット外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Napoleon Bonaparte</td>\n",
" <td>Napoleon</td>\n",
" <td>Bonaparte</td>\n",
" <td>18</td>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>アルファベット外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Adolf Hitler</td>\n",
" <td>Adolf</td>\n",
" <td>Hitler</td>\n",
" <td>12</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>アルファベット外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Gabriel Lippmann</td>\n",
" <td>Gabriel</td>\n",
" <td>Lippmann</td>\n",
" <td>16</td>\n",
" <td>7</td>\n",
" <td>8</td>\n",
" <td>アルファベット外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>トーマス ベイズ</td>\n",
" <td>トーマス</td>\n",
" <td>ベイズ</td>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>カタカナ外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>カール ハイド</td>\n",
" <td>カール</td>\n",
" <td>ハイド</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>カタカナ外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>マーク ザッカーバーグ</td>\n",
" <td>マーク</td>\n",
" <td>ザッカーバーグ</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>カタカナ外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>リー クワンユー</td>\n",
" <td>リー</td>\n",
" <td>クワンユー</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>カタカナ外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>湯川 秀樹</td>\n",
" <td>湯川</td>\n",
" <td>秀樹</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>朝永 振一郎</td>\n",
" <td>朝永</td>\n",
" <td>振一郎</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>小林 誠</td>\n",
" <td>小林</td>\n",
" <td>誠</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>益川 敏英</td>\n",
" <td>益川</td>\n",
" <td>敏英</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>毛 沢東</td>\n",
" <td>毛</td>\n",
" <td>沢東</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>習 近平</td>\n",
" <td>習</td>\n",
" <td>近平</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>金 日成</td>\n",
" <td>金</td>\n",
" <td>日成</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>江 沢民</td>\n",
" <td>江</td>\n",
" <td>沢民</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名 患者姓 患者名 患者氏名文字数 患者姓文字数 患者名文字数 \\\n",
"0 Adam Smith Adam Smith 10 4 5 \n",
"1 Napoleon Bonaparte Napoleon Bonaparte 18 8 9 \n",
"2 Adolf Hitler Adolf Hitler 12 5 6 \n",
"3 Gabriel Lippmann Gabriel Lippmann 16 7 8 \n",
"4 トーマス ベイズ トーマス ベイズ 8 4 3 \n",
"5 カール ハイド カール ハイド 7 3 3 \n",
"6 マーク ザッカーバーグ マーク ザッカーバーグ 11 3 7 \n",
"7 リー クワンユー リー クワンユー 8 2 5 \n",
"8 湯川 秀樹 湯川 秀樹 5 2 2 \n",
"9 朝永 振一郎 朝永 振一郎 6 2 3 \n",
"10 小林 誠 小林 誠 4 2 1 \n",
"11 益川 敏英 益川 敏英 5 2 2 \n",
"12 毛 沢東 毛 沢東 4 1 2 \n",
"13 習 近平 習 近平 4 1 2 \n",
"14 金 日成 金 日成 4 1 2 \n",
"15 江 沢民 江 沢民 4 1 2 \n",
"\n",
" 判定 \n",
"0 アルファベット外国人 \n",
"1 アルファベット外国人 \n",
"2 アルファベット外国人 \n",
"3 アルファベット外国人 \n",
"4 カタカナ外国人 \n",
"5 カタカナ外国人 \n",
"6 カタカナ外国人 \n",
"7 カタカナ外国人 \n",
"8 日本人 \n",
"9 日本人 \n",
"10 日本人 \n",
"11 日本人 \n",
"12 漢字外国人 \n",
"13 漢字外国人 \n",
"14 漢字外国人 \n",
"15 漢字外国人 "
]
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"le = preprocessing.LabelEncoder()"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#df[\"患者氏名_cat\"] = le.fit_transform(df[\"患者氏名\"])\n",
"#df[\"患者姓_cat\"] = le.fit_transform(df[\"患者姓\"])\n",
"#df[\"患者名_cat\"] = le.fit_transform(df[\"患者名\"])\n",
"df[\"判定_cat\"] = le.fit_transform(df[\"判定\"])"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" <th>患者姓</th>\n",
" <th>患者名</th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" <th>判定</th>\n",
" <th>判定_cat</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Adam Smith</td>\n",
" <td>Adam</td>\n",
" <td>Smith</td>\n",
" <td>10</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>アルファベット外国人</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Napoleon Bonaparte</td>\n",
" <td>Napoleon</td>\n",
" <td>Bonaparte</td>\n",
" <td>18</td>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>アルファベット外国人</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Adolf Hitler</td>\n",
" <td>Adolf</td>\n",
" <td>Hitler</td>\n",
" <td>12</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>アルファベット外国人</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Gabriel Lippmann</td>\n",
" <td>Gabriel</td>\n",
" <td>Lippmann</td>\n",
" <td>16</td>\n",
" <td>7</td>\n",
" <td>8</td>\n",
" <td>アルファベット外国人</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>トーマス ベイズ</td>\n",
" <td>トーマス</td>\n",
" <td>ベイズ</td>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>カタカナ外国人</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>カール ハイド</td>\n",
" <td>カール</td>\n",
" <td>ハイド</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>カタカナ外国人</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>マーク ザッカーバーグ</td>\n",
" <td>マーク</td>\n",
" <td>ザッカーバーグ</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>カタカナ外国人</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>リー クワンユー</td>\n",
" <td>リー</td>\n",
" <td>クワンユー</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>カタカナ外国人</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>湯川 秀樹</td>\n",
" <td>湯川</td>\n",
" <td>秀樹</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>朝永 振一郎</td>\n",
" <td>朝永</td>\n",
" <td>振一郎</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>日本人</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>小林 誠</td>\n",
" <td>小林</td>\n",
" <td>誠</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>日本人</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>益川 敏英</td>\n",
" <td>益川</td>\n",
" <td>敏英</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>毛 沢東</td>\n",
" <td>毛</td>\n",
" <td>沢東</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>習 近平</td>\n",
" <td>習</td>\n",
" <td>近平</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>金 日成</td>\n",
" <td>金</td>\n",
" <td>日成</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>江 沢民</td>\n",
" <td>江</td>\n",
" <td>沢民</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名 患者姓 患者名 患者氏名文字数 患者姓文字数 患者名文字数 \\\n",
"0 Adam Smith Adam Smith 10 4 5 \n",
"1 Napoleon Bonaparte Napoleon Bonaparte 18 8 9 \n",
"2 Adolf Hitler Adolf Hitler 12 5 6 \n",
"3 Gabriel Lippmann Gabriel Lippmann 16 7 8 \n",
"4 トーマス ベイズ トーマス ベイズ 8 4 3 \n",
"5 カール ハイド カール ハイド 7 3 3 \n",
"6 マーク ザッカーバーグ マーク ザッカーバーグ 11 3 7 \n",
"7 リー クワンユー リー クワンユー 8 2 5 \n",
"8 湯川 秀樹 湯川 秀樹 5 2 2 \n",
"9 朝永 振一郎 朝永 振一郎 6 2 3 \n",
"10 小林 誠 小林 誠 4 2 1 \n",
"11 益川 敏英 益川 敏英 5 2 2 \n",
"12 毛 沢東 毛 沢東 4 1 2 \n",
"13 習 近平 習 近平 4 1 2 \n",
"14 金 日成 金 日成 4 1 2 \n",
"15 江 沢民 江 沢民 4 1 2 \n",
"\n",
" 判定 判定_cat \n",
"0 アルファベット外国人 0 \n",
"1 アルファベット外国人 0 \n",
"2 アルファベット外国人 0 \n",
"3 アルファベット外国人 0 \n",
"4 カタカナ外国人 1 \n",
"5 カタカナ外国人 1 \n",
"6 カタカナ外国人 1 \n",
"7 カタカナ外国人 1 \n",
"8 日本人 2 \n",
"9 日本人 2 \n",
"10 日本人 2 \n",
"11 日本人 2 \n",
"12 漢字外国人 3 \n",
"13 漢字外国人 3 \n",
"14 漢字外国人 3 \n",
"15 漢字外国人 3 "
]
},
"execution_count": 128,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = df[df.columns[df.columns.str.contains(\"cat\")|\n",
" df.columns.str.contains(\"文字数\")|\n",
" df.columns.str.contains(\"array\")]]"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" <th>判定_cat</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>18</td>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>12</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>16</td>\n",
" <td>7</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名文字数 患者姓文字数 患者名文字数 判定_cat\n",
"0 10 4 5 0\n",
"1 18 8 9 0\n",
"2 12 5 6 0\n",
"3 16 7 8 0\n",
"4 8 4 3 1\n",
"5 7 3 3 1\n",
"6 11 3 7 1\n",
"7 8 2 5 1\n",
"8 5 2 2 2\n",
"9 6 2 3 2\n",
"10 4 2 1 2\n",
"11 5 2 2 2\n",
"12 4 1 2 3\n",
"13 4 1 2 3\n",
"14 4 1 2 3\n",
"15 4 1 2 3"
]
},
"execution_count": 130,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"tmp = df.iloc[:, :-1].values"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[10, 4, 5],\n",
" [18, 8, 9],\n",
" [12, 5, 6],\n",
" [16, 7, 8],\n",
" [ 8, 4, 3],\n",
" [ 7, 3, 3],\n",
" [11, 3, 7],\n",
" [ 8, 2, 5],\n",
" [ 5, 2, 2],\n",
" [ 6, 2, 3],\n",
" [ 4, 2, 1],\n",
" [ 5, 2, 2],\n",
" [ 4, 1, 2],\n",
" [ 4, 1, 2],\n",
" [ 4, 1, 2],\n",
" [ 4, 1, 2]])"
]
},
"execution_count": 164,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tmp"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X = np.array([np.append(x,y) for x,y in zip(counts_array, tmp)])"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#X = np.append(df.iloc[:, :-1].values, counts_array)\n",
"y = df.iloc[:, -1].values"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 4, 5],\n",
" [ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 8, 9],\n",
" [ 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 5, 6],\n",
" [ 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 7, 8],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 4, 3],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 3, 3],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 3, 7],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 2, 5],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 5, 2, 2],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 6, 2, 3],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 1],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 5, 2, 2],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 1, 2],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 1, 2],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 4, 1, 2],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 1, 2]], dtype=int64)"
]
},
"execution_count": 168,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3])"
]
},
"execution_count": 169,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"GaussianNB(priors=None)"
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnb.fit(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"tf = pd.DataFrame({\"患者氏名\": [\n",
"\"Tom Hanks\",\n",
"\"Robert De Niro\",\n",
"\"Gen Hoshino\",\n",
"\"金 正男\",\n",
"\"朴 璐美\",\n",
"\"李 小龍\",\n",
"\"林 彪\",\n",
"\"古歩道 ベンジャミン\",\n",
"\"キム イルソン\",\n",
"\"山下 奉文\",\n",
"\"宮沢 賢治\",\n",
"\"徳川 家康\",\n",
"\"井浦 新\",\n",
"\"窪塚 洋介\",\n",
"\"伊藤 博文\",\n",
"\"近衛 文麿\"\n",
"]\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Tom Hanks</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Robert De Niro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Gen Hoshino</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>金 正男</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>朴 璐美</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>李 小龍</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>林 彪</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>古歩道 ベンジャミン</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>キム イルソン</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>山下 奉文</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>宮沢 賢治</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>徳川 家康</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>井浦 新</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>窪塚 洋介</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>伊藤 博文</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>近衛 文麿</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名\n",
"0 Tom Hanks\n",
"1 Robert De Niro\n",
"2 Gen Hoshino\n",
"3 金 正男\n",
"4 朴 璐美\n",
"5 李 小龍\n",
"6 林 彪\n",
"7 古歩道 ベンジャミン\n",
"8 キム イルソン\n",
"9 山下 奉文\n",
"10 宮沢 賢治\n",
"11 徳川 家康\n",
"12 井浦 新\n",
"13 窪塚 洋介\n",
"14 伊藤 博文\n",
"15 近衛 文麿"
]
},
"execution_count": 172,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tf"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"counts2 = cv.fit_transform(tf[\"患者氏名\"].values)\n",
"counts2_array = counts2.toarray()"
]
},
{
"cell_type": "code",
"execution_count": 174,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0],\n",
" [1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0],\n",
" [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,\n",
" 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 1, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,\n",
" 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 1, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,\n",
" 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 1, 0, 1, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,\n",
" 0, 0, 0, 0, 1]], dtype=int64)"
]
},
"execution_count": 174,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"counts2_array"
]
},
{
"cell_type": "code",
"execution_count": 175,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"tf[\"患者姓\"] = tf[\"患者氏名\"].apply(lambda x: x.split(\" \")[0])\n",
"tf[\"患者名\"] = tf[\"患者氏名\"].apply(lambda x: x.split(\" \")[1])\n",
"\n",
"tf[\"患者氏名文字数\"] = tf[\"患者氏名\"].apply(lambda x: len(x))\n",
"tf[\"患者姓文字数\"] = tf[\"患者姓\"].apply(lambda x: len(x))\n",
"tf[\"患者名文字数\"] = tf[\"患者名\"].apply(lambda x: len(x))"
]
},
{
"cell_type": "code",
"execution_count": 176,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" <th>患者姓</th>\n",
" <th>患者名</th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Tom Hanks</td>\n",
" <td>Tom</td>\n",
" <td>Hanks</td>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Robert De Niro</td>\n",
" <td>Robert</td>\n",
" <td>De</td>\n",
" <td>14</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Gen Hoshino</td>\n",
" <td>Gen</td>\n",
" <td>Hoshino</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>金 正男</td>\n",
" <td>金</td>\n",
" <td>正男</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>朴 璐美</td>\n",
" <td>朴</td>\n",
" <td>璐美</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>李 小龍</td>\n",
" <td>李</td>\n",
" <td>小龍</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>林 彪</td>\n",
" <td>林</td>\n",
" <td>彪</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>古歩道 ベンジャミン</td>\n",
" <td>古歩道</td>\n",
" <td>ベンジャミン</td>\n",
" <td>10</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>キム イルソン</td>\n",
" <td>キム</td>\n",
" <td>イルソン</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>山下 奉文</td>\n",
" <td>山下</td>\n",
" <td>奉文</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>宮沢 賢治</td>\n",
" <td>宮沢</td>\n",
" <td>賢治</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>徳川 家康</td>\n",
" <td>徳川</td>\n",
" <td>家康</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>井浦 新</td>\n",
" <td>井浦</td>\n",
" <td>新</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>窪塚 洋介</td>\n",
" <td>窪塚</td>\n",
" <td>洋介</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>伊藤 博文</td>\n",
" <td>伊藤</td>\n",
" <td>博文</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>近衛 文麿</td>\n",
" <td>近衛</td>\n",
" <td>文麿</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名 患者姓 患者名 患者氏名文字数 患者姓文字数 患者名文字数\n",
"0 Tom Hanks Tom Hanks 9 3 5\n",
"1 Robert De Niro Robert De 14 6 2\n",
"2 Gen Hoshino Gen Hoshino 11 3 7\n",
"3 金 正男 金 正男 4 1 2\n",
"4 朴 璐美 朴 璐美 4 1 2\n",
"5 李 小龍 李 小龍 4 1 2\n",
"6 林 彪 林 彪 3 1 1\n",
"7 古歩道 ベンジャミン 古歩道 ベンジャミン 10 3 6\n",
"8 キム イルソン キム イルソン 7 2 4\n",
"9 山下 奉文 山下 奉文 5 2 2\n",
"10 宮沢 賢治 宮沢 賢治 5 2 2\n",
"11 徳川 家康 徳川 家康 5 2 2\n",
"12 井浦 新 井浦 新 4 2 1\n",
"13 窪塚 洋介 窪塚 洋介 5 2 2\n",
"14 伊藤 博文 伊藤 博文 5 2 2\n",
"15 近衛 文麿 近衛 文麿 5 2 2"
]
},
"execution_count": 176,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tf"
]
},
{
"cell_type": "code",
"execution_count": 177,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"tf.loc[:3, \"判定\"] = \"アルファベット外国人\"\n",
"tf.loc[3:7, \"判定\"] = \"漢字外国人\"\n",
"tf.loc[7:9, \"判定\"] = \"カタカナ外国人\"\n",
"tf.loc[9:, \"判定\"] = \"日本人\""
]
},
{
"cell_type": "code",
"execution_count": 178,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" <th>患者姓</th>\n",
" <th>患者名</th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" <th>判定</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Tom Hanks</td>\n",
" <td>Tom</td>\n",
" <td>Hanks</td>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>アルファベット外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Robert De Niro</td>\n",
" <td>Robert</td>\n",
" <td>De</td>\n",
" <td>14</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>アルファベット外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Gen Hoshino</td>\n",
" <td>Gen</td>\n",
" <td>Hoshino</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>アルファベット外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>金 正男</td>\n",
" <td>金</td>\n",
" <td>正男</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>朴 璐美</td>\n",
" <td>朴</td>\n",
" <td>璐美</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>李 小龍</td>\n",
" <td>李</td>\n",
" <td>小龍</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>林 彪</td>\n",
" <td>林</td>\n",
" <td>彪</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>古歩道 ベンジャミン</td>\n",
" <td>古歩道</td>\n",
" <td>ベンジャミン</td>\n",
" <td>10</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>カタカナ外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>キム イルソン</td>\n",
" <td>キム</td>\n",
" <td>イルソン</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>カタカナ外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>山下 奉文</td>\n",
" <td>山下</td>\n",
" <td>奉文</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>宮沢 賢治</td>\n",
" <td>宮沢</td>\n",
" <td>賢治</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>徳川 家康</td>\n",
" <td>徳川</td>\n",
" <td>家康</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>井浦 新</td>\n",
" <td>井浦</td>\n",
" <td>新</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>窪塚 洋介</td>\n",
" <td>窪塚</td>\n",
" <td>洋介</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>伊藤 博文</td>\n",
" <td>伊藤</td>\n",
" <td>博文</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>近衛 文麿</td>\n",
" <td>近衛</td>\n",
" <td>文麿</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名 患者姓 患者名 患者氏名文字数 患者姓文字数 患者名文字数 判定\n",
"0 Tom Hanks Tom Hanks 9 3 5 アルファベット外国人\n",
"1 Robert De Niro Robert De 14 6 2 アルファベット外国人\n",
"2 Gen Hoshino Gen Hoshino 11 3 7 アルファベット外国人\n",
"3 金 正男 金 正男 4 1 2 漢字外国人\n",
"4 朴 璐美 朴 璐美 4 1 2 漢字外国人\n",
"5 李 小龍 李 小龍 4 1 2 漢字外国人\n",
"6 林 彪 林 彪 3 1 1 漢字外国人\n",
"7 古歩道 ベンジャミン 古歩道 ベンジャミン 10 3 6 カタカナ外国人\n",
"8 キム イルソン キム イルソン 7 2 4 カタカナ外国人\n",
"9 山下 奉文 山下 奉文 5 2 2 日本人\n",
"10 宮沢 賢治 宮沢 賢治 5 2 2 日本人\n",
"11 徳川 家康 徳川 家康 5 2 2 日本人\n",
"12 井浦 新 井浦 新 4 2 1 日本人\n",
"13 窪塚 洋介 窪塚 洋介 5 2 2 日本人\n",
"14 伊藤 博文 伊藤 博文 5 2 2 日本人\n",
"15 近衛 文麿 近衛 文麿 5 2 2 日本人"
]
},
"execution_count": 178,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tf"
]
},
{
"cell_type": "code",
"execution_count": 179,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#tf[\"患者氏名_cat\"] = le.fit_transform(tf[\"患者氏名\"])\n",
"#tf[\"患者姓_cat\"] = le.fit_transform(tf[\"患者姓\"])\n",
"#tf[\"患者名_cat\"] = le.fit_transform(tf[\"患者名\"])\n",
"tf[\"判定_cat\"] = le.fit_transform(tf[\"判定\"])"
]
},
{
"cell_type": "code",
"execution_count": 180,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" <th>患者姓</th>\n",
" <th>患者名</th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" <th>判定</th>\n",
" <th>判定_cat</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Tom Hanks</td>\n",
" <td>Tom</td>\n",
" <td>Hanks</td>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>アルファベット外国人</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Robert De Niro</td>\n",
" <td>Robert</td>\n",
" <td>De</td>\n",
" <td>14</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>アルファベット外国人</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Gen Hoshino</td>\n",
" <td>Gen</td>\n",
" <td>Hoshino</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>アルファベット外国人</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>金 正男</td>\n",
" <td>金</td>\n",
" <td>正男</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>朴 璐美</td>\n",
" <td>朴</td>\n",
" <td>璐美</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>李 小龍</td>\n",
" <td>李</td>\n",
" <td>小龍</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>林 彪</td>\n",
" <td>林</td>\n",
" <td>彪</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>漢字外国人</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>古歩道 ベンジャミン</td>\n",
" <td>古歩道</td>\n",
" <td>ベンジャミン</td>\n",
" <td>10</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>カタカナ外国人</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>キム イルソン</td>\n",
" <td>キム</td>\n",
" <td>イルソン</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>カタカナ外国人</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>山下 奉文</td>\n",
" <td>山下</td>\n",
" <td>奉文</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>宮沢 賢治</td>\n",
" <td>宮沢</td>\n",
" <td>賢治</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>徳川 家康</td>\n",
" <td>徳川</td>\n",
" <td>家康</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>井浦 新</td>\n",
" <td>井浦</td>\n",
" <td>新</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>日本人</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>窪塚 洋介</td>\n",
" <td>窪塚</td>\n",
" <td>洋介</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>伊藤 博文</td>\n",
" <td>伊藤</td>\n",
" <td>博文</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>近衛 文麿</td>\n",
" <td>近衛</td>\n",
" <td>文麿</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名 患者姓 患者名 患者氏名文字数 患者姓文字数 患者名文字数 判定 \\\n",
"0 Tom Hanks Tom Hanks 9 3 5 アルファベット外国人 \n",
"1 Robert De Niro Robert De 14 6 2 アルファベット外国人 \n",
"2 Gen Hoshino Gen Hoshino 11 3 7 アルファベット外国人 \n",
"3 金 正男 金 正男 4 1 2 漢字外国人 \n",
"4 朴 璐美 朴 璐美 4 1 2 漢字外国人 \n",
"5 李 小龍 李 小龍 4 1 2 漢字外国人 \n",
"6 林 彪 林 彪 3 1 1 漢字外国人 \n",
"7 古歩道 ベンジャミン 古歩道 ベンジャミン 10 3 6 カタカナ外国人 \n",
"8 キム イルソン キム イルソン 7 2 4 カタカナ外国人 \n",
"9 山下 奉文 山下 奉文 5 2 2 日本人 \n",
"10 宮沢 賢治 宮沢 賢治 5 2 2 日本人 \n",
"11 徳川 家康 徳川 家康 5 2 2 日本人 \n",
"12 井浦 新 井浦 新 4 2 1 日本人 \n",
"13 窪塚 洋介 窪塚 洋介 5 2 2 日本人 \n",
"14 伊藤 博文 伊藤 博文 5 2 2 日本人 \n",
"15 近衛 文麿 近衛 文麿 5 2 2 日本人 \n",
"\n",
" 判定_cat \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 3 \n",
"4 3 \n",
"5 3 \n",
"6 3 \n",
"7 1 \n",
"8 1 \n",
"9 2 \n",
"10 2 \n",
"11 2 \n",
"12 2 \n",
"13 2 \n",
"14 2 \n",
"15 2 "
]
},
"execution_count": 180,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tf"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"tf = tf[tf.columns[tf.columns.str.contains(\"cat\")|\n",
" tf.columns.str.contains(\"文字数\")]]"
]
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" <th>判定_cat</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>14</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>10</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名文字数 患者姓文字数 患者名文字数 判定_cat\n",
"0 9 3 5 0\n",
"1 14 6 2 0\n",
"2 11 3 7 0\n",
"3 4 1 2 3\n",
"4 4 1 2 3\n",
"5 4 1 2 3\n",
"6 3 1 1 3\n",
"7 10 3 6 1\n",
"8 7 2 4 1\n",
"9 5 2 2 2\n",
"10 5 2 2 2\n",
"11 5 2 2 2\n",
"12 4 2 1 2\n",
"13 5 2 2 2\n",
"14 5 2 2 2\n",
"15 5 2 2 2"
]
},
"execution_count": 182,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tf"
]
},
{
"cell_type": "code",
"execution_count": 183,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tmp_test = tf.iloc[:, :-1].values"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 9, 3, 5],\n",
" [14, 6, 2],\n",
" [11, 3, 7],\n",
" [ 4, 1, 2],\n",
" [ 4, 1, 2],\n",
" [ 4, 1, 2],\n",
" [ 3, 1, 1],\n",
" [10, 3, 6],\n",
" [ 7, 2, 4],\n",
" [ 5, 2, 2],\n",
" [ 5, 2, 2],\n",
" [ 5, 2, 2],\n",
" [ 4, 2, 1],\n",
" [ 5, 2, 2],\n",
" [ 5, 2, 2],\n",
" [ 5, 2, 2]])"
]
},
"execution_count": 184,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tmp_test"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_test = np.array([np.append(x,y) for x,y in zip(counts2_array, tmp_test)])"
]
},
{
"cell_type": "code",
"execution_count": 186,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#X_test = tf.iloc[:, :-1].values\n",
"#X_test = counts2_array\n",
"y_test = tf.iloc[:, -1].values"
]
},
{
"cell_type": "code",
"execution_count": 187,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 3, 5],\n",
" [ 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 6, 2],\n",
" [ 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 3, 7],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 1, 2],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 1, 2],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 2],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 1],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 3, 6],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 2, 4],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,\n",
" 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 5, 2, 2],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 5, 2, 2],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,\n",
" 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 2, 2],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 1],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 5, 2, 2],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 2, 2],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 5, 2, 2]], dtype=int64)"
]
},
"execution_count": 187,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, 3, 3, 3, 3, 1, 1, 2, 2, 2, 2, 2, 2, 2])"
]
},
"execution_count": 188,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_test"
]
},
{
"cell_type": "code",
"execution_count": 189,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, 3, 3, 3, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2])"
]
},
"execution_count": 189,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnb.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 190,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.8125"
]
},
"execution_count": 190,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnb.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment