Skip to content

Instantly share code, notes, and snippets.

@riow1983
Created September 18, 2017 05:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save riow1983/3dae6f9f674e07998de93862009a8388 to your computer and use it in GitHub Desktop.
Save riow1983/3dae6f9f674e07998de93862009a8388 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from functools import reduce"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.naive_bayes import GaussianNB\n",
"gnb = GaussianNB()"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn import preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Training names in one column. Row order matters: a later cell labels rows by position,\n",
"# four per class (Latin-script foreign, katakana foreign, Japanese, kanji-script foreign).\n",
"df = pd.Series(\n",
"    [\n",
"        \"Adam Smith\",\n",
"        \"Napoleon Bonaparte\",\n",
"        \"Adolf Hitler\",\n",
"        \"Gabriel Lippmann\",\n",
"        \"トーマス ベイズ\",\n",
"        \"カール ハイド\",\n",
"        \"マーク ザッカーバーグ\",\n",
"        \"リー クワンユー\",\n",
"        \"湯川 秀樹\",\n",
"        \"朝永 振一郎\",\n",
"        \"小林 誠\",\n",
"        \"益川 敏英\",\n",
"        \"毛 沢東\",\n",
"        \"習 近平\",\n",
"        \"金 日成\",\n",
"        \"江 沢民\",\n",
"    ],\n",
"    name=\"患者氏名\",\n",
").to_frame()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Split each full name at the FIRST space only: the leading token goes to 患者姓 and the whole\n",
"# remainder goes to 患者名. Unlike x.split(\" \")[1], a multi-word remainder is kept intact and a\n",
"# name without any space yields an empty 患者名 instead of raising IndexError.\n",
"name_parts = df[\"患者氏名\"].str.partition(\" \")\n",
"df[\"患者姓\"] = name_parts[0]\n",
"df[\"患者名\"] = name_parts[2]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" <th>患者姓</th>\n",
" <th>患者名</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Adam Smith</td>\n",
" <td>Adam</td>\n",
" <td>Smith</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Napoleon Bonaparte</td>\n",
" <td>Napoleon</td>\n",
" <td>Bonaparte</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Adolf Hitler</td>\n",
" <td>Adolf</td>\n",
" <td>Hitler</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Gabriel Lippmann</td>\n",
" <td>Gabriel</td>\n",
" <td>Lippmann</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>トーマス ベイズ</td>\n",
" <td>トーマス</td>\n",
" <td>ベイズ</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>カール ハイド</td>\n",
" <td>カール</td>\n",
" <td>ハイド</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>マーク ザッカーバーグ</td>\n",
" <td>マーク</td>\n",
" <td>ザッカーバーグ</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>リー クワンユー</td>\n",
" <td>リー</td>\n",
" <td>クワンユー</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>湯川 秀樹</td>\n",
" <td>湯川</td>\n",
" <td>秀樹</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>朝永 振一郎</td>\n",
" <td>朝永</td>\n",
" <td>振一郎</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>小林 誠</td>\n",
" <td>小林</td>\n",
" <td>誠</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>益川 敏英</td>\n",
" <td>益川</td>\n",
" <td>敏英</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>毛 沢東</td>\n",
" <td>毛</td>\n",
" <td>沢東</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>習 近平</td>\n",
" <td>習</td>\n",
" <td>近平</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>金 日成</td>\n",
" <td>金</td>\n",
" <td>日成</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>江 沢民</td>\n",
" <td>江</td>\n",
" <td>沢民</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名 患者姓 患者名\n",
"0 Adam Smith Adam Smith\n",
"1 Napoleon Bonaparte Napoleon Bonaparte\n",
"2 Adolf Hitler Adolf Hitler\n",
"3 Gabriel Lippmann Gabriel Lippmann\n",
"4 トーマス ベイズ トーマス ベイズ\n",
"5 カール ハイド カール ハイド\n",
"6 マーク ザッカーバーグ マーク ザッカーバーグ\n",
"7 リー クワンユー リー クワンユー\n",
"8 湯川 秀樹 湯川 秀樹\n",
"9 朝永 振一郎 朝永 振一郎\n",
"10 小林 誠 小林 誠\n",
"11 益川 敏英 益川 敏英\n",
"12 毛 沢東 毛 沢東\n",
"13 習 近平 習 近平\n",
"14 金 日成 金 日成\n",
"15 江 沢民 江 沢民"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Add a \"<column>文字数\" character-count column for the full name (separating space included),\n",
"# the first token and the remainder, in that order.\n",
"for source_col in (\"患者氏名\", \"患者姓\", \"患者名\"):\n",
"    df[source_col + \"文字数\"] = df[source_col].map(len)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df[\"判定\"] = \"\""
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" from ipykernel import kernelapp as app\n",
"/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
"/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
"/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
]
}
],
"source": [
"# Label the training rows by position, four names per class. Each slice is copied before the\n",
"# column is assigned: writing to a bare slice such as df[:4] raises SettingWithCopyWarning\n",
"# because pandas cannot tell whether the write is meant to reach df.\n",
"df1 = df[:4].copy()\n",
"df1[\"判定\"] = \"アルファベット外国人\"\n",
"\n",
"df2 = df[4:8].copy()\n",
"df2[\"判定\"] = \"カタカナ外国人\"\n",
"\n",
"df3 = df[8:12].copy()\n",
"df3[\"判定\"] = \"日本人\"\n",
"\n",
"df4 = df[12:].copy()\n",
"df4[\"判定\"] = \"漢字外国人\"\n",
"\n",
"dflist = [df1, df2, df3, df4]\n",
"\n",
"# pd.concat takes the whole list at once, so no pairwise reduce() is needed.\n",
"dfc = pd.concat(dflist)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" <th>患者姓</th>\n",
" <th>患者名</th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" <th>判定</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Adam Smith</td>\n",
" <td>Adam</td>\n",
" <td>Smith</td>\n",
" <td>10</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>アルファベット外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Napoleon Bonaparte</td>\n",
" <td>Napoleon</td>\n",
" <td>Bonaparte</td>\n",
" <td>18</td>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>アルファベット外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Adolf Hitler</td>\n",
" <td>Adolf</td>\n",
" <td>Hitler</td>\n",
" <td>12</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>アルファベット外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Gabriel Lippmann</td>\n",
" <td>Gabriel</td>\n",
" <td>Lippmann</td>\n",
" <td>16</td>\n",
" <td>7</td>\n",
" <td>8</td>\n",
" <td>アルファベット外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>トーマス ベイズ</td>\n",
" <td>トーマス</td>\n",
" <td>ベイズ</td>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>カタカナ外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>カール ハイド</td>\n",
" <td>カール</td>\n",
" <td>ハイド</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>カタカナ外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>マーク ザッカーバーグ</td>\n",
" <td>マーク</td>\n",
" <td>ザッカーバーグ</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>カタカナ外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>リー クワンユー</td>\n",
" <td>リー</td>\n",
" <td>クワンユー</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>カタカナ外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>湯川 秀樹</td>\n",
" <td>湯川</td>\n",
" <td>秀樹</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>朝永 振一郎</td>\n",
" <td>朝永</td>\n",
" <td>振一郎</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>小林 誠</td>\n",
" <td>小林</td>\n",
" <td>誠</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>益川 敏英</td>\n",
" <td>益川</td>\n",
" <td>敏英</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>毛 沢東</td>\n",
" <td>毛</td>\n",
" <td>沢東</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>習 近平</td>\n",
" <td>習</td>\n",
" <td>近平</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>金 日成</td>\n",
" <td>金</td>\n",
" <td>日成</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>江 沢民</td>\n",
" <td>江</td>\n",
" <td>沢民</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名 患者姓 患者名 患者氏名文字数 患者姓文字数 患者名文字数 \\\n",
"0 Adam Smith Adam Smith 10 4 5 \n",
"1 Napoleon Bonaparte Napoleon Bonaparte 18 8 9 \n",
"2 Adolf Hitler Adolf Hitler 12 5 6 \n",
"3 Gabriel Lippmann Gabriel Lippmann 16 7 8 \n",
"4 トーマス ベイズ トーマス ベイズ 8 4 3 \n",
"5 カール ハイド カール ハイド 7 3 3 \n",
"6 マーク ザッカーバーグ マーク ザッカーバーグ 11 3 7 \n",
"7 リー クワンユー リー クワンユー 8 2 5 \n",
"8 湯川 秀樹 湯川 秀樹 5 2 2 \n",
"9 朝永 振一郎 朝永 振一郎 6 2 3 \n",
"10 小林 誠 小林 誠 4 2 1 \n",
"11 益川 敏英 益川 敏英 5 2 2 \n",
"12 毛 沢東 毛 沢東 4 1 2 \n",
"13 習 近平 習 近平 4 1 2 \n",
"14 金 日成 金 日成 4 1 2 \n",
"15 江 沢民 江 沢民 4 1 2 \n",
"\n",
" 判定 \n",
"0 アルファベット外国人 \n",
"1 アルファベット外国人 \n",
"2 アルファベット外国人 \n",
"3 アルファベット外国人 \n",
"4 カタカナ外国人 \n",
"5 カタカナ外国人 \n",
"6 カタカナ外国人 \n",
"7 カタカナ外国人 \n",
"8 日本人 \n",
"9 日本人 \n",
"10 日本人 \n",
"11 日本人 \n",
"12 漢字外国人 \n",
"13 漢字外国人 \n",
"14 漢字外国人 \n",
"15 漢字外国人 "
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfc"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"le = preprocessing.LabelEncoder()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Integer-encode each text column into a matching \"<column>_cat\" column. The single encoder\n",
"# is re-fitted for every column, so after this cell `le` holds the classes of 判定 (fitted last).\n",
"for text_col in (\"患者氏名\", \"患者姓\", \"患者名\", \"判定\"):\n",
"    dfc[text_col + \"_cat\"] = le.fit_transform(dfc[text_col])"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Keep only the columns whose name contains \"cat\" or \"文字数\", in their existing order.\n",
"dfd = dfc.filter(regex=\"cat|文字数\")"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" <th>患者氏名_cat</th>\n",
" <th>患者姓_cat</th>\n",
" <th>患者名_cat</th>\n",
" <th>判定_cat</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>18</td>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>12</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>16</td>\n",
" <td>7</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>12</td>\n",
" <td>12</td>\n",
" <td>13</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>9</td>\n",
" <td>9</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>8</td>\n",
" <td>8</td>\n",
" <td>14</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>13</td>\n",
" <td>13</td>\n",
" <td>9</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>14</td>\n",
" <td>14</td>\n",
" <td>15</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>15</td>\n",
" <td>15</td>\n",
" <td>10</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>11</td>\n",
" <td>11</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名文字数 患者姓文字数 患者名文字数 患者氏名_cat 患者姓_cat 患者名_cat 判定_cat\n",
"0 10 4 5 0 0 3 0\n",
"1 18 8 9 3 3 0 0\n",
"2 12 5 6 1 1 1 0\n",
"3 16 7 8 2 2 2 0\n",
"4 8 4 3 5 5 7 1\n",
"5 7 3 3 4 4 6 1\n",
"6 11 3 7 6 6 5 1\n",
"7 8 2 5 7 7 4 1\n",
"8 5 2 2 12 12 13 2\n",
"9 6 2 3 9 9 8 2\n",
"10 4 2 1 8 8 14 2\n",
"11 5 2 2 13 13 9 2\n",
"12 4 1 2 10 10 11 3\n",
"13 4 1 2 14 14 15 3\n",
"14 4 1 2 15 15 10 3\n",
"15 4 1 2 11 11 12 3"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfd"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Pick the target by name instead of by position, so the feature/target split stays correct\n",
"# even if the column order of dfd changes (e.g. a feature column is added after 判定_cat).\n",
"X = dfd.drop(\"判定_cat\", axis=1).values\n",
"y = dfd[\"判定_cat\"].values"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[10, 4, 5, 0, 0, 3],\n",
" [18, 8, 9, 3, 3, 0],\n",
" [12, 5, 6, 1, 1, 1],\n",
" [16, 7, 8, 2, 2, 2],\n",
" [ 8, 4, 3, 5, 5, 7],\n",
" [ 7, 3, 3, 4, 4, 6],\n",
" [11, 3, 7, 6, 6, 5],\n",
" [ 8, 2, 5, 7, 7, 4],\n",
" [ 5, 2, 2, 12, 12, 13],\n",
" [ 6, 2, 3, 9, 9, 8],\n",
" [ 4, 2, 1, 8, 8, 14],\n",
" [ 5, 2, 2, 13, 13, 9],\n",
" [ 4, 1, 2, 10, 10, 11],\n",
" [ 4, 1, 2, 14, 14, 15],\n",
" [ 4, 1, 2, 15, 15, 10],\n",
" [ 4, 1, 2, 11, 11, 12]])"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3])"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"GaussianNB(priors=None)"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnb.fit(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# A second set of names, built with the same single-column layout as df. Row order matters:\n",
"# a later cell labels these rows by position.\n",
"tf = pd.Series(\n",
"    [\n",
"        \"Tom Hanks\",\n",
"        \"Robert De Niro\",\n",
"        \"Gen Hoshino\",\n",
"        \"金 正男\",\n",
"        \"朴 璐美\",\n",
"        \"李 小龍\",\n",
"        \"林 彪\",\n",
"        \"古歩道 ベンジャミン\",\n",
"        \"キム イルソン\",\n",
"        \"山下 奉文\",\n",
"        \"宮沢 賢治\",\n",
"        \"徳川 家康\",\n",
"        \"井浦 新\",\n",
"        \"窪塚 洋介\",\n",
"        \"伊藤 博文\",\n",
"        \"近衛 文麿\",\n",
"    ],\n",
"    name=\"患者氏名\",\n",
").to_frame()"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Tom Hanks</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Robert De Niro</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Gen Hoshino</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>金 正男</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>朴 璐美</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>李 小龍</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>林 彪</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>古歩道 ベンジャミン</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>キム イルソン</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>山下 奉文</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>宮沢 賢治</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>徳川 家康</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>井浦 新</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>窪塚 洋介</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>伊藤 博文</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>近衛 文麿</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名\n",
"0 Tom Hanks\n",
"1 Robert De Niro\n",
"2 Gen Hoshino\n",
"3 金 正男\n",
"4 朴 璐美\n",
"5 李 小龍\n",
"6 林 彪\n",
"7 古歩道 ベンジャミン\n",
"8 キム イルソン\n",
"9 山下 奉文\n",
"10 宮沢 賢治\n",
"11 徳川 家康\n",
"12 井浦 新\n",
"13 窪塚 洋介\n",
"14 伊藤 博文\n",
"15 近衛 文麿"
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tf"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Split at the FIRST space only so a multi-word remainder survives: \"Robert De Niro\" becomes\n",
"# \"Robert\" / \"De Niro\" (x.split(\" \")[1] kept just \"De\", which also under-counted 患者名文字数).\n",
"# A name without any space yields an empty 患者名 instead of raising IndexError.\n",
"name_parts = tf[\"患者氏名\"].str.partition(\" \")\n",
"tf[\"患者姓\"] = name_parts[0]\n",
"tf[\"患者名\"] = name_parts[2]\n",
"\n",
"# Character counts; the full-name count includes the separating space.\n",
"tf[\"患者氏名文字数\"] = tf[\"患者氏名\"].str.len()\n",
"tf[\"患者姓文字数\"] = tf[\"患者姓\"].str.len()\n",
"tf[\"患者名文字数\"] = tf[\"患者名\"].str.len()"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" <th>患者姓</th>\n",
" <th>患者名</th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Tom Hanks</td>\n",
" <td>Tom</td>\n",
" <td>Hanks</td>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Robert De Niro</td>\n",
" <td>Robert</td>\n",
" <td>De</td>\n",
" <td>14</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Gen Hoshino</td>\n",
" <td>Gen</td>\n",
" <td>Hoshino</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>金 正男</td>\n",
" <td>金</td>\n",
" <td>正男</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>朴 璐美</td>\n",
" <td>朴</td>\n",
" <td>璐美</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>李 小龍</td>\n",
" <td>李</td>\n",
" <td>小龍</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>林 彪</td>\n",
" <td>林</td>\n",
" <td>彪</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>古歩道 ベンジャミン</td>\n",
" <td>古歩道</td>\n",
" <td>ベンジャミン</td>\n",
" <td>10</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>キム イルソン</td>\n",
" <td>キム</td>\n",
" <td>イルソン</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>山下 奉文</td>\n",
" <td>山下</td>\n",
" <td>奉文</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>宮沢 賢治</td>\n",
" <td>宮沢</td>\n",
" <td>賢治</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>徳川 家康</td>\n",
" <td>徳川</td>\n",
" <td>家康</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>井浦 新</td>\n",
" <td>井浦</td>\n",
" <td>新</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>窪塚 洋介</td>\n",
" <td>窪塚</td>\n",
" <td>洋介</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>伊藤 博文</td>\n",
" <td>伊藤</td>\n",
" <td>博文</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>近衛 文麿</td>\n",
" <td>近衛</td>\n",
" <td>文麿</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名 患者姓 患者名 患者氏名文字数 患者姓文字数 患者名文字数\n",
"0 Tom Hanks Tom Hanks 9 3 5\n",
"1 Robert De Niro Robert De 14 6 2\n",
"2 Gen Hoshino Gen Hoshino 11 3 7\n",
"3 金 正男 金 正男 4 1 2\n",
"4 朴 璐美 朴 璐美 4 1 2\n",
"5 李 小龍 李 小龍 4 1 2\n",
"6 林 彪 林 彪 3 1 1\n",
"7 古歩道 ベンジャミン 古歩道 ベンジャミン 10 3 6\n",
"8 キム イルソン キム イルソン 7 2 4\n",
"9 山下 奉文 山下 奉文 5 2 2\n",
"10 宮沢 賢治 宮沢 賢治 5 2 2\n",
"11 徳川 家康 徳川 家康 5 2 2\n",
"12 井浦 新 井浦 新 4 2 1\n",
"13 窪塚 洋介 窪塚 洋介 5 2 2\n",
"14 伊藤 博文 伊藤 博文 5 2 2\n",
"15 近衛 文麿 近衛 文麿 5 2 2"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tf"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" from ipykernel import kernelapp as app\n",
"/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
"/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
"/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
]
}
],
"source": [
"# Label the rows of tf by position (3 / 4 / 2 / 7 names per class). Each slice is copied before\n",
"# the column is assigned: writing to a bare slice such as tf[:3] raises SettingWithCopyWarning\n",
"# because pandas cannot tell whether the write is meant to reach tf.\n",
"tf1 = tf[:3].copy()\n",
"tf1[\"判定\"] = \"アルファベット外国人\"\n",
"\n",
"tf2 = tf[3:7].copy()\n",
"tf2[\"判定\"] = \"漢字外国人\"\n",
"\n",
"tf3 = tf[7:9].copy()\n",
"tf3[\"判定\"] = \"カタカナ外国人\"\n",
"\n",
"tf4 = tf[9:].copy()\n",
"tf4[\"判定\"] = \"日本人\"\n",
"\n",
"tflist = [tf1, tf2, tf3, tf4]\n",
"\n",
"# pd.concat takes the whole list at once, so no pairwise reduce() is needed.\n",
"tfc = pd.concat(tflist)"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" <th>患者姓</th>\n",
" <th>患者名</th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" <th>判定</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Tom Hanks</td>\n",
" <td>Tom</td>\n",
" <td>Hanks</td>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>アルファベット外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Robert De Niro</td>\n",
" <td>Robert</td>\n",
" <td>De</td>\n",
" <td>14</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>アルファベット外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Gen Hoshino</td>\n",
" <td>Gen</td>\n",
" <td>Hoshino</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>アルファベット外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>金 正男</td>\n",
" <td>金</td>\n",
" <td>正男</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>朴 璐美</td>\n",
" <td>朴</td>\n",
" <td>璐美</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>李 小龍</td>\n",
" <td>李</td>\n",
" <td>小龍</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>林 彪</td>\n",
" <td>林</td>\n",
" <td>彪</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>漢字外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>古歩道 ベンジャミン</td>\n",
" <td>古歩道</td>\n",
" <td>ベンジャミン</td>\n",
" <td>10</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>カタカナ外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>キム イルソン</td>\n",
" <td>キム</td>\n",
" <td>イルソン</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>カタカナ外国人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>山下 奉文</td>\n",
" <td>山下</td>\n",
" <td>奉文</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>宮沢 賢治</td>\n",
" <td>宮沢</td>\n",
" <td>賢治</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>徳川 家康</td>\n",
" <td>徳川</td>\n",
" <td>家康</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>井浦 新</td>\n",
" <td>井浦</td>\n",
" <td>新</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>窪塚 洋介</td>\n",
" <td>窪塚</td>\n",
" <td>洋介</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>伊藤 博文</td>\n",
" <td>伊藤</td>\n",
" <td>博文</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>近衛 文麿</td>\n",
" <td>近衛</td>\n",
" <td>文麿</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名 患者姓 患者名 患者氏名文字数 患者姓文字数 患者名文字数 判定\n",
"0 Tom Hanks Tom Hanks 9 3 5 アルファベット外国人\n",
"1 Robert De Niro Robert De 14 6 2 アルファベット外国人\n",
"2 Gen Hoshino Gen Hoshino 11 3 7 アルファベット外国人\n",
"3 金 正男 金 正男 4 1 2 漢字外国人\n",
"4 朴 璐美 朴 璐美 4 1 2 漢字外国人\n",
"5 李 小龍 李 小龍 4 1 2 漢字外国人\n",
"6 林 彪 林 彪 3 1 1 漢字外国人\n",
"7 古歩道 ベンジャミン 古歩道 ベンジャミン 10 3 6 カタカナ外国人\n",
"8 キム イルソン キム イルソン 7 2 4 カタカナ外国人\n",
"9 山下 奉文 山下 奉文 5 2 2 日本人\n",
"10 宮沢 賢治 宮沢 賢治 5 2 2 日本人\n",
"11 徳川 家康 徳川 家康 5 2 2 日本人\n",
"12 井浦 新 井浦 新 4 2 1 日本人\n",
"13 窪塚 洋介 窪塚 洋介 5 2 2 日本人\n",
"14 伊藤 博文 伊藤 博文 5 2 2 日本人\n",
"15 近衛 文麿 近衛 文麿 5 2 2 日本人"
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfc"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Name columns are encoded by rank within THIS frame (a separate encoder, so `le` is left\n",
"# untouched). These codes are not comparable with the codes computed for dfc.\n",
"# NOTE(review): consider features that do not depend on which names are in the frame.\n",
"name_encoder = preprocessing.LabelEncoder()\n",
"for text_col in (\"患者氏名\", \"患者姓\", \"患者名\"):\n",
"    tfc[text_col + \"_cat\"] = name_encoder.fit_transform(tfc[text_col])\n",
"\n",
"# Reuse the label mapping `le` already holds instead of re-fitting it here: fit_transform on\n",
"# this frame reproduces the earlier codes only while both frames contain exactly the same\n",
"# classes. transform() raises ValueError on a label the encoder has not seen.\n",
"# NOTE(review): assumes `le` was last fitted on dfc[\"判定\"] - confirm the cell run order.\n",
"tfc[\"判定_cat\"] = le.transform(tfc[\"判定\"])"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" <th>患者姓</th>\n",
" <th>患者名</th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" <th>判定</th>\n",
" <th>患者氏名_cat</th>\n",
" <th>患者姓_cat</th>\n",
" <th>患者名_cat</th>\n",
" <th>判定_cat</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Tom Hanks</td>\n",
" <td>Tom</td>\n",
" <td>Hanks</td>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>アルファベット外国人</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Robert De Niro</td>\n",
" <td>Robert</td>\n",
" <td>De</td>\n",
" <td>14</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>アルファベット外国人</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Gen Hoshino</td>\n",
" <td>Gen</td>\n",
" <td>Hoshino</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>アルファベット外国人</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>金 正男</td>\n",
" <td>金</td>\n",
" <td>正男</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" <td>15</td>\n",
" <td>15</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>朴 璐美</td>\n",
" <td>朴</td>\n",
" <td>璐美</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" <td>14</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>李 小龍</td>\n",
" <td>李</td>\n",
" <td>小龍</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>漢字外国人</td>\n",
" <td>11</td>\n",
" <td>11</td>\n",
" <td>8</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>林 彪</td>\n",
" <td>林</td>\n",
" <td>彪</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>漢字外国人</td>\n",
" <td>12</td>\n",
" <td>12</td>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>古歩道 ベンジャミン</td>\n",
" <td>古歩道</td>\n",
" <td>ベンジャミン</td>\n",
" <td>10</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>カタカナ外国人</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>キム イルソン</td>\n",
" <td>キム</td>\n",
" <td>イルソン</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>カタカナ外国人</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>山下 奉文</td>\n",
" <td>山下</td>\n",
" <td>奉文</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" <td>8</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>宮沢 賢治</td>\n",
" <td>宮沢</td>\n",
" <td>賢治</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>15</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>徳川 家康</td>\n",
" <td>徳川</td>\n",
" <td>家康</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" <td>9</td>\n",
" <td>9</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>井浦 新</td>\n",
" <td>井浦</td>\n",
" <td>新</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>日本人</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>窪塚 洋介</td>\n",
" <td>窪塚</td>\n",
" <td>洋介</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" <td>13</td>\n",
" <td>13</td>\n",
" <td>13</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>伊藤 博文</td>\n",
" <td>伊藤</td>\n",
" <td>博文</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>近衛 文麿</td>\n",
" <td>近衛</td>\n",
" <td>文麿</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>日本人</td>\n",
" <td>14</td>\n",
" <td>14</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名 患者姓 患者名 患者氏名文字数 患者姓文字数 患者名文字数 判定 \\\n",
"0 Tom Hanks Tom Hanks 9 3 5 アルファベット外国人 \n",
"1 Robert De Niro Robert De 14 6 2 アルファベット外国人 \n",
"2 Gen Hoshino Gen Hoshino 11 3 7 アルファベット外国人 \n",
"3 金 正男 金 正男 4 1 2 漢字外国人 \n",
"4 朴 璐美 朴 璐美 4 1 2 漢字外国人 \n",
"5 李 小龍 李 小龍 4 1 2 漢字外国人 \n",
"6 林 彪 林 彪 3 1 1 漢字外国人 \n",
"7 古歩道 ベンジャミン 古歩道 ベンジャミン 10 3 6 カタカナ外国人 \n",
"8 キム イルソン キム イルソン 7 2 4 カタカナ外国人 \n",
"9 山下 奉文 山下 奉文 5 2 2 日本人 \n",
"10 宮沢 賢治 宮沢 賢治 5 2 2 日本人 \n",
"11 徳川 家康 徳川 家康 5 2 2 日本人 \n",
"12 井浦 新 井浦 新 4 2 1 日本人 \n",
"13 窪塚 洋介 窪塚 洋介 5 2 2 日本人 \n",
"14 伊藤 博文 伊藤 博文 5 2 2 日本人 \n",
"15 近衛 文麿 近衛 文麿 5 2 2 日本人 \n",
"\n",
" 患者氏名_cat 患者姓_cat 患者名_cat 判定_cat \n",
"0 2 2 1 0 \n",
"1 1 1 0 0 \n",
"2 0 0 2 0 \n",
"3 15 15 12 3 \n",
"4 10 10 14 3 \n",
"5 11 11 8 3 \n",
"6 12 12 9 3 \n",
"7 6 6 4 1 \n",
"8 3 3 3 1 \n",
"9 8 8 6 2 \n",
"10 7 7 15 2 \n",
"11 9 9 7 2 \n",
"12 4 4 11 2 \n",
"13 13 13 13 2 \n",
"14 5 5 5 2 \n",
"15 14 14 10 2 "
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfc"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"tfd = tfc[tfc.columns[tfc.columns.str.contains(\"cat\")|tfc.columns.str.contains(\"文字数\")]]"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" <th>患者氏名_cat</th>\n",
" <th>患者姓_cat</th>\n",
" <th>患者名_cat</th>\n",
" <th>判定_cat</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>14</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>15</td>\n",
" <td>15</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" <td>14</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>11</td>\n",
" <td>11</td>\n",
" <td>8</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>12</td>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>10</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>8</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>15</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>9</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>13</td>\n",
" <td>13</td>\n",
" <td>13</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>14</td>\n",
" <td>14</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名文字数 患者姓文字数 患者名文字数 患者氏名_cat 患者姓_cat 患者名_cat 判定_cat\n",
"0 9 3 5 2 2 1 0\n",
"1 14 6 2 1 1 0 0\n",
"2 11 3 7 0 0 2 0\n",
"3 4 1 2 15 15 12 3\n",
"4 4 1 2 10 10 14 3\n",
"5 4 1 2 11 11 8 3\n",
"6 3 1 1 12 12 9 3\n",
"7 10 3 6 6 6 4 1\n",
"8 7 2 4 3 3 3 1\n",
"9 5 2 2 8 8 6 2\n",
"10 5 2 2 7 7 15 2\n",
"11 5 2 2 9 9 7 2\n",
"12 4 2 1 4 4 11 2\n",
"13 5 2 2 13 13 13 2\n",
"14 5 2 2 5 5 5 2\n",
"15 5 2 2 14 14 10 2"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfd"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_test = tfd.iloc[:, :-1].values\n",
"y_test = tfd.iloc[:, -1].values"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 9, 3, 5, 2, 2, 1],\n",
" [14, 6, 2, 1, 1, 0],\n",
" [11, 3, 7, 0, 0, 2],\n",
" [ 4, 1, 2, 15, 15, 12],\n",
" [ 4, 1, 2, 10, 10, 14],\n",
" [ 4, 1, 2, 11, 11, 8],\n",
" [ 3, 1, 1, 12, 12, 9],\n",
" [10, 3, 6, 6, 6, 4],\n",
" [ 7, 2, 4, 3, 3, 3],\n",
" [ 5, 2, 2, 8, 8, 6],\n",
" [ 5, 2, 2, 7, 7, 15],\n",
" [ 5, 2, 2, 9, 9, 7],\n",
" [ 4, 2, 1, 4, 4, 11],\n",
" [ 5, 2, 2, 13, 13, 13],\n",
" [ 5, 2, 2, 5, 5, 5],\n",
" [ 5, 2, 2, 14, 14, 10]])"
]
},
"execution_count": 134,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, 3, 3, 3, 3, 1, 1, 2, 2, 2, 2, 2, 2, 2])"
]
},
"execution_count": 135,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_test"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, 3, 3, 3, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2])"
]
},
"execution_count": 137,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnb.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.9375"
]
},
"execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnb.score(X_test, y_test)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment