Skip to content

Instantly share code, notes, and snippets.

@riow1983
Created September 24, 2017 05:18
Show Gist options
  • Save riow1983/e0b36c731235ef85a0efb5c028ee7b9c to your computer and use it in GitHub Desktop.
Save riow1983/e0b36c731235ef85a0efb5c028ee7b9c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 192,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 193,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 194,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.naive_bayes import GaussianNB\n",
"gnb = GaussianNB()"
]
},
{
"cell_type": "code",
"execution_count": 195,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn import preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 196,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 197,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Decompose each patient name into character counts and ord() code points, then have GaussianNB() judge whether it is a foreign or a Japanese name"
]
},
{
"cell_type": "code",
"execution_count": 198,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Toy corpus of patient names written in Latin letters, katakana and kanji.\n",
"# Row order matters: the labels below are assigned by row-index range.\n",
"df = pd.DataFrame({\"患者氏名\":[\"Adam Smith\",\n",
"\"Napoleon Bonaparte\",\n",
"\"Adolf Hitler\",\n",
"\"Gabriel Lippmann\",\n",
"\"トーマス ベイズ\",\n",
"\"カール ハイド\",\n",
"\"マーク ザッカーバーグ\",\n",
"\"リー クワンユー\",\n",
"\"湯川 秀樹\",\n",
"\"朝永 振一郎\",\n",
"\"小林 誠\",\n",
"\"益川 敏英\",\n",
"\"毛 沢東\",\n",
"\"習 近平\",\n",
"\"金 日成\",\n",
"\"江 沢民\",\n",
"\"Tom Hanks\",\n",
"\"Robert De Niro\",\n",
"\"Gen Hoshino\",\n",
"\"金 正男\",\n",
"\"朴 璐美\",\n",
"\"李 小龍\",\n",
"\"林 彪\",\n",
"\"古歩道 ベンジャミン\",\n",
"\"キム イルソン\",\n",
"\"山下 奉文\",\n",
"\"宮沢 賢治\",\n",
"\"徳川 家康\",\n",
"\"井浦 新\",\n",
"\"窪塚 洋介\",\n",
"\"伊藤 博文\",\n",
"\"近衛 文麿\"\n",
"]\n",
"})\n",
"\n",
"\n",
"# Split on the first space only, so a multi-word remainder such as\n",
"# \"De Niro\" is kept whole instead of being truncated to \"De\".\n",
"df[\"患者姓\"] = df[\"患者氏名\"].apply(lambda x: x.split(\" \", 1)[0])\n",
"df[\"患者名\"] = df[\"患者氏名\"].apply(lambda x: x.split(\" \", 1)[1])\n",
"\n",
"# Character counts of the full name (separator included) and of each part.\n",
"df[\"患者氏名文字数\"] = df[\"患者氏名\"].str.len()\n",
"df[\"患者姓文字数\"] = df[\"患者姓\"].str.len()\n",
"df[\"患者名文字数\"] = df[\"患者名\"].str.len()\n",
"\n",
"# Right-pad every name to 50 characters so the per-character ord() vectors\n",
"# built from this column later all have the same length.\n",
"df[\"患者氏名\"] = df[\"患者氏名\"].str.ljust(50)\n",
"\n",
"# df.loc slices by label and includes BOTH endpoints, so consecutive ranges\n",
"# must not share a boundary row; otherwise the later assignment silently\n",
"# overwrites the last row of the previous group.\n",
"df.loc[0:3, \"判定\"] = \"アルファベット外国人\"\n",
"df.loc[4:7, \"判定\"] = \"カタカナ外国人\"\n",
"df.loc[8:11, \"判定\"] = \"日本人\"\n",
"df.loc[12:15, \"判定\"] = \"漢字外国人\"\n",
"df.loc[16:18, \"判定\"] = \"アルファベット外国人\"\n",
"df.loc[19:22, \"判定\"] = \"漢字外国人\"\n",
"df.loc[23:24, \"判定\"] = \"カタカナ外国人\"\n",
"df.loc[25:, \"判定\"] = \"日本人\""
]
},
{
"cell_type": "code",
"execution_count": 199,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名</th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" <th>判定_cat</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Adam Smith ...</td>\n",
" <td>10</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Napoleon Bonaparte ...</td>\n",
" <td>18</td>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Adolf Hitler ...</td>\n",
" <td>12</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Gabriel Lippmann ...</td>\n",
" <td>16</td>\n",
" <td>7</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>トーマス ベイズ ...</td>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>カール ハイド ...</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>マーク ザッカーバーグ ...</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>リー クワンユー ...</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>湯川 秀樹 ...</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>朝永 振一郎 ...</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>小林 誠 ...</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>益川 敏英 ...</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>毛 沢東 ...</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>習 近平 ...</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>金 日成 ...</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>江 沢民 ...</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Tom Hanks ...</td>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Robert De Niro ...</td>\n",
" <td>14</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Gen Hoshino ...</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>金 正男 ...</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>朴 璐美 ...</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>李 小龍 ...</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>林 彪 ...</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>古歩道 ベンジャミン ...</td>\n",
" <td>10</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>キム イルソン ...</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>山下 奉文 ...</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>宮沢 賢治 ...</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>徳川 家康 ...</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>井浦 新 ...</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>窪塚 洋介 ...</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>伊藤 博文 ...</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>近衛 文麿 ...</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名 患者氏名文字数 患者姓文字数 \\\n",
"0 Adam Smith ... 10 4 \n",
"1 Napoleon Bonaparte ... 18 8 \n",
"2 Adolf Hitler ... 12 5 \n",
"3 Gabriel Lippmann ... 16 7 \n",
"4 トーマス ベイズ ... 8 4 \n",
"5 カール ハイド ... 7 3 \n",
"6 マーク ザッカーバーグ ... 11 3 \n",
"7 リー クワンユー ... 8 2 \n",
"8 湯川 秀樹 ... 5 2 \n",
"9 朝永 振一郎 ... 6 2 \n",
"10 小林 誠 ... 4 2 \n",
"11 益川 敏英 ... 5 2 \n",
"12 毛 沢東 ... 4 1 \n",
"13 習 近平 ... 4 1 \n",
"14 金 日成 ... 4 1 \n",
"15 江 沢民 ... 4 1 \n",
"16 Tom Hanks ... 9 3 \n",
"17 Robert De Niro ... 14 6 \n",
"18 Gen Hoshino ... 11 3 \n",
"19 金 正男 ... 4 1 \n",
"20 朴 璐美 ... 4 1 \n",
"21 李 小龍 ... 4 1 \n",
"22 林 彪 ... 3 1 \n",
"23 古歩道 ベンジャミン ... 10 3 \n",
"24 キム イルソン ... 7 2 \n",
"25 山下 奉文 ... 5 2 \n",
"26 宮沢 賢治 ... 5 2 \n",
"27 徳川 家康 ... 5 2 \n",
"28 井浦 新 ... 4 2 \n",
"29 窪塚 洋介 ... 5 2 \n",
"30 伊藤 博文 ... 5 2 \n",
"31 近衛 文麿 ... 5 2 \n",
"\n",
" 患者名文字数 判定_cat \n",
"0 5 0 \n",
"1 9 0 \n",
"2 6 0 \n",
"3 8 0 \n",
"4 3 1 \n",
"5 3 1 \n",
"6 7 1 \n",
"7 5 1 \n",
"8 2 2 \n",
"9 3 2 \n",
"10 1 2 \n",
"11 2 2 \n",
"12 2 3 \n",
"13 2 3 \n",
"14 2 3 \n",
"15 2 0 \n",
"16 5 0 \n",
"17 2 0 \n",
"18 7 3 \n",
"19 2 3 \n",
"20 2 3 \n",
"21 2 3 \n",
"22 1 1 \n",
"23 6 1 \n",
"24 4 1 \n",
"25 2 2 \n",
"26 2 2 \n",
"27 2 2 \n",
"28 1 2 \n",
"29 2 2 \n",
"30 2 2 \n",
"31 2 2 "
]
},
"execution_count": 199,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Encode the string class labels as integer ids for the classifier.\n",
"le = preprocessing.LabelEncoder()\n",
"\n",
"df[\"判定_cat\"] = le.fit_transform(df[\"判定\"])\n",
"\n",
"# Drop the helper columns and the raw string label, keeping only the padded\n",
"# name, the three length features and the encoded target.\n",
"# axis is passed by keyword: the positional form drop(labels, 1) was\n",
"# deprecated in pandas 1.3 and removed in 2.0. The frame is rebound instead\n",
"# of being mutated with inplace=True.\n",
"df = df.drop([\"患者姓\",\"患者名\",\"判定\"], axis=1)\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 200,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Features are every column except the last; the target is the last column\n",
"# (the encoded label). The shuffle seed is fixed so that the train/test\n",
"# membership, and therefore the reported score, is identical on every\n",
"# Restart & Run All.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    df.iloc[:, :-1], df.iloc[:, -1], random_state=42\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 201,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"24"
]
},
"execution_count": 201,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(X_train)"
]
},
{
"cell_type": "code",
"execution_count": 202,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"8"
]
},
"execution_count": 202,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 203,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"31 2\n",
"4 1\n",
"21 3\n",
"2 0\n",
"0 0\n",
"1 0\n",
"10 2\n",
"3 0\n",
"6 1\n",
"29 2\n",
"16 0\n",
"18 3\n",
"17 0\n",
"19 3\n",
"27 2\n",
"26 2\n",
"22 1\n",
"13 3\n",
"5 1\n",
"14 3\n",
"11 2\n",
"23 1\n",
"20 3\n",
"8 2\n",
"Name: 判定_cat, dtype: int64"
]
},
"execution_count": 203,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_train"
]
},
{
"cell_type": "code",
"execution_count": 204,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"30 2\n",
"28 2\n",
"25 2\n",
"24 1\n",
"15 0\n",
"9 2\n",
"7 1\n",
"12 3\n",
"Name: 判定_cat, dtype: int64"
]
},
"execution_count": 204,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_test"
]
},
{
"cell_type": "code",
"execution_count": 205,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def _code_points(name):\n",
"    \"\"\"Return the list of ord() values, one per character of name.\"\"\"\n",
"    return list(map(ord, name))\n",
"\n",
"# One list of code points per name, for each split.\n",
"vec_train = X_train[\"患者氏名\"].map(_code_points).values\n",
"vec_test = X_test[\"患者氏名\"].map(_code_points).values"
]
},
{
"cell_type": "code",
"execution_count": 206,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# The name strings were converted to code-point vectors in the previous cell,\n",
"# so remove the raw text column and keep only the numeric length features.\n",
"# axis is passed by keyword (the positional form drop(labels, 1) was removed\n",
"# in pandas 2.0), and the frames are rebound rather than modified with\n",
"# inplace=True because they are derived from df by train_test_split.\n",
"X_train = X_train.drop(\"患者氏名\", axis=1)\n",
"X_test = X_test.drop(\"患者氏名\", axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 207,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>患者氏名文字数</th>\n",
" <th>患者姓文字数</th>\n",
" <th>患者名文字数</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>12</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 患者氏名文字数 患者姓文字数 患者名文字数\n",
"31 5 2 2\n",
"4 8 4 3\n",
"21 4 1 2\n",
"2 12 5 6\n",
"0 10 4 5"
]
},
"execution_count": 207,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 208,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Plain ndarray views of the length-feature frames, one row per name.\n",
"tmp_train = np.asarray(X_train)\n",
"tmp_test = np.asarray(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 209,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def _join_features(count_rows, code_rows):\n",
"    \"\"\"Concatenate each row of length features with its code-point list.\"\"\"\n",
"    joined = []\n",
"    for counts, codes in zip(count_rows, code_rows):\n",
"        joined.append(np.concatenate((counts, codes)))\n",
"    return np.array(joined)\n",
"\n",
"# Final design matrices: length features followed by the per-character codes.\n",
"X_train_vec = _join_features(tmp_train, vec_train)\n",
"X_test_vec = _join_features(tmp_test, vec_test)"
]
},
{
"cell_type": "code",
"execution_count": 210,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Target vectors as plain ndarrays, in the same row order as the feature matrices.\n",
"y_train_vec = np.asarray(y_train)\n",
"y_test_vec = np.asarray(y_test)"
]
},
{
"cell_type": "code",
"execution_count": 211,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"GaussianNB(priors=None)"
]
},
"execution_count": 211,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnb.fit(X_train_vec, y_train_vec)"
]
},
{
"cell_type": "code",
"execution_count": 212,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.75"
]
},
"execution_count": 212,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnb.score(X_test_vec, y_test_vec)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment